### Yを予測するための効率的なトレーニングデータの抽出検討

In [236]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import openai
import openai_api_key
import pubchempy
import random
 
from ord_schema import message_helpers, validations
from ord_schema.proto import dataset_pb2
from rdkit import rdBase, Chem, DataStructs
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem import AllChem, Draw, rdMHFPFingerprint
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem.AtomPairs import Pairs, Torsions

In [277]:
#Buchwald-Hartwig Reaction datasetの読み込み
pb = "./practice/ord_dataset-00005539a1e04c809a9a78647bea649c.pb.gz"  
data = message_helpers.load_message(pb, dataset_pb2.Dataset) #生データであるjson型式のデータファイルとして読み取っている?
df = message_helpers.messages_to_dataframe(data.reactions, \
                                           drop_constant_columns=False) #jsonからdataframeへ変換

#元dfからA+B→Yとなる部分だけ抜き出してDataFrameにする
df = df[[ \
    'inputs["aryl halide"].components[0].identifiers[0].value', \
    'inputs["amine"].components[0].identifiers[0].value', \
    'outcomes[0].products[0].identifiers[0].value' \
    ]] 

df.columns = list('ABY') #列ラベルをわかりやすく A,B,Yに変換
df_buchwald_hardwig_smiles = df.drop_duplicates()\
                                .reset_index()\
                                .drop(columns=["index"])

#SMILESからMOLオブジェクトを生成する関数を定義
def generate_mol(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        return mol
    except:
        return None

# dfのすべてのA,B 要素に対してMolオブジェクトを生成する
df_mol_ABY = df_buchwald_hardwig_smiles.applymap(lambda smiles: generate_mol(smiles))\
                                        .rename(columns={"A": "mol_A", "B": "mol_B", "Y": "mol_Y"})

df_smiles_mol = pd.concat([df_buchwald_hardwig_smiles, df_mol_ABY], axis=1)
df_smiles_mol


Unnamed: 0,A,B,Y,mol_A,mol_B,mol_Y
0,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,CC(C)N1CCNCC1,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,<rdkit.Chem.rdchem.Mol object at 0x1621fd150>,<rdkit.Chem.rdchem.Mol object at 0x1513f43c0>,<rdkit.Chem.rdchem.Mol object at 0x1514ca5e0>
1,C1=CC=C(C=C1)I,CN1C=NC2=C1C=C(C(=C2F)N)C(=O)OC,CN1C=NC2=C1C=C(C(=C2F)NC3=CC=CC=C3)C(=O)OC,<rdkit.Chem.rdchem.Mol object at 0x1621fc2e0>,<rdkit.Chem.rdchem.Mol object at 0x1513f5d90>,<rdkit.Chem.rdchem.Mol object at 0x1514ca650>
2,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)Cl)C,C1=CC(=CC=C1N)S(=O)(=O)N,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)NC3=CC=C(C=C3)S(=...,<rdkit.Chem.rdchem.Mol object at 0x15131ba00>,<rdkit.Chem.rdchem.Mol object at 0x1513f4430>,<rdkit.Chem.rdchem.Mol object at 0x1514ca7a0>
3,C1=C(C=NC=C1Br)Br,CC(=O)N1CCNCC1,CC(=O)N1CCN(CC1)C2=CC(=CN=C2)Br,<rdkit.Chem.rdchem.Mol object at 0x151318820>,<rdkit.Chem.rdchem.Mol object at 0x1513f66c0>,<rdkit.Chem.rdchem.Mol object at 0x1514cb370>
4,CN1CC(OC2=C(C1)C=CC(=N2)Cl)C3=CC=CC=C3,C1=CC(=CC=C1N)N2C=CN=C2,CN1CC(OC2=C(C1)C=CC(=N2)NC3=CC=C(C=C3)N4C=CN=C...,<rdkit.Chem.rdchem.Mol object at 0x151319fc0>,<rdkit.Chem.rdchem.Mol object at 0x1513f5690>,<rdkit.Chem.rdchem.Mol object at 0x1514cb0d0>
...,...,...,...,...,...,...
468,C1=CC=C(C=C1)C(C2=CC=CC=C2)(C3=CC=CC=C3)N4C=NC...,C1CNCCN1,C1CN(CCN1)C2=CC=CC3=C2N=CN3C(C4=CC=CC=C4)(C5=C...,<rdkit.Chem.rdchem.Mol object at 0x161f427a0>,<rdkit.Chem.rdchem.Mol object at 0x1514cae30>,<rdkit.Chem.rdchem.Mol object at 0x151565460>
469,C1=CC(=C(C=C1C(F)(F)F)Br)F,CC(C)(C)OC(=O)N1CCNCC1,CC(C)(C)OC(=O)N1CCN(CC1)C2=C(C=CC(=C2)C(F)(F)F)F,<rdkit.Chem.rdchem.Mol object at 0x161f40f90>,<rdkit.Chem.rdchem.Mol object at 0x1514c9bd0>,<rdkit.Chem.rdchem.Mol object at 0x1515657e0>
470,C1=CN=C(C=C1C(F)(F)F)Cl,CC(C)(C)OC(=O)N1CCNCC1,CC(C)(C)OC(=O)N1CCN(CC1)C2=NC=CC(=C2)C(F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x1513f4890>,<rdkit.Chem.rdchem.Mol object at 0x1514c9b60>,<rdkit.Chem.rdchem.Mol object at 0x151567530>
471,C1=CC2=C(C=CC(=C2N=C1)OS(=O)(=O)C(F)(F)F)Cl,CC(C)(C)OC(=O)N1CCNCC1,CC(C)(C)OC(=O)N1CCN(CC1)C2=C3C(=C(C=C2)Cl)C=CC=N3,<rdkit.Chem.rdchem.Mol object at 0x1513f6810>,<rdkit.Chem.rdchem.Mol object at 0x1514c9a80>,<rdkit.Chem.rdchem.Mol object at 0x151566ce0>


In [278]:
newline = '\n'
df_buchwald_hardwig_smiles.insert(3, "n", newline)

In [281]:
df_smiles_stacked = df_buchwald_hardwig_smiles.stack()
df_smiles_stacked

0    A    CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...
     B                                        CC(C)N1CCNCC1
     Y    CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...
     n                                                   \n
1    A                                       C1=CC=C(C=C1)I
                                ...                        
471  n                                                   \n
472  A                     C1CC1NC2=CC(=NC3=C(C=NN23)C#N)Cl
     B                  CC(=O)NC1=C(C=CC(=C1)N)N(C)CCCN(C)C
     Y    CC(=O)NC1=C(C=CC(=C1)NC2=NC3=C(C=NN3C(=C2)NC4C...
     n                                                   \n
Length: 1892, dtype: object

In [266]:
df_smiles_mol_transpose = df_buchwald_hardwig_smiles_n.transpose()
df_smiles_mol_transpose

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,463,464,465,466,467,468,469,470,471,472
A,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,C1=CC=C(C=C1)I,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)Cl)C,C1=C(C=NC=C1Br)Br,CN1CC(OC2=C(C1)C=CC(=N2)Cl)C3=CC=CC=C3,CC(=O)N1CC(OC2=C(C1)C=CC(=N2)Cl)C3=CN=CC=C3,CN1CC(OC2=C(C1)C=CC(=N2)Cl)C3=CN=CC=C3,C1=CN=C(C=C1F)Cl,C1=CC2=NC=C(N2C=C1)C3=NC(=NC=C3Cl)Cl,C1COCCN1CC2=CC(=C(C=C2)Br)F,...,C1=C(C=C(C(=C1F)F)Br)F,CN1C=CC(=N1)NC2=CC(=NC3=C(C=NN23)C#N)Cl,C1=CC(=C(C=C1Br)F)C(F)(F)F,C1=C(C=C(C=C1Cl)Br)F,CC(C)(C)OC(=O)C1=CC=C(C=C1)Br,C1=CC=C(C=C1)C(C2=CC=CC=C2)(C3=CC=CC=C3)N4C=NC...,C1=CC(=C(C=C1C(F)(F)F)Br)F,C1=CN=C(C=C1C(F)(F)F)Cl,C1=CC2=C(C=CC(=C2N=C1)OS(=O)(=O)C(F)(F)F)Cl,C1CC1NC2=CC(=NC3=C(C=NN23)C#N)Cl
B,CC(C)N1CCNCC1,CN1C=NC2=C1C=C(C(=C2F)N)C(=O)OC,C1=CC(=CC=C1N)S(=O)(=O)N,CC(=O)N1CCNCC1,C1=CC(=CC=C1N)N2C=CN=C2,CC1=CN(C=N1)C2=C(C=C(C=C2)N)OC,CC1=CN(C=N1)C2=C(C=C(C=C2)N)OC,CC(C)(C)OC(=O)N,COC1=C(C=CC(=C1)C=O)N,CC1=NC=C(N1C2CCOCC2)C3=NC(=NC=C3F)N,...,CC(C1=CC(=CC2=C1OC(=CC2=O)N3CCOCC3)C(=O)N(C)C)N,CC1=C(C=C(C=C1)N)NC(=O)C,CC(C)(C)OC(=O)N1CCNCC1,CC(C1=CC(=CC2=C1OC(=CC2=O)N3CCOCC3)C(=O)N(C)C)N,CCOC(=O)C1CCNCC1,C1CNCCN1,CC(C)(C)OC(=O)N1CCNCC1,CC(C)(C)OC(=O)N1CCNCC1,CC(C)(C)OC(=O)N1CCNCC1,CC(=O)NC1=C(C=CC(=C1)N)N(C)CCCN(C)C
Y,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,CN1C=NC2=C1C=C(C(=C2F)NC3=CC=CC=C3)C(=O)OC,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)NC3=CC=C(C=C3)S(=...,CC(=O)N1CCN(CC1)C2=CC(=CN=C2)Br,CN1CC(OC2=C(C1)C=CC(=N2)NC3=CC=C(C=C3)N4C=CN=C...,CC1=CN(C=N1)C2=C(C=C(C=C2)NC3=NC4=C(CN(CC(O4)C...,CC1=CN(C=N1)C2=C(C=C(C=C2)NC3=NC4=C(CN(CC(O4)C...,CC(C)(C)OC(=O)NC1=NC=CC(=C1)F,COC1=C(C=CC(=C1)C=O)NC2=NC=C(C(=N2)C3=CN=C4N3C...,CC1=NC=C(N1C2CCOCC2)C3=NC(=NC=C3F)NC4=C(C=C(C=...,...,CC(C1=CC(=CC2=C1OC(=CC2=O)N3CCOCC3)C(=O)N(C)C)...,CC1=C(C=C(C=C1)NC2=NC3=C(C=NN3C(=C2)NC4=NN(C=C...,CC(C)(C)OC(=O)N1CCN(CC1)C2=CC(=C(C=C2)C(F)(F)F)F,CC(C1=CC(=CC2=C1OC(=CC2=O)N3CCOCC3)C(=O)N(C)C)...,CCOC(=O)C1CCN(CC1)C2=CC=C(C=C2)C(=O)OC(C)(C)C,C1CN(CCN1)C2=CC=CC3=C2N=CN3C(C4=CC=CC=C4)(C5=C...,CC(C)(C)OC(=O)N1CCN(CC1)C2=C(C=CC(=C2)C(F)(F)F)F,CC(C)(C)OC(=O)N1CCN(CC1)C2=NC=CC(=C2)C(F)(F)F,CC(C)(C)OC(=O)N1CCN(CC1)C2=C3C(=C(C=C2)Cl)C=CC=N3,CC(=O)NC1=C(C=CC(=C1)NC2=NC3=C(C=NN3C(=C2)NC4C...
n,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n,...,\n,\n,\n,\n,\n,\n,\n,\n,\n,\n


In [267]:
df_smiles_mol_transpose_stack = df_smiles_mol_transpose.stack()
df_smiles_mol_transpose_stack

A  0      CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...
   1                                         C1=CC=C(C=C1)I
   2                      CC1=NC(=C(C=C1)OC2=CC(=NC=C2)Cl)C
   3                                      C1=C(C=NC=C1Br)Br
   4                 CN1CC(OC2=C(C1)C=CC(=N2)Cl)C3=CC=CC=C3
                                ...                        
n  468                                                   \n
   469                                                   \n
   470                                                   \n
   471                                                   \n
   472                                                   \n
Length: 1892, dtype: object

## Buchwald-Hartwig Reaction データセット の FingerPrint　と　類似性

タニモト係数

Tanimoto Coefficient = c / (a + b - c)

・aは分子Aのビット配列で1が立っている数

・bは分子Bのビット配列で1が立っている数

・cは分子AとBで共通に1が立っている数

In [221]:
# 以下の化合物をテストデータとしてデータセットの各化合物とタニモト係数を計算する
#reactant_A = "CC1=NN(C=C1NC2=NC=C(C(=C2)I)C(F)(F)F)C"
#reactant_B = "CONC(=O)C1=CC=CC=C1N"

#mol_A = random.choice(sr_mol_A)
#mol_B = random.choice(sr_mol_B)

#Fingerprintから対象のテスト分子に対するタニモト係数を計算する関数を定義 
def calc_fps_tanimoto(test_fps, fps):
    fps_tnmt_coeffi = DataStructs.TanimotoSimilarity(test_fps, fps)
    return fps_tnmt_coeffi

1. MACCS Keys

    AllChem.GetMACCSKeysFingerprint(mol)

    166の部分構造について部分構造を有する場合は1が無い場合は0が格納される

In [222]:
#MACCAS Keys Fingerprintを生成する関数を定義 
def generate_maccs_fps(mol):
    maccs_fps = AllChem.GetMACCSKeysFingerprint(mol)
    return maccs_fps

#df_molの各要素に対して maccs_fps を生成
df_maccs_fps_ABY = df_mol_ABY.applymap(lambda mol: generate_maccs_fps(mol))\
                                        .rename(columns={"mol_A": "maccs_fps_A", "mol_B": "maccs_fps_B", "mol_Y": "maccs_fps_Y"})
df_smiles_maccs_fps = pd.concat([df_buchwald_hardwig_smiles, df_maccs_fps_ABY], axis=1)
df_smiles_maccs_fps.head()

Unnamed: 0,A,B,Y,maccs_fps_A,maccs_fps_B,maccs_fps_Y
0,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,CC(C)N1CCNCC1,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,C1=CC=C(C=C1)I,CN1C=NC2=C1C=C(C(=C2F)N)C(=O)OC,CN1C=NC2=C1C=C(C(=C2F)NC3=CC=CC=C3)C(=O)OC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)Cl)C,C1=CC(=CC=C1N)S(=O)(=O)N,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)NC3=CC=C(C=C3)S(=...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,C1=C(C=NC=C1Br)Br,CC(=O)N1CCNCC1,CC(=O)N1CCN(CC1)C2=CC(=CN=C2)Br,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,CN1CC(OC2=C(C1)C=CC(=N2)Cl)C3=CC=CC=C3,C1=CC(=CC=C1N)N2C=CN=C2,CN1CC(OC2=C(C1)C=CC(=N2)NC3=CC=C(C=C3)N4C=CN=C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [223]:
#テスト分子ABのfpsをランダムに抜き出す
test_data_row = df_smiles_maccs_fps.iloc[45, :]
test_A_maccs_fps = test_data_row["maccs_fps_A"]
test_B_maccs_fps = test_data_row["maccs_fps_B"]
test_Y_maccs_fps = test_data_row["maccs_fps_Y"]
li_test_AB = []

df_smiles_maccs_fps_copy = df_smiles_maccs_fps.copy()
#df_maccs_fps の各要素に対して タニモト係数 を生成
df_smiles_maccs_fps_copy["tnmt_A"] = df_maccs_fps_ABY["maccs_fps_A"].apply(lambda maccs_fps: calc_fps_tanimoto(test_A_maccs_fps, maccs_fps))
df_smiles_maccs_fps_copy["tnmt_B"] = df_maccs_fps_ABY["maccs_fps_B"].apply(lambda maccs_fps: calc_fps_tanimoto(test_B_maccs_fps, maccs_fps))
df_smiles_maccs_fps_copy["tnmt_Y"] = df_maccs_fps_ABY["maccs_fps_Y"].apply(lambda maccs_fps: calc_fps_tanimoto(test_Y_maccs_fps, maccs_fps))
df_smiles_maccs_fps_tnmt = df_smiles_maccs_fps_copy

#df = df_smiles_maccs_fps_tnmt
#df_test_AB_row = df.loc[(df["maccs_fps_A"] == test_A_maccs_fps) \
                                         # & (df["maccs_fps_B"] == test_B_maccs_fps)]

df_smiles_maccs_fps_tnmt.head()


Unnamed: 0,A,B,Y,maccs_fps_A,maccs_fps_B,maccs_fps_Y,tnmt_A,tnmt_B,tnmt_Y
0,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,CC(C)N1CCNCC1,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.212766,0.717949,0.529412
1,C1=CC=C(C=C1)I,CN1C=NC2=C1C=C(C(=C2F)N)C(=O)OC,CN1C=NC2=C1C=C(C(=C2F)NC3=CC=CC=C3)C(=O)OC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.461538,0.327586,0.473684
2,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)Cl)C,C1=CC(=CC=C1N)S(=O)(=O)N,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)NC3=CC=C(C=C3)S(=...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.3125,0.126984,0.190476
3,C1=C(C=NC=C1Br)Br,CC(=O)N1CCNCC1,CC(=O)N1CCN(CC1)C2=CC(=CN=C2)Br,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.5,0.590909,0.596154
4,CN1CC(OC2=C(C1)C=CC(=N2)Cl)C3=CC=CC=C3,C1=CC(=CC=C1N)N2C=CN=C2,CN1CC(OC2=C(C1)C=CC(=N2)NC3=CC=C(C=C3)N4C=CN=C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.183673,0.340909,0.421875


In [224]:
#残骸
sr_mol_A = df_buchwald_hardwig_smiles["A"].apply(lambda smiles: generate_mol(smiles))
sr_mol_B = df_buchwald_hardwig_smiles["B"].apply(lambda smiles: generate_mol(smiles))

#sr_molの各要素に対して maccs_fps を生成
sr_maccs_fps_A = sr_mol_A.apply(lambda mol: generate_maccs_fps(mol))
sr_maccs_fps_B = sr_mol_B.apply(lambda mol: generate_maccs_fps(mol))

#sr_maccs_fpsの各要素に対して タニモト係数 を計算
sr_maccs_fps_tnmt_A = sr_maccs_fps_A.apply(lambda fps: calc_fps_tanimoto(test_A_maccs_fps, fps) \
                                           if fps != test_A_maccs_fps else None).dropna()
sr_maccs_fps_tnmt_B = sr_maccs_fps_B.apply(lambda fps: calc_fps_tanimoto(test_B_maccs_fps, fps)  \
                                           if fps != test_B_maccs_fps else None).dropna()

#print(sr_mol_A.describe())
#print(sr_mol_B.describe())


#buchwald_hardwig_smiles_df.insert(0, ":", ":")
#buchwald_hardwig_smiles_df.insert(2, "+", "+")
#buchwald_hardwig_smiles_df.insert(4, "→", "→")
#buchwald_hardwig_smiles_df.insert(6, "\", "\")
#buchwald_hardwig_smiles_df

2. Topologicalフィンガープリント (RDKitフィンガープリント)

    Chem.RDKFingerprint(mol)

    一定の結合数に相当する原子と結合種類を格納(?)

In [225]:
#Topological(Rdkit) Fingerprintを生成する関数を定義 
def generate_rdkit_fps(mol):
    rdkit_fps = Chem.RDKFingerprint(mol)
    return rdkit_fps

#df_molの各要素に対して rdkit_fps を生成
sr_rdkit_fps_A = sr_mol_A.apply(lambda mol: generate_rdkit_fps(mol))
sr_rdkit_fps_B = sr_mol_B.apply(lambda mol: generate_rdkit_fps(mol))
#rdkit_fps_df

#テスト分子ABのfpsをランダムに抜き出す
test_A_rdkit_fps = random.choice(sr_rdkit_fps_A)
test_B_rdkit_fps = random.choice(sr_rdkit_fps_B)

#sr_rdkit_fpsの各要素に対して タニモト係数 を計算
sr_rdkit_fps_tnmt_A = sr_rdkit_fps_A.apply(lambda fps: calc_fps_tanimoto(test_A_rdkit_fps, fps) \
                                           if fps != test_A_rdkit_fps else None).dropna()
sr_rdkit_fps_tnmt_B = sr_rdkit_fps_B.apply(lambda fps: calc_fps_tanimoto(test_B_rdkit_fps, fps) \
                                           if fps != test_B_rdkit_fps else None).dropna()

print(sr_rdkit_fps_tnmt_A.describe()) 
print(sr_rdkit_fps_tnmt_B.describe())

count    465.000000
mean       0.166600
std        0.107268
min        0.019231
25%        0.081212
50%        0.142077
75%        0.248390
max        0.932450
Name: A, dtype: float64
count    472.000000
mean       0.102754
std        0.061207
min        0.000000
25%        0.050186
50%        0.101266
75%        0.142972
max        0.283859
Name: B, dtype: float64


3. Morganフィンガープリント (Circularフィンガープリント)

    AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)

    基準となる原子からある距離にある部分構造を数え上げていく

In [226]:
#Morgan Fingerprintを生成する関数を定義 
def generate_morgan_fps(mol, radius, nBits):
    morgan_fps = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
    return morgan_fps

#df_molの各要素に対して maccs_fps を生成
sr_morgan_fps_A = sr_mol_A.apply(lambda mol: generate_morgan_fps(mol, 2, 2048))
sr_morgan_fps_B = sr_mol_B.apply(lambda mol: generate_morgan_fps(mol, 2, 2048))
#morgan_fps_df

#テスト分子ABのfpsをランダムに抜き出す
test_A_morgan_fps = random.choice(sr_morgan_fps_A)
test_B_morgan_fps = random.choice(sr_morgan_fps_B)

#sr_morgan_fpsの各要素に対して タニモト係数 を計
sr_morgan_fps_tnmt_A = sr_morgan_fps_A.apply(lambda fps: calc_fps_tanimoto(test_A_morgan_fps, fps) \
                                           if fps != test_A_morgan_fps else None).dropna()
sr_morgan_fps_tnmt_B = sr_morgan_fps_B.apply(lambda fps: calc_fps_tanimoto(test_B_morgan_fps, fps) \
                                           if fps != test_B_morgan_fps else None).dropna()

print(sr_morgan_fps_tnmt_A.describe()) 
print(sr_morgan_fps_tnmt_B.describe())

count    472.000000
mean       0.122017
std        0.049253
min        0.056338
25%        0.090909
50%        0.116279
75%        0.139535
max        0.432432
Name: A, dtype: float64
count    449.000000
mean       0.078788
std        0.103687
min        0.000000
25%        0.000000
50%        0.025000
75%        0.151515
max        0.416667
Name: B, dtype: float64


## プロンプトの改善

In [227]:
test_A_smiles = test_data_row.loc["A"]
test_B_smiles = test_data_row["B"]

print(test_A_smiles)
print(test_B_smiles)

C1=CC(=C(C=C1Cl)Br)C#N
CN(C)[C@H]1CCNC1


In [287]:
from context2 import training_dataset
openai.api_key = ""

context = f"{training_dataset}\
            \
            test data:\
            A: {test_A_smiles}\
            B: {test_B_smiles}\
            Y1:\
            Y2:\
            Y3:\
            Y4:\
            Y5:\
            "

#question = "Answer at least five candidates for '?'. "

# Question-Answering
response = openai.Completion.create(
  engine="text-davinci-003",
  prompt=f"{context}",
  max_tokens=300,
  temperature=0.5,
  echo=False
)

answer = response.choices[0].text.strip()
print(response)


{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": "\n\nY1: CN(C)[C@H]1CCNC1C2=CC(=C(C=C2)Br)C#N\nY2: CN(C)[C@H]1CCNC1C2=CC(=C(C=C2)C#N)Br\nY3: CN(C)[C@H]1CCNC1C2=CC(=C(C=C2Cl)Br)C#N\nY4: CN(C)[C@H]1CCNC1C2=CC(=C(C=C2Cl)C#N)Br\nY5: CN(C)[C@H]1CCNC1C2=CC(=C(C=C2)OC)C#N"
    }
  ],
  "created": 1686958921,
  "id": "cmpl-7SDMnIP8oxAxOF4xp1TGKCZNQ05Pq",
  "model": "text-davinci-003",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 173,
    "prompt_tokens": 3018,
    "total_tokens": 3191
  }
}


In [229]:
df = df_smiles_maccs_fps_tnmt
df_test_AB_row = df.loc[(df["A"] == test_A_smiles) \
                                       & (df["B"] == test_B_smiles)]
df_test_AB_row

test_Y_smiles = df_test_AB_row.loc[45, "Y"]
test_Y_maccs_fps2 = df_test_AB_row.loc[45, "maccs_fps_Y"]

text = response["choices"][0]["text"]
start_index = text.index("Candidates for '?' :") + len("Candidates for '?' :")
result = text[start_index:].split("\n")
values = [x.strip()[3:] for x in result if x.strip()]

df_product_Y_candidates = pd.DataFrame({"Y_candidates":values})

df_product_Y_candidates["Y_candidates_mol"] = df_product_Y_candidates["Y_candidates"].\
                                                apply(lambda smiles: generate_mol(smiles))

df_product_Y_candidates["Y_candidates_maccs_fps"] = df_product_Y_candidates["Y_candidates_mol"].\
                                                apply(lambda mol: AllChem.GetMACCSKeysFingerprint(mol))

df_product_Y_candidates["Y_candidates_tnmt"] = df_product_Y_candidates["Y_candidates_maccs_fps"].\
                                                apply(lambda maccs_fps: DataStructs.TanimotoSimilarity(test_Y_maccs_fps2, maccs_fps))
df_product_Y_candidates


ValueError: substring not found

In [None]:
#prompt=f"Question answering:\nContext: {context}\nQuestion: {question}",