### Yを予測するための効率的なトレーニングデータの抽出検討

In [236]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import openai
import openai_api_key
import pubchempy
import random
 
from ord_schema import message_helpers, validations
from ord_schema.proto import dataset_pb2
from rdkit import rdBase, Chem, DataStructs
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem import AllChem, Draw, rdMHFPFingerprint
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem.AtomPairs import Pairs, Torsions

In [312]:
#Buchwald-Hartwig Reaction datasetの読み込み
pb = "./practice/ord_dataset-00005539a1e04c809a9a78647bea649c.pb.gz"  
data = message_helpers.load_message(pb, dataset_pb2.Dataset) #生データであるjson型式のデータファイルとして読み取っている?
df = message_helpers.messages_to_dataframe(data.reactions, \
                                           drop_constant_columns=False) #jsonからdataframeへ変換

#元dfからA+B→Yとなる部分だけ抜き出してDataFrameにする
df = df[[ \
    'inputs["aryl halide"].components[0].identifiers[0].value', \
    'inputs["amine"].components[0].identifiers[0].value', \
    'outcomes[0].products[0].identifiers[0].value' \
    ]] 

df.columns = list('ABY') #列ラベルをわかりやすく A,B,Yに変換
df_buchwald_hardwig_smiles = df.drop_duplicates()\
                                .reset_index()\
                                .drop(columns=["index"])

#SMILESからMOLオブジェクトを生成する関数を定義
def generate_mol(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        return mol
    except:
        return None

# dfのすべてのA,B 要素に対してMolオブジェクトを生成する
df_mol_ABY = df_buchwald_hardwig_smiles.applymap(lambda smiles: generate_mol(smiles))\
                                        .rename(columns={"A": "mol_A", "B": "mol_B", "Y": "mol_Y"})

df_smiles_mol = pd.concat([df_buchwald_hardwig_smiles, df_mol_ABY], axis=1)
df_buchwald_hardwig_smiles


Unnamed: 0,A,B,Y
0,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,CC(C)N1CCNCC1,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...
1,C1=CC=C(C=C1)I,CN1C=NC2=C1C=C(C(=C2F)N)C(=O)OC,CN1C=NC2=C1C=C(C(=C2F)NC3=CC=CC=C3)C(=O)OC
2,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)Cl)C,C1=CC(=CC=C1N)S(=O)(=O)N,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)NC3=CC=C(C=C3)S(=...
3,C1=C(C=NC=C1Br)Br,CC(=O)N1CCNCC1,CC(=O)N1CCN(CC1)C2=CC(=CN=C2)Br
4,CN1CC(OC2=C(C1)C=CC(=N2)Cl)C3=CC=CC=C3,C1=CC(=CC=C1N)N2C=CN=C2,CN1CC(OC2=C(C1)C=CC(=N2)NC3=CC=C(C=C3)N4C=CN=C...
...,...,...,...
468,C1=CC=C(C=C1)C(C2=CC=CC=C2)(C3=CC=CC=C3)N4C=NC...,C1CNCCN1,C1CN(CCN1)C2=CC=CC3=C2N=CN3C(C4=CC=CC=C4)(C5=C...
469,C1=CC(=C(C=C1C(F)(F)F)Br)F,CC(C)(C)OC(=O)N1CCNCC1,CC(C)(C)OC(=O)N1CCN(CC1)C2=C(C=CC(=C2)C(F)(F)F)F
470,C1=CN=C(C=C1C(F)(F)F)Cl,CC(C)(C)OC(=O)N1CCNCC1,CC(C)(C)OC(=O)N1CCN(CC1)C2=NC=CC(=C2)C(F)(F)F
471,C1=CC2=C(C=CC(=C2N=C1)OS(=O)(=O)C(F)(F)F)Cl,CC(C)(C)OC(=O)N1CCNCC1,CC(C)(C)OC(=O)N1CCN(CC1)C2=C3C(=C(C=C2)Cl)C=CC=N3


In [314]:
#ボツ\
'''
newline = '\n'
df_buchwald_hardwig_smiles.insert(3, "n", newline)
df_smiles_stacked = df_buchwald_hardwig_smiles.stack()
df_smiles_stacked
df_smiles_mol_transpose = df_buchwald_hardwig_smiles_n.transpose()
df_smiles_mol_transpose
df_smiles_mol_transpose_stack = df_smiles_mol_transpose.stack()
df_smiles_mol_transpose_stack
'''

'\nnewline = \'\n\'\ndf_buchwald_hardwig_smiles.insert(3, "n", newline)\ndf_smiles_stacked = df_buchwald_hardwig_smiles.stack()\ndf_smiles_stacked\ndf_smiles_mol_transpose = df_buchwald_hardwig_smiles_n.transpose()\ndf_smiles_mol_transpose\ndf_smiles_mol_transpose_stack = df_smiles_mol_transpose.stack()\ndf_smiles_mol_transpose_stack\n'

## Buchwald-Hartwig Reaction データセット の FingerPrint　と　類似性

タニモト係数

Tanimoto Coefficient = c / (a + b - c)

・aは分子Aのビット配列で1が立っている数

・bは分子Bのビット配列で1が立っている数

・cは分子AとBで共通に1が立っている数

DataStructs.TanimotoSimilarity(test_fps, fps)

FingerPrint

1. MACCS Keys

    AllChem.GetMACCSKeysFingerprint(mol)

    166の部分構造について部分構造を有する場合は1が無い場合は0が格納される

In [315]:
#df_molの各要素に対して maccs_fps を生成
df_maccs_fps_ABY = df_mol_ABY.applymap(lambda mol: AllChem.GetMACCSKeysFingerprint(mol))\
                                        .rename(columns={"mol_A": "maccs_fps_A", "mol_B": "maccs_fps_B", "mol_Y": "maccs_fps_Y"})
df_smiles_maccs_fps = pd.concat([df_buchwald_hardwig_smiles, df_maccs_fps_ABY], axis=1)
df_smiles_maccs_fps.head()



Unnamed: 0,A,B,Y,maccs_fps_A,maccs_fps_B,maccs_fps_Y
0,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,CC(C)N1CCNCC1,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,C1=CC=C(C=C1)I,CN1C=NC2=C1C=C(C(=C2F)N)C(=O)OC,CN1C=NC2=C1C=C(C(=C2F)NC3=CC=CC=C3)C(=O)OC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)Cl)C,C1=CC(=CC=C1N)S(=O)(=O)N,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)NC3=CC=C(C=C3)S(=...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,C1=C(C=NC=C1Br)Br,CC(=O)N1CCNCC1,CC(=O)N1CCN(CC1)C2=CC(=CN=C2)Br,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,CN1CC(OC2=C(C1)C=CC(=N2)Cl)C3=CC=CC=C3,C1=CC(=CC=C1N)N2C=CN=C2,CN1CC(OC2=C(C1)C=CC(=N2)NC3=CC=C(C=C3)N4C=CN=C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [319]:
'''
df_dataset = df_smiles_maccs_fps
df_smiles_maccs_fps["tnmt_A"] = df_smiles_maccs_fps["maccs_fps_A"]\
                                .apply(lambda maccs_fps: DataStructs.TanimotoSimilarity(maccs_fps, df_smiles_maccs_fps["maccs_fps_Y"]))
'''

'\ndf_dataset = df_smiles_maccs_fps\ndf_smiles_maccs_fps["tnmt_A"] = df_smiles_maccs_fps["maccs_fps_A"]                                .apply(lambda maccs_fps: DataStructs.TanimotoSimilarity(maccs_fps, df_smiles_maccs_fps["maccs_fps_Y"]))\n'

In [316]:
#テスト分子ABのfpsをランダムに抜き出す
test_data_row = df_smiles_maccs_fps.iloc[45, :]
test_A_maccs_fps = test_data_row["maccs_fps_A"]
test_B_maccs_fps = test_data_row["maccs_fps_B"]
test_Y_maccs_fps = test_data_row["maccs_fps_Y"]

df_smiles_maccs_fps_copy = df_smiles_maccs_fps.copy()
#df_maccs_fps の各要素に対して タニモト係数 を生成
df_smiles_maccs_fps_copy["tnmt_A"] = df_maccs_fps_ABY["maccs_fps_A"].apply(lambda maccs_fps: DataStructs.TanimotoSimilarity(test_A_maccs_fps, maccs_fps))
df_smiles_maccs_fps_copy["tnmt_B"] = df_maccs_fps_ABY["maccs_fps_B"].apply(lambda maccs_fps: DataStructs.TanimotoSimilarity(test_B_maccs_fps, maccs_fps))
df_smiles_maccs_fps_copy["tnmt_Y"] = df_maccs_fps_ABY["maccs_fps_Y"].apply(lambda maccs_fps: DataStructs.TanimotoSimilarity(test_Y_maccs_fps, maccs_fps))
df_smiles_maccs_fps_tnmt = df_smiles_maccs_fps_copy

df_smiles_maccs_fps_tnmt.head()


Unnamed: 0,A,B,Y,maccs_fps_A,maccs_fps_B,maccs_fps_Y,tnmt_A,tnmt_B,tnmt_Y
0,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,CC(C)N1CCNCC1,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.212766,0.717949,0.529412
1,C1=CC=C(C=C1)I,CN1C=NC2=C1C=C(C(=C2F)N)C(=O)OC,CN1C=NC2=C1C=C(C(=C2F)NC3=CC=CC=C3)C(=O)OC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.461538,0.327586,0.473684
2,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)Cl)C,C1=CC(=CC=C1N)S(=O)(=O)N,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)NC3=CC=C(C=C3)S(=...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.3125,0.126984,0.190476
3,C1=C(C=NC=C1Br)Br,CC(=O)N1CCNCC1,CC(=O)N1CCN(CC1)C2=CC(=CN=C2)Br,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.5,0.590909,0.596154
4,CN1CC(OC2=C(C1)C=CC(=N2)Cl)C3=CC=CC=C3,C1=CC(=CC=C1N)N2C=CN=C2,CN1CC(OC2=C(C1)C=CC(=N2)NC3=CC=C(C=C3)N4C=CN=C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.183673,0.340909,0.421875


In [224]:
#ボツ
'''
sr_mol_A = df_buchwald_hardwig_smiles["A"].apply(lambda smiles: generate_mol(smiles))
sr_mol_B = df_buchwald_hardwig_smiles["B"].apply(lambda smiles: generate_mol(smiles))

#sr_molの各要素に対して maccs_fps を生成
sr_maccs_fps_A = sr_mol_A.apply(lambda mol: generate_maccs_fps(mol))
sr_maccs_fps_B = sr_mol_B.apply(lambda mol: generate_maccs_fps(mol))

#sr_maccs_fpsの各要素に対して タニモト係数 を計算
sr_maccs_fps_tnmt_A = sr_maccs_fps_A.apply(lambda fps: calc_fps_tanimoto(test_A_maccs_fps, fps) \
                                           if fps != test_A_maccs_fps else None).dropna()
sr_maccs_fps_tnmt_B = sr_maccs_fps_B.apply(lambda fps: calc_fps_tanimoto(test_B_maccs_fps, fps)  \
                                           if fps != test_B_maccs_fps else None).dropna()

#print(sr_mol_A.describe())
#print(sr_mol_B.describe())


#buchwald_hardwig_smiles_df.insert(0, ":", ":")
#buchwald_hardwig_smiles_df.insert(2, "+", "+")
#buchwald_hardwig_smiles_df.insert(4, "→", "→")
#buchwald_hardwig_smiles_df.insert(6, "\", "\")
#buchwald_hardwig_smiles_df
'''

2. Topologicalフィンガープリント (RDKitフィンガープリント)

    Chem.RDKFingerprint(mol)

    一定の結合数に相当する原子と結合種類を格納(?)

In [403]:
#Topological(Rdkit) Fingerprintを生成する関数を定義 
'''
def generate_rdkit_fps(mol):
    rdkit_fps = Chem.RDKFingerprint(mol)
    return rdkit_fps

#df_molの各要素に対して rdkit_fps を生成
sr_rdkit_fps_A = sr_mol_A.apply(lambda mol: generate_rdkit_fps(mol))
sr_rdkit_fps_B = sr_mol_B.apply(lambda mol: generate_rdkit_fps(mol))
#rdkit_fps_df

#テスト分子ABのfpsをランダムに抜き出す
test_A_rdkit_fps = random.choice(sr_rdkit_fps_A)
test_B_rdkit_fps = random.choice(sr_rdkit_fps_B)

#sr_rdkit_fpsの各要素に対して タニモト係数 を計算
sr_rdkit_fps_tnmt_A = sr_rdkit_fps_A.apply(lambda fps: DataStructs.TanimotoSimilarity(test_A_rdkit_fps, fps) \
                                           if fps != test_A_rdkit_fps else None).dropna()
sr_rdkit_fps_tnmt_B = sr_rdkit_fps_B.apply(lambda fps: DataStructs.TanimotoSimilarity(test_B_rdkit_fps, fps) \
                                           if fps != test_B_rdkit_fps else None).dropna()

print(sr_rdkit_fps_tnmt_A.describe()) 
print(sr_rdkit_fps_tnmt_B.describe())
'''

'\ndef generate_rdkit_fps(mol):\n    rdkit_fps = Chem.RDKFingerprint(mol)\n    return rdkit_fps\n\n#df_molの各要素に対して rdkit_fps を生成\nsr_rdkit_fps_A = sr_mol_A.apply(lambda mol: generate_rdkit_fps(mol))\nsr_rdkit_fps_B = sr_mol_B.apply(lambda mol: generate_rdkit_fps(mol))\n#rdkit_fps_df\n\n#テスト分子ABのfpsをランダムに抜き出す\ntest_A_rdkit_fps = random.choice(sr_rdkit_fps_A)\ntest_B_rdkit_fps = random.choice(sr_rdkit_fps_B)\n\n#sr_rdkit_fpsの各要素に対して タニモト係数 を計算\nsr_rdkit_fps_tnmt_A = sr_rdkit_fps_A.apply(lambda fps: DataStructs.TanimotoSimilarity(test_A_rdkit_fps, fps)                                            if fps != test_A_rdkit_fps else None).dropna()\nsr_rdkit_fps_tnmt_B = sr_rdkit_fps_B.apply(lambda fps: DataStructs.TanimotoSimilarity(test_B_rdkit_fps, fps)                                            if fps != test_B_rdkit_fps else None).dropna()\n\nprint(sr_rdkit_fps_tnmt_A.describe()) \nprint(sr_rdkit_fps_tnmt_B.describe())\n'

3. Morganフィンガープリント (Circularフィンガープリント)

    AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)

    基準となる原子からある距離にある部分構造を数え上げていく

In [402]:
#Morgan Fingerprintを生成する関数を定義 
'''
def generate_morgan_fps(mol, radius, nBits):
    morgan_fps = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
    return morgan_fps

#df_molの各要素に対して maccs_fps を生成
sr_morgan_fps_A = sr_mol_A.apply(lambda mol: generate_morgan_fps(mol, 2, 2048))
sr_morgan_fps_B = sr_mol_B.apply(lambda mol: generate_morgan_fps(mol, 2, 2048))
#morgan_fps_df

#テスト分子ABのfpsをランダムに抜き出す
test_A_morgan_fps = random.choice(sr_morgan_fps_A)
test_B_morgan_fps = random.choice(sr_morgan_fps_B)

#sr_morgan_fpsの各要素に対して タニモト係数 を計
sr_morgan_fps_tnmt_A = sr_morgan_fps_A.apply(lambda fps: DataStructs.TanimotoSimilarityo(test_A_morgan_fps, fps) \
                                           if fps != test_A_morgan_fps else None).dropna()
sr_morgan_fps_tnmt_B = sr_morgan_fps_B.apply(lambda fps: DataStructs.TanimotoSimilarity(test_B_morgan_fps, fps) \
                                           if fps != test_B_morgan_fps else None).dropna()

print(sr_morgan_fps_tnmt_A.describe()) 
print(sr_morgan_fps_tnmt_B.describe())
'''

'\ndef generate_morgan_fps(mol, radius, nBits):\n    morgan_fps = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)\n    return morgan_fps\n\n#df_molの各要素に対して maccs_fps を生成\nsr_morgan_fps_A = sr_mol_A.apply(lambda mol: generate_morgan_fps(mol, 2, 2048))\nsr_morgan_fps_B = sr_mol_B.apply(lambda mol: generate_morgan_fps(mol, 2, 2048))\n#morgan_fps_df\n\n#テスト分子ABのfpsをランダムに抜き出す\ntest_A_morgan_fps = random.choice(sr_morgan_fps_A)\ntest_B_morgan_fps = random.choice(sr_morgan_fps_B)\n\n#sr_morgan_fpsの各要素に対して タニモト係数 を計\nsr_morgan_fps_tnmt_A = sr_morgan_fps_A.apply(lambda fps: DataStructs.TanimotoSimilarityo(test_A_morgan_fps, fps)                                            if fps != test_A_morgan_fps else None).dropna()\nsr_morgan_fps_tnmt_B = sr_morgan_fps_B.apply(lambda fps: DataStructs.TanimotoSimilarity(test_B_morgan_fps, fps)                                            if fps != test_B_morgan_fps else None).dropna()\n\nprint(sr_morgan_fps_tnmt_A.describe()) \nprint(sr_morgan_fps_tnm

## プロンプトの改善

In [427]:
#プロンプロ検証のため統一のテストデータを抽出する
test_data_row = df_buchwald_hardwig_smiles.sample(n=1)
test_A_smiles = test_data_row["A"].values[0]
test_B_smiles = test_data_row["B"].values[0]
test_Y_smiles = test_data_row["Y"].values[0]

print(test_data_row)
print(test_A_smiles)
print(test_B_smiles)

                    A          B                          Y
140  C1=CC(=CC=C1O)Br  CN1CCNCC1  CN1CCN(CC1)C2=CC=C(C=C2)O
C1=CC(=CC=C1O)Br
CN1CCNCC1


In [415]:
#トレーニングデータを任意行抽出する
df = df_buchwald_hardwig_smiles.copy()
df_training_data_smiles = df.sample(n=35)
#df_training_data_smiles

training_dataset_35 = \
"This is training dataset (A + B → Y):\n\
"
for _, row in df_training_data_smiles.iterrows():
    template = "A: " + row['A'] + "\\" + "\n" + "B: " + row['B'] + "\\" + "\n" + "Y: " + row['Y'] + "\\" + "\n" + "\\" + "\n"
    training_dataset_35 += template

In [434]:
#テストデータからタニモト係数を算出して上位からトレーニングデータを抽出する
df = df_buchwald_hardwig_smiles.copy()

#テストデータのmaccs fps
test_A_maccs_fps= AllChem.GetMACCSKeysFingerprint(Chem.MolFromSmiles(test_A_smiles))
test_B_maccs_fps= AllChem.GetMACCSKeysFingerprint(Chem.MolFromSmiles(test_B_smiles))
test_Y_maccs_fps= AllChem.GetMACCSKeysFingerprint(Chem.MolFromSmiles(test_Y_smiles))

df_smiles_maccs_fps_copy2 = df_smiles_maccs_fps.copy()
df_smiles_maccs_fps_copy2["tnmt_A"] = df_smiles_maccs_fps["maccs_fps_A"].apply(lambda maccs_fps: DataStructs.TanimotoSimilarity(test_A_maccs_fps, maccs_fps))
df_smiles_maccs_fps_copy2["tnmt_B"] = df_smiles_maccs_fps["maccs_fps_B"].apply(lambda maccs_fps: DataStructs.TanimotoSimilarity(test_B_maccs_fps, maccs_fps))
df_smiles_maccs_fps_copy2["tnmt_Y"] = df_smiles_maccs_fps["maccs_fps_Y"].apply(lambda maccs_fps: DataStructs.TanimotoSimilarity(test_Y_maccs_fps, maccs_fps))
df_smiles_maccs_fps_tnmt = df_smiles_maccs_fps_copy2.sort_values("tnmt_A", ascending=False)
df_smiles_maccs_fps_tnmt.head()

df_rank_tnmt_A_35 = df_smiles_maccs_fps_tnmt.iloc[:34, 0:3]

training_dataset_rank_tnmt_A = \
"This is training dataset (A + B → Y):\n\
"
for _, row in df_rank_tnmt_A_35.iterrows():
    template = "A: " + row['A'] + "\\" + "\n" + "B: " + row['B'] + "\\" + "\n" + "Y: " + row['Y'] + "\\" + "\n" + "\\" + "\n"
    training_dataset_rank_tnmt_A += template

df_rank_tnmt_A_35

Unnamed: 0,A,B,Y
140,C1=CC(=CC=C1O)Br,CN1CCNCC1,CN1CCN(CC1)C2=CC=C(C=C2)O
409,C1=CC(=CC=C1O)Br,C[C@@H]1CN(CCN1)C(=O)OC(C)(C)C,C[C@@H]1CN(CCN1C2=CC=C(C=C2)O)C(=O)OC(C)(C)C
161,C1=CC(=CC=C1O)Br,C1COCCN1,C1COCCN1C2=CC=C(C=C2)O
314,COC1=CC=C(C=C1)Br,C1COCCC1CN,COC1=CC=C(C=C1)NCC2CCOCC2
170,COC1=CC(=CC=C1)Br,C1CNCCC1C#N,COC1=CC=CC(=C1)N2CCC(CC2)C#N
222,COC1=CC(=CC=C1)Br,CN(C)C(=O)C1=CC2=C(C(=C1)C3CCCN3)OC(=CC2=O)N4C...,CN(C)C(=O)C1=CC2=C(C(=C1)C3CCCN3C4=CC(=CC=C4)O...
244,COC1=CC(=CC=C1)Br,CC1=CC2=C(N1)C=C(C=C2)OC(F)(F)F,CC1=CC2=C(N1C3=CC(=CC=C3)OC)C=C(C=C2)OC(F)(F)F
171,COC1=CC(=CC=C1)Br,C1CNCCC1C(=O)N,COC1=CC=CC(=C1)N2CCC(CC2)C(=O)N
267,COC1=CC(=CC=C1)Br,COC(=O)C1=CC2=C(C(=C1)C3CCCN3)OC(=CC2=O)N4CCOCC4,COC1=CC=CC(=C1)N2CCCC2C3=CC(=CC4=C3OC(=CC4=O)N...
93,COC1=CC=C(C=C1)Br,CC(C)(C(=O)OC)N,CC(C)(C(=O)OC)NC1=CC=C(C=C1)OC


関数:　openai.Completion.create\
モデル: text-davinci-003

In [437]:
from context2 import training_dataset
openai.api_key = "sk-"

context = f"{training_dataset_rank_tnmt_A}\
            \
            A: {test_A_smiles}\
            B: {test_B_smiles}\
            Y1:\
            Y2:\
            Y3:\
            Y4:\
            Y5:\
            "

question = "Answer at least five candidates for 'y1 to y5'."

# Question-Answering
response = openai.Completion.create(
  engine="text-davinci-003",
  prompt=f"{context}+\n\+{question}",
  max_tokens=300,
  temperature=0.5,
  echo=False
)

print(response)

{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": "\n\nY1: CN1CCN(CC1)C2=CC=C(C=C2)O\nY2: C[C@@H]1CN(CCN1C2=CC=C(C=C2)O)C(=O)OC(C)(C)C\nY3: C1COCCN1C2=CC=C(C=C2)O\nY4: COC1=CC=CC(=C1)N2CCC(CC2)C#N\nY5: COC1=CC=CC(=C1)N2CCCC2C3=CC(=CC4=C3OC(=CC4=O)N5CCOCC5)C(=O)OC"
    }
  ],
  "created": 1687010671,
  "id": "cmpl-7SQpTuE6dZjB6aDNULBE9i1Gy8k3e",
  "model": "text-davinci-003",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 170,
    "prompt_tokens": 2897,
    "total_tokens": 3067
  }
}


In [438]:
# 上記のresponseからYのSMILESだけ取り出す
import re

text = response["choices"][0]["text"]
pattern = r":\s(.+)" #テンプレートの作成
product_Y_candidates = re.findall(pattern, text) #テンプレートをもとにSMILESを抜き出す
#product_Y_candidates = ["CN(C)[C@H]1CCN(C2=C(C=CC=C2)C#N)C1"]
print(product_Y_candidates)

['CN1CCN(CC1)C2=CC=C(C=C2)O', 'C[C@@H]1CN(CCN1C2=CC=C(C=C2)O)C(=O)OC(C)(C)C', 'C1COCCN1C2=CC=C(C=C2)O', 'COC1=CC=CC(=C1)N2CCC(CC2)C#N', 'COC1=CC=CC(=C1)N2CCCC2C3=CC(=CC4=C3OC(=CC4=O)N5CCOCC5)C(=O)OC']


In [442]:
#正解のYから予想されたYのタニモト係数を計算する
df_product_Y_candidates = pd.DataFrame({"Y_candidates":product_Y_candidates})

df_product_Y_candidates["Y_candidates_mol"] = df_product_Y_candidates["Y_candidates"].\
                                                apply(lambda smiles: generate_mol(smiles))

df_product_Y_candidates["Y_candidates_maccs_fps"] = df_product_Y_candidates["Y_candidates_mol"].\
                                                apply(lambda mol: AllChem.GetMACCSKeysFingerprint(mol))

df_product_Y_candidates["Y_candidates_tnmt"] = df_product_Y_candidates["Y_candidates_maccs_fps"].\
                                                apply(lambda maccs_fps: DataStructs.TanimotoSimilarity(test_Y_maccs_fps, maccs_fps))
df_product_Y_candidates.sort_values('Y_candidates_tnmt', ascending=False)


Unnamed: 0,Y_candidates,Y_candidates_mol,Y_candidates_maccs_fps,Y_candidates_tnmt
0,CN1CCN(CC1)C2=CC=C(C=C2)O,<rdkit.Chem.rdchem.Mol object at 0x147ab5e70>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
3,COC1=CC=CC(=C1)N2CCC(CC2)C#N,<rdkit.Chem.rdchem.Mol object at 0x147ab5770>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.813953
2,C1COCCN1C2=CC=C(C=C2)O,<rdkit.Chem.rdchem.Mol object at 0x147ab6ce0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.755556
1,C[C@@H]1CN(CCN1C2=CC=C(C=C2)O)C(=O)OC(C)(C)C,<rdkit.Chem.rdchem.Mol object at 0x147ab49e0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.696429
4,COC1=CC=CC(=C1)N2CCCC2C3=CC(=CC4=C3OC(=CC4=O)N...,<rdkit.Chem.rdchem.Mol object at 0x147ab7370>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.571429


関数:　openai.ChatCompletion.create\
モデル: gpt-3.5-turbo-16k

https://platform.openai.com/docs/guides/gpt/chat-completions-vs-completions

In [447]:
question = f"A: {test_A_smiles}\
            B: {test_B_smiles}\
            Y1:\
            Y2:\
            Y3:\
            Y4:\
            Y5:\
                "

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        #{"role": "system", "content": "Synthesize compound Y from test data compounds A and B."},
        {"role": "assistant", "content": f"{training_dataset_rank_tnmt_A}+\n\+{question}"},
        {"role": "user", "content": f"Answer at least five candidates for 'y1 to y5'."}],
        max_tokens=300,
        temperature=0,
)

print(response)

{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "Y1: CN1CCN(CC1)C2=CC=C(C=C2)O\\\nY2: C[C@@H]1CN(CCN1C2=CC=C(C=C2)O)C(=O)OC(C)(C)C\\\nY3: C1COCCN1C2=CC=C(C=C2)O\\\nY4: COC1=CC=C(C=C1)NCC2CCOCC2\\\nY5: COC1=CC=CC(=C1)N2CCC(CC2)C#N",
        "role": "assistant"
      }
    }
  ],
  "created": 1687010933,
  "id": "chatcmpl-7SQthtDdrJpAXV4bOA80WwLuc3Lfv",
  "model": "gpt-3.5-turbo-0301",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 122,
    "prompt_tokens": 2472,
    "total_tokens": 2594
  }
}


In [456]:
import re

text = response["choices"][0]["message"]["content"]
pattern = r"(?<=:\s)[A-Za-z0-9@+\-\[\]\(\)\\=#$]+"
matches = re.findall(pattern, text)

smiles_list = [match for match in matches if len(match) > 1]

#product_Y_candidates = [match.replace("\\", "") for match in product_Y_candidates]

print(smiles_list)

['CN1CCN(CC1)C2=CC=C(C=C2)O\\', 'C[C@@H]1CN(CCN1C2=CC=C(C=C2)O)C(=O)OC(C)(C)C\\', 'C1COCCN1C2=CC=C(C=C2)O\\', 'COC1=CC=C(C=C1)NCC2CCOCC2\\', 'COC1=CC=CC(=C1)N2CCC(CC2)C#N']


In [446]:
#正解のYから予想されたYのタニモト係数を計算する
df_product_Y_candidates = pd.DataFrame({"Y_candidates":product_Y_candidates})

df_product_Y_candidates["Y_candidates_mol"] = df_product_Y_candidates["Y_candidates"].\
                                                apply(lambda smiles: generate_mol(smiles))

df_product_Y_candidates["Y_candidates_maccs_fps"] = df_product_Y_candidates["Y_candidates_mol"].\
                                                apply(lambda mol: AllChem.GetMACCSKeysFingerprint(mol))

df_product_Y_candidates["Y_candidates_tnmt"] = df_product_Y_candidates["Y_candidates_maccs_fps"].\
                                                apply(lambda maccs_fps: DataStructs.TanimotoSimilarity(test_Y_maccs_fps, maccs_fps))
df_product_Y_candidates

[23:08:05] SMILES Parse Error: syntax error while parsing: CN1CCN(CC1)C2=CC=C(C=C2)O\
[23:08:05] SMILES Parse Error: Failed parsing SMILES 'CN1CCN(CC1)C2=CC=C(C=C2)O\' for input: 'CN1CCN(CC1)C2=CC=C(C=C2)O\'
[23:08:05] SMILES Parse Error: syntax error while parsing: CN1CCN(CC1)C2=C(C=C(C=C2)O)Br\
[23:08:05] SMILES Parse Error: Failed parsing SMILES 'CN1CCN(CC1)C2=C(C=C(C=C2)O)Br\' for input: 'CN1CCN(CC1)C2=C(C=C(C=C2)O)Br\'
[23:08:05] SMILES Parse Error: syntax error while parsing: CN1CCN(CC1)C2=CC=C(C=C2)I\
[23:08:05] SMILES Parse Error: Failed parsing SMILES 'CN1CCN(CC1)C2=CC=C(C=C2)I\' for input: 'CN1CCN(CC1)C2=CC=C(C=C2)I\'
[23:08:05] SMILES Parse Error: syntax error while parsing: CN1CCN(CC1)C2=C(C=C(C=C2)Cl)Br\
[23:08:05] SMILES Parse Error: Failed parsing SMILES 'CN1CCN(CC1)C2=C(C=C(C=C2)Cl)Br\' for input: 'CN1CCN(CC1)C2=C(C=C(C=C2)Cl)Br\'


ArgumentError: Python argument types in
    rdkit.Chem.rdMolDescriptors.GetMACCSKeysFingerprint(NoneType)
did not match C++ signature:
    GetMACCSKeysFingerprint(RDKit::ROMol mol)

In [341]:
#ボツ
'''
for N in range(1, 101):
template = "A: {A" + str(N) + "}\\" + "\n" + "B: {B" + str(N) + "}\\" + "\n" + "Y: {Y" + str(N) + "}\\" + "\n" + "\\"
print(template)
'''

'\nfor N in range(1, 101):\ntemplate = "A: {A" + str(N) + "}\\" + "\n" + "B: {B" + str(N) + "}\\" + "\n" + "Y: {Y" + str(N) + "}\\" + "\n" + "\\"\nprint(template)\n'