### Yを予測するための効率的なトレーニングデータの抽出検討

In [236]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import openai
import openai_api_key
import pubchempy
import random
 
from ord_schema import message_helpers, validations
from ord_schema.proto import dataset_pb2
from rdkit import rdBase, Chem, DataStructs
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem import AllChem, Draw, rdMHFPFingerprint
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem.AtomPairs import Pairs, Torsions

In [312]:
# Load the Buchwald-Hartwig reaction dataset (the path ends in .pb.gz).
pb = "./practice/ord_dataset-00005539a1e04c809a9a78647bea649c.pb.gz"
# Parse the file into a dataset_pb2.Dataset message.
data = message_helpers.load_message(pb, dataset_pb2.Dataset)
# Flatten the reaction messages into a DataFrame, keeping constant columns.
df = message_helpers.messages_to_dataframe(
    data.reactions,
    drop_constant_columns=False,
)

# Keep only the three columns that describe A + B -> Y: the first identifier
# value of the aryl halide, of the amine, and of the first product.
df = df[
    [
        'inputs["aryl halide"].components[0].identifiers[0].value',
        'inputs["amine"].components[0].identifiers[0].value',
        'outcomes[0].products[0].identifiers[0].value',
    ]
]

# Short, readable column labels.
df.columns = ["A", "B", "Y"]
# Remove duplicated (A, B, Y) rows and renumber the index from 0.
df_buchwald_hardwig_smiles = df.drop_duplicates().reset_index(drop=True)

# Build an RDKit Mol object from a SMILES string.
def generate_mol(smiles):
    """Convert a SMILES string into an RDKit Mol object.

    Args:
        smiles: SMILES text. Any non-string value (for example None or a
            NaN coming from an empty DataFrame cell) is tolerated.

    Returns:
        Whatever ``Chem.MolFromSmiles(smiles)`` returns for a string input,
        or None when ``smiles`` is not a string.
    """
    # Explicit guard instead of a bare ``except:`` so that unrelated errors
    # (including KeyboardInterrupt) are no longer silently swallowed.
    if not isinstance(smiles, str):
        return None
    return Chem.MolFromSmiles(smiles)

# Turn every cell of the A, B and Y columns into a Mol object via generate_mol
# and relabel the columns accordingly.
df_mol_ABY = df_buchwald_hardwig_smiles.applymap(generate_mol).rename(
    columns={"A": "mol_A", "B": "mol_B", "Y": "mol_Y"}
)

# Side-by-side table: the original columns followed by the Mol columns.
df_smiles_mol = pd.concat(
    [df_buchwald_hardwig_smiles, df_mol_ABY],
    axis="columns",
)
df_buchwald_hardwig_smiles


Unnamed: 0,A,B,Y
0,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,CC(C)N1CCNCC1,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...
1,C1=CC=C(C=C1)I,CN1C=NC2=C1C=C(C(=C2F)N)C(=O)OC,CN1C=NC2=C1C=C(C(=C2F)NC3=CC=CC=C3)C(=O)OC
2,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)Cl)C,C1=CC(=CC=C1N)S(=O)(=O)N,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)NC3=CC=C(C=C3)S(=...
3,C1=C(C=NC=C1Br)Br,CC(=O)N1CCNCC1,CC(=O)N1CCN(CC1)C2=CC(=CN=C2)Br
4,CN1CC(OC2=C(C1)C=CC(=N2)Cl)C3=CC=CC=C3,C1=CC(=CC=C1N)N2C=CN=C2,CN1CC(OC2=C(C1)C=CC(=N2)NC3=CC=C(C=C3)N4C=CN=C...
...,...,...,...
468,C1=CC=C(C=C1)C(C2=CC=CC=C2)(C3=CC=CC=C3)N4C=NC...,C1CNCCN1,C1CN(CCN1)C2=CC=CC3=C2N=CN3C(C4=CC=CC=C4)(C5=C...
469,C1=CC(=C(C=C1C(F)(F)F)Br)F,CC(C)(C)OC(=O)N1CCNCC1,CC(C)(C)OC(=O)N1CCN(CC1)C2=C(C=CC(=C2)C(F)(F)F)F
470,C1=CN=C(C=C1C(F)(F)F)Cl,CC(C)(C)OC(=O)N1CCNCC1,CC(C)(C)OC(=O)N1CCN(CC1)C2=NC=CC(=C2)C(F)(F)F
471,C1=CC2=C(C=CC(=C2N=C1)OS(=O)(=O)C(F)(F)F)Cl,CC(C)(C)OC(=O)N1CCNCC1,CC(C)(C)OC(=O)N1CCN(CC1)C2=C3C(=C(C=C2)Cl)C=CC=N3


In [314]:
# Rejected approach: the code below is disabled by wrapping it in a string literal.
# NOTE(review): it refers to `df_buchwald_hardwig_smiles_n`, which is not
# assigned anywhere in the visible source.
'''
newline = '\n'
df_buchwald_hardwig_smiles.insert(3, "n", newline)
df_smiles_stacked = df_buchwald_hardwig_smiles.stack()
df_smiles_stacked
df_smiles_mol_transpose = df_buchwald_hardwig_smiles_n.transpose()
df_smiles_mol_transpose
df_smiles_mol_transpose_stack = df_smiles_mol_transpose.stack()
df_smiles_mol_transpose_stack
'''

'\nnewline = \'\n\'\ndf_buchwald_hardwig_smiles.insert(3, "n", newline)\ndf_smiles_stacked = df_buchwald_hardwig_smiles.stack()\ndf_smiles_stacked\ndf_smiles_mol_transpose = df_buchwald_hardwig_smiles_n.transpose()\ndf_smiles_mol_transpose\ndf_smiles_mol_transpose_stack = df_smiles_mol_transpose.stack()\ndf_smiles_mol_transpose_stack\n'

## Buchwald-Hartwig Reaction データセット の FingerPrint　と　類似性

タニモト係数

Tanimoto Coefficient = c / (a + b - c)

・aは分子Aのビット配列で1が立っている数

・bは分子Bのビット配列で1が立っている数

・cは分子AとBで共通に1が立っている数

DataStructs.TanimotoSimilarity(test_fps, fps)

FingerPrint

1. MACCS Keys

    AllChem.GetMACCSKeysFingerprint(mol)

    166種類の部分構造それぞれについて、その部分構造を有する場合は1、無い場合は0が格納される

In [315]:
# Compute a MACCS-keys fingerprint for every Mol object in df_mol_ABY.
df_maccs_fps_ABY = df_mol_ABY.applymap(AllChem.GetMACCSKeysFingerprint).rename(
    columns={
        "mol_A": "maccs_fps_A",
        "mol_B": "maccs_fps_B",
        "mol_Y": "maccs_fps_Y",
    }
)
# Original A/B/Y columns followed by the matching fingerprint columns.
df_smiles_maccs_fps = pd.concat(
    [df_buchwald_hardwig_smiles, df_maccs_fps_ABY],
    axis="columns",
)
df_smiles_maccs_fps.head()

Unnamed: 0,A,B,Y,maccs_fps_A,maccs_fps_B,maccs_fps_Y
0,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,CC(C)N1CCNCC1,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,C1=CC=C(C=C1)I,CN1C=NC2=C1C=C(C(=C2F)N)C(=O)OC,CN1C=NC2=C1C=C(C(=C2F)NC3=CC=CC=C3)C(=O)OC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)Cl)C,C1=CC(=CC=C1N)S(=O)(=O)N,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)NC3=CC=C(C=C3)S(=...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,C1=C(C=NC=C1Br)Br,CC(=O)N1CCNCC1,CC(=O)N1CCN(CC1)C2=CC(=CN=C2)Br,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,CN1CC(OC2=C(C1)C=CC(=N2)Cl)C3=CC=CC=C3,C1=CC(=CC=C1N)N2C=CN=C2,CN1CC(OC2=C(C1)C=CC(=N2)NC3=CC=C(C=C3)N4C=CN=C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# NOTE(review): this cell is marked "In [None]" (never executed) and reads
# test_A_maccs_fps, which is only assigned in the cell that follows it.
# df_dataset is a second name for df_smiles_maccs_fps, not a copy.
df_dataset = df_smiles_maccs_fps
# Tanimoto similarity of each A fingerprint to the test A fingerprint; the new
# column is written into df_smiles_maccs_fps itself.
df_smiles_maccs_fps["tnmt_A"] = df_maccs_fps_ABY["maccs_fps_A"].apply(lambda maccs_fps: DataStructs.TanimotoSimilarity(test_A_maccs_fps, maccs_fps))

In [316]:
# Use the reaction at row position 45 as the test case (a fixed position, not
# a random draw) and pull out its three fingerprints.
test_data_row = df_smiles_maccs_fps.iloc[45]
test_A_maccs_fps, test_B_maccs_fps, test_Y_maccs_fps = (
    test_data_row["maccs_fps_A"],
    test_data_row["maccs_fps_B"],
    test_data_row["maccs_fps_Y"],
)

# Build a new table (df_smiles_maccs_fps is left untouched) with one Tanimoto
# similarity column per role, each measured against the test fingerprint.
df_smiles_maccs_fps_copy = df_smiles_maccs_fps.assign(
    tnmt_A=df_maccs_fps_ABY["maccs_fps_A"].apply(
        lambda fps: DataStructs.TanimotoSimilarity(test_A_maccs_fps, fps)
    ),
    tnmt_B=df_maccs_fps_ABY["maccs_fps_B"].apply(
        lambda fps: DataStructs.TanimotoSimilarity(test_B_maccs_fps, fps)
    ),
    tnmt_Y=df_maccs_fps_ABY["maccs_fps_Y"].apply(
        lambda fps: DataStructs.TanimotoSimilarity(test_Y_maccs_fps, fps)
    ),
)
df_smiles_maccs_fps_tnmt = df_smiles_maccs_fps_copy

df_smiles_maccs_fps_tnmt.head()


Unnamed: 0,A,B,Y,maccs_fps_A,maccs_fps_B,maccs_fps_Y,tnmt_A,tnmt_B,tnmt_Y
0,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,CC(C)N1CCNCC1,CCOC1=C(C=C2C(=C1)N=CC(=C2NC3=C(C=C(C=C3)F)F)C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.212766,0.717949,0.529412
1,C1=CC=C(C=C1)I,CN1C=NC2=C1C=C(C(=C2F)N)C(=O)OC,CN1C=NC2=C1C=C(C(=C2F)NC3=CC=CC=C3)C(=O)OC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.461538,0.327586,0.473684
2,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)Cl)C,C1=CC(=CC=C1N)S(=O)(=O)N,CC1=NC(=C(C=C1)OC2=CC(=NC=C2)NC3=CC=C(C=C3)S(=...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.3125,0.126984,0.190476
3,C1=C(C=NC=C1Br)Br,CC(=O)N1CCNCC1,CC(=O)N1CCN(CC1)C2=CC(=CN=C2)Br,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.5,0.590909,0.596154
4,CN1CC(OC2=C(C1)C=CC(=N2)Cl)C3=CC=CC=C3,C1=CC(=CC=C1N)N2C=CN=C2,CN1CC(OC2=C(C1)C=CC(=N2)NC3=CC=C(C=C3)N4C=CN=C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.183673,0.340909,0.421875


In [224]:
# Rejected approach: the code below is disabled by wrapping it in a string literal.
# NOTE(review): it calls generate_maccs_fps and calc_fps_tanimoto, neither of
# which is defined anywhere in the visible source.
'''
sr_mol_A = df_buchwald_hardwig_smiles["A"].apply(lambda smiles: generate_mol(smiles))
sr_mol_B = df_buchwald_hardwig_smiles["B"].apply(lambda smiles: generate_mol(smiles))

#sr_molの各要素に対して maccs_fps を生成
sr_maccs_fps_A = sr_mol_A.apply(lambda mol: generate_maccs_fps(mol))
sr_maccs_fps_B = sr_mol_B.apply(lambda mol: generate_maccs_fps(mol))

#sr_maccs_fpsの各要素に対して タニモト係数 を計算
sr_maccs_fps_tnmt_A = sr_maccs_fps_A.apply(lambda fps: calc_fps_tanimoto(test_A_maccs_fps, fps) \
                                           if fps != test_A_maccs_fps else None).dropna()
sr_maccs_fps_tnmt_B = sr_maccs_fps_B.apply(lambda fps: calc_fps_tanimoto(test_B_maccs_fps, fps)  \
                                           if fps != test_B_maccs_fps else None).dropna()

#print(sr_mol_A.describe())
#print(sr_mol_B.describe())


#buchwald_hardwig_smiles_df.insert(0, ":", ":")
#buchwald_hardwig_smiles_df.insert(2, "+", "+")
#buchwald_hardwig_smiles_df.insert(4, "→", "→")
#buchwald_hardwig_smiles_df.insert(6, "\", "\")
#buchwald_hardwig_smiles_df
'''

2. Topologicalフィンガープリント (RDKitフィンガープリント)

    Chem.RDKFingerprint(mol)

    一定の結合数（既定では1〜7結合）までの経路（部分グラフ）を列挙し、経路上の原子と結合の種類をハッシュ化してビット列に格納する

In [225]:
# Topological (RDKit) fingerprint experiment; the code below is disabled by
# wrapping it in a string literal.
# NOTE(review): it reads sr_mol_A / sr_mol_B, which are only assigned inside
# another disabled string literal in the visible source.
'''
def generate_rdkit_fps(mol):
    rdkit_fps = Chem.RDKFingerprint(mol)
    return rdkit_fps

#df_molの各要素に対して rdkit_fps を生成
sr_rdkit_fps_A = sr_mol_A.apply(lambda mol: generate_rdkit_fps(mol))
sr_rdkit_fps_B = sr_mol_B.apply(lambda mol: generate_rdkit_fps(mol))
#rdkit_fps_df

#テスト分子ABのfpsをランダムに抜き出す
test_A_rdkit_fps = random.choice(sr_rdkit_fps_A)
test_B_rdkit_fps = random.choice(sr_rdkit_fps_B)

#sr_rdkit_fpsの各要素に対して タニモト係数 を計算
sr_rdkit_fps_tnmt_A = sr_rdkit_fps_A.apply(lambda fps: DataStructs.TanimotoSimilarity(test_A_rdkit_fps, fps) \
                                           if fps != test_A_rdkit_fps else None).dropna()
sr_rdkit_fps_tnmt_B = sr_rdkit_fps_B.apply(lambda fps: DataStructs.TanimotoSimilarity(test_B_rdkit_fps, fps) \
                                           if fps != test_B_rdkit_fps else None).dropna()

print(sr_rdkit_fps_tnmt_A.describe()) 
print(sr_rdkit_fps_tnmt_B.describe())
'''

count    465.000000
mean       0.166600
std        0.107268
min        0.019231
25%        0.081212
50%        0.142077
75%        0.248390
max        0.932450
Name: A, dtype: float64
count    472.000000
mean       0.102754
std        0.061207
min        0.000000
25%        0.050186
50%        0.101266
75%        0.142972
max        0.283859
Name: B, dtype: float64


3. Morganフィンガープリント (Circularフィンガープリント)

    AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)

    基準となる原子からある距離にある部分構造を数え上げていく

In [226]:
# Morgan fingerprint experiment; the code below is disabled by wrapping it in
# a string literal.
# NOTE(review): the disabled code calls `DataStructs.TanimotoSimilarityo`
# (trailing "o") for molecule A, while molecule B uses `TanimotoSimilarity`;
# fix the name before re-enabling it.
'''
def generate_morgan_fps(mol, radius, nBits):
    morgan_fps = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
    return morgan_fps

#df_molの各要素に対して maccs_fps を生成
sr_morgan_fps_A = sr_mol_A.apply(lambda mol: generate_morgan_fps(mol, 2, 2048))
sr_morgan_fps_B = sr_mol_B.apply(lambda mol: generate_morgan_fps(mol, 2, 2048))
#morgan_fps_df

#テスト分子ABのfpsをランダムに抜き出す
test_A_morgan_fps = random.choice(sr_morgan_fps_A)
test_B_morgan_fps = random.choice(sr_morgan_fps_B)

#sr_morgan_fpsの各要素に対して タニモト係数 を計
sr_morgan_fps_tnmt_A = sr_morgan_fps_A.apply(lambda fps: DataStructs.TanimotoSimilarityo(test_A_morgan_fps, fps) \
                                           if fps != test_A_morgan_fps else None).dropna()
sr_morgan_fps_tnmt_B = sr_morgan_fps_B.apply(lambda fps: DataStructs.TanimotoSimilarity(test_B_morgan_fps, fps) \
                                           if fps != test_B_morgan_fps else None).dropna()

print(sr_morgan_fps_tnmt_A.describe()) 
print(sr_morgan_fps_tnmt_B.describe())
'''

count    472.000000
mean       0.122017
std        0.049253
min        0.056338
25%        0.090909
50%        0.116279
75%        0.139535
max        0.432432
Name: A, dtype: float64
count    449.000000
mean       0.078788
std        0.103687
min        0.000000
25%        0.000000
50%        0.025000
75%        0.151515
max        0.416667
Name: B, dtype: float64


## プロンプトの改善

In [310]:
# Values of the A and B columns for the selected test row.
test_A_smiles, test_B_smiles = test_data_row["A"], test_data_row["B"]

# Show both values, one per line.
print(test_A_smiles, test_B_smiles, sep="\n")

C1=CC(=C(C=C1Cl)Br)C#N
CN(C)[C@H]1CCNC1


In [307]:
from context2 import training_dataset

# Take the API key from the environment instead of hard-coding a secret in the
# notebook; a missing variable fails immediately with KeyError.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Few-shot prompt: the training examples, then the test reactants A and B,
# then five empty slots (Y1..Y5) for the model to fill in.
# NOTE(review): every trailing backslash inside this literal joins the next
# line onto the current one, so the prompt contains no newline characters,
# only runs of spaces between fields. Confirm that this is intended.
context = f"{training_dataset}\
            \
            A: {test_A_smiles}\
            B: {test_B_smiles}\
            Y1:\
            Y2:\
            Y3:\
            Y4:\
            Y5:\
            "

# Request a deterministic (temperature 0) completion of at most 300 tokens,
# without echoing the prompt back.
response = openai.Completion.create(
    engine="text-davinci-003",
    prompt=context,
    max_tokens=300,
    temperature=0,
    echo=False,
)

print(response)

{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": "\n\nY1: C1=CC(=C(C=C1Cl)Br)CN(C)[C@H]2CCNC2\nY2: CN(C)[C@H]1CCNC1C2=CC(=C(C=C2)Br)C#N\nY3: CN(C)[C@H]1CCNC1C2=CC(=C(C=C2)C#N)Br\nY4: C1=CC(=C(C=C1Br)C#N)CN(C)[C@H]2CCNC2\nY5: CN(C)[C@H]1CCNC1C2=CC(=C(C=C2)C#N)C3=CC(=C(C=C3Br)C#N)CN(C)[C@H]4CCNC4"
    }
  ],
  "created": 1686964137,
  "id": "cmpl-7SEivOI2VdfqvQOIhalUZMdG2qE98",
  "model": "text-davinci-003",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 198,
    "prompt_tokens": 3014,
    "total_tokens": 3212
  }
}


In [308]:
# Pull the candidate product strings out of the completion text.
import re

text = response["choices"][0]["text"]
# A colon, one whitespace character, then the rest of the line is captured.
pattern = r":\s(.+)"
product_Y_candidates = re.compile(pattern).findall(text)

print(product_Y_candidates)

['C1=CC(=C(C=C1Cl)Br)CN(C)[C@H]2CCNC2', 'CN(C)[C@H]1CCNC1C2=CC(=C(C=C2)Br)C#N', 'CN(C)[C@H]1CCNC1C2=CC(=C(C=C2)C#N)Br', 'C1=CC(=C(C=C1Br)C#N)CN(C)[C@H]2CCNC2', 'CN(C)[C@H]1CCNC1C2=CC(=C(C=C2)C#N)C3=CC(=C(C=C3Br)C#N)CN(C)[C@H]4CCNC4']


In [309]:
# Score each predicted Y against the true Y with the Tanimoto similarity of
# their MACCS-keys fingerprints.

df = df_smiles_maccs_fps_tnmt
# Locate the test reaction by its A and B values.
df_test_AB_row = df.loc[(df["A"] == test_A_smiles) & (df["B"] == test_B_smiles)]

# True product of the test reaction. The row is addressed through the test
# row's own index label rather than a hard-coded 45, so picking a different
# test row does not silently read the wrong reaction.
test_Y_smiles = df_test_AB_row.loc[test_data_row.name, "Y"]
test_Y_maccs_fps2 = df_test_AB_row.loc[test_data_row.name, "maccs_fps_Y"]

# One row per predicted candidate.
df_product_Y_candidates = pd.DataFrame({"Y_candidates": product_Y_candidates})

df_product_Y_candidates["Y_candidates_mol"] = (
    df_product_Y_candidates["Y_candidates"].apply(generate_mol)
)

# Candidates are model-generated text, so generate_mol may return None for
# some of them. Those rows keep None / NaN instead of raising and losing the
# scores of the parsable candidates.
df_product_Y_candidates["Y_candidates_maccs_fps"] = (
    df_product_Y_candidates["Y_candidates_mol"].apply(
        lambda mol: None if mol is None else AllChem.GetMACCSKeysFingerprint(mol)
    )
)

df_product_Y_candidates["Y_candidates_tnmt"] = (
    df_product_Y_candidates["Y_candidates_maccs_fps"].apply(
        lambda maccs_fps: np.nan
        if maccs_fps is None
        else DataStructs.TanimotoSimilarity(test_Y_maccs_fps2, maccs_fps)
    )
)
df_product_Y_candidates


Unnamed: 0,Y_candidates,Y_candidates_mol,Y_candidates_maccs_fps,Y_candidates_tnmt
0,C1=CC(=C(C=C1Cl)Br)CN(C)[C@H]2CCNC2,<rdkit.Chem.rdchem.Mol object at 0x1518f5540>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.7
1,CN(C)[C@H]1CCNC1C2=CC(=C(C=C2)Br)C#N,<rdkit.Chem.rdchem.Mol object at 0x1518f73e0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.765957
2,CN(C)[C@H]1CCNC1C2=CC(=C(C=C2)C#N)Br,<rdkit.Chem.rdchem.Mol object at 0x1518f7220>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.765957
3,C1=CC(=C(C=C1Br)C#N)CN(C)[C@H]2CCNC2,<rdkit.Chem.rdchem.Mol object at 0x1518f5ee0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.7
4,CN(C)[C@H]1CCNC1C2=CC(=C(C=C2)C#N)C3=CC(=C(C=C...,<rdkit.Chem.rdchem.Mol object at 0x1518f6500>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.690909


In [313]:
# Draw 35 rows at random to serve as training examples. No seed is passed to
# sample(), so the selection is not fixed by this cell.
df = df_buchwald_hardwig_smiles.copy()
df_training_data_smiles = df_buchwald_hardwig_smiles.sample(35)
df_training_data_smiles

Unnamed: 0,A,B,Y
145,CC1=NN(C(=C1)C2CN(CC3=C(O2)N=C(C=C3)Cl)CC#N)C,CC1=CN(C=N1)C2=C(C=C(C=C2)N)OC,CC1=NN(C(=C1)C2CN(CC3=C(O2)N=C(C=C3)NC4=CC(=C(...
184,CN1C[C@H](OC2=C(C1)C=CC(=N2)Cl)C3=CC=CC=C3,CC1=CN(C=N1)C2=CN=C(C=C2)N,CC1=CN(C=N1)C2=CN=C(C=C2)NC3=NC4=C(CN(C[C@H](O...
170,COC1=CC(=CC=C1)Br,C1CNCCC1C#N,COC1=CC=CC(=C1)N2CCC(CC2)C#N
46,CC1=NN(C=C1NC2=NC=C(C(=C2)Cl)C#N)C,CN1CCC2=C(C1=O)C(=CC=C2)N,CC1=NN(C=C1NC2=NC=C(C(=C2)NC3=CC=CC4=C3C(=O)N(...
76,C1=CC(=C(C=C1Cl)Cl)I,C1CNCCC1CO,C1CN(CCC1CO)C2=C(C=C(C=C2)Cl)Cl
288,C1=CC(=C(C=C1F)C(F)(F)F)Br,C[C@@H]1CNCCN1,C[C@@H]1CN(CCN1)C2=C(C=C(C=C2)F)C(F)(F)F
201,C1=CC(=CC=C1F)Br,C1CN(CCN1)CC2=CC=CC=C2,C1CN(CCN1CC2=CC=CC=C2)C3=CC=C(C=C3)F
248,C1=C(C=NC=C1Br)C(F)(F)F,CC(C)(C)OC(=O)N1C[C@@H]2C[C@H]1CN2,CC(C)(C)OC(=O)N1CC2CC1CN2C3=CN=CC(=C3)C(F)(F)F
292,C1=CC(=CC=C1Br)Br,CC1CNC1,CC1CN(C1)C2=CC=C(C=C2)Br
366,C1=CC2=NC=CN2C=C1Br,C1COCCN1,C1COCCN1C2=CN3C=CN=C3C=C2


In [None]:
# NOTE(review): `f` is not assigned anywhere in the visible source, so this
# line raises NameError if run; the cell is marked "In [None]" (never
# executed) and looks like an unfinished stub.
li_variables_A = [f]

In [None]:
# Few-shot section of the prompt: a header line followed by one "A / B / Y"
# stanza per row of df_training_data_smiles, with a blank line between
# stanzas. Real newline characters are used so that each field sits on its
# own line, and the text adapts to however many rows were sampled.
training_dataset = (
    "This is training dataset (A + B → Y):\n"
    + "\n\n".join(
        f"A: {a}\nB: {b}\nY: {y}"
        for a, b, y in zip(
            df_training_data_smiles["A"],
            df_training_data_smiles["B"],
            df_training_data_smiles["Y"],
        )
    )
    + "\n"
)