### ORD (Open Reaction Database) の全データセットから反応データを取得する検証用 Jupyter Notebook

ライブラリのインポート

In [2]:
import pandas as pd
import numpy as np

from ord_schema import message_helpers, validations
from ord_schema.proto import dataset_pb2
from rdkit import rdBase, Chem, DataStructs
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem import AllChem, Draw, rdMHFPFingerprint, rdChemReactions
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem.AtomPairs import Pairs, Torsions

In [3]:
# In the end only the SMILES of A, B and Y are used.
# Most of the data is patent data, and patent entries already contain a
# reaction SMILES, so rather than building a DataFrame this notebook checks
# whether the reaction SMILES can be taken from the raw data in few steps.

# Read the ORD dataset IDs from the csv file.
dataset_ids = pd.read_csv("OpenReactionData_IDs.csv").loc[:, "Dataset ID"]

Pandasによるデータの取得

In [4]:
# Disabled cell: the whole extraction loop below is wrapped in a string
# literal, so none of it is executed; the cell only echoes the string.
# What the disabled code does: for every dataset ID it loads the matching
# .pb.gz file, obtains a reaction SMILES for each reaction, keeps only the
# reactions with exactly two reactant templates and one product template,
# and collects their SMILES into the columns A, B and Y of df_ABY.
"""
# ABYそれぞれの化合物SMILESを格納するリストを用意する
A = []
B = []
Y = []

#　各dataset idに対してディレクトリを走査して生データを取得する
for i in range(len(dataset_ids)):

    # dataset id Seriesからデータセットidを取ってくる
    id = dataset_ids[i] 

    # dataset idの'ord_dataset-'後2文字を取得する
    o = len('ord_dataset-') 
    dir_num = id[o:o+2]

    # dataset idのディレクトリへ移動
    # 生データを取得する
    pb = f"./ord_datasets/ord-data/data/{dir_num}/{id}.pb.gz"  
    data = message_helpers.load_message(pb, dataset_pb2.Dataset)
    subdataset = data.reactions
    
    # 各データセット中の各反応に対してReaction SMILESを取得する
    for j in range(len(subdataset)):
        rxn = data.reactions[j]
        rxn_smiles = message_helpers.get_reaction_smiles(rxn,
                                                         generate_if_missing = True,
                                                        )
           
        # Reaction SMILESから原料と生成物の数を取得する
        rxn_mol = AllChem.ReactionFromSmarts(rxn_smiles)
        numRP = rxn_mol.GetNumReactantTemplates(),\
                rxn_mol.GetNumProductTemplates()

        # (原料数, 生成物数) == (2,1) の場合のみABY SMILESを取得してリストへ貼り付ける
        if numRP == (2,1):            

            reactants = rxn_mol.GetReactants()
            A.append(Chem.MolToSmiles(reactants[0]))
            B.append(Chem.MolToSmiles(reactants[1]))

            product = rxn_mol.GetProducts()
            Y.append(Chem.MolToSmiles(product[0]))

#　それぞれのリストをデータフレームへまとめる
df_ABY = pd.DataFrame({"A":A, "B":B, "Y":Y})
"""

'\n# ABYそれぞれの化合物SMILESを格納するリストを用意する\nA = []\nB = []\nY = []\n\n#\u3000各dataset idに対してディレクトリを走査して生データを取得する\nfor i in range(len(dataset_ids)):\n\n    # dataset id Seriesからデータセットidを取ってくる\n    id = dataset_ids[i] \n\n    # dataset idの\'ord_dataset-\'後2文字を取得する\n    o = len(\'ord_dataset-\') \n    dir_num = id[o:o+2]\n\n    # dataset idのディレクトリへ移動\n    # 生データを取得する\n    pb = f"./ord_datasets/ord-data/data/{dir_num}/{id}.pb.gz"  \n    data = message_helpers.load_message(pb, dataset_pb2.Dataset)\n    subdataset = data.reactions\n    \n    # 各データセット中の各反応に対してReaction SMILESを取得する\n    for j in range(len(subdataset)):\n        rxn = data.reactions[j]\n        rxn_smiles = message_helpers.get_reaction_smiles(rxn,\n                                                         generate_if_missing = True,\n                                                        )\n           \n        # Reaction SMILESから原料と生成物の数を取得する\n        rxn_mol = AllChem.ReactionFromSmarts(rxn_smiles)\n        numRP = rxn_mol.GetNumRea

In [5]:
# Disabled cell: the code below is a string literal and is not executed.
# It would drop duplicate rows from df_ABY, renumber the index, and write the
# result to ord_datasets.csv without the index column.
"""
df_ABY2 = df_ABY.drop_duplicates()\
                .reset_index()\
                .drop(columns=["index"])

df_ABY2.to_csv('./ord_datasets/ord_datasets_csv/ord_datasets.csv',
              index=False)
"""


'\ndf_ABY2 = df_ABY.drop_duplicates()                .reset_index()                .drop(columns=["index"])\n\ndf_ABY2.to_csv(\'./ord_datasets/ord_datasets_csv/ord_datasets.csv\',\n              index=False)\n'

In [6]:
# Rejected approach.
# Defines a function that loads the data for a dataset_id and writes it to a csv file.
# However, it stalls partway through, so this approach cannot be used.
#
# NOTE(review): everything below is a string literal and is never executed.
# NOTE(review): inside the disabled function, the `in` test checks whether the
# csv file name is a substring of the directory path string. That path contains
# no ".csv", so the test can never be true and the branch that loads and
# exports the data is unreachable as written.
'''
def get_ord_data(dataset_ids):
    #idからディレクトリを取得する
    if f"{dataset_ids}.csv" in "./ord_datasets/ord_datasets_csv/":
        dir_name = dataset_ids[len('ord_dataset-'):len('ord_dataset-')+2]
        pb = f"./ord_datasets/ord-data/data/{dir_name}/{dataset_ids}.pb.gz"  
        data = message_helpers.load_message(pb, dataset_pb2.Dataset) 
        df = message_helpers.messages_to_dataframe(data.reactions, \
                                            drop_constant_columns=False)
        df.to_csv(f'./ord_datasets/ord_datasets_csv/{dataset_ids}.csv', index=False)

    else:
        pass
    
dataset_ids.apply(lambda dataset_id: get_ord_data(dataset_id))

    #message_helpers.find_submessages(data)
    #reaction_smiles = message_helpers.get_reaction_smiles(pb,
                                                        #generate_if_missing = True,
                                                        #)
    #df = message_helpers.messages_to_dataframe(data.reactions, \
                                            #drop_constant_columns=False)
    #df.to_csv(f'./ord_datasets/ord_datasets_csv/{id}.csv', index=False)


from rdkit import Chem
from rdkit.Chem import AllChem

reaction_smiles = "[C:1](Cl)(=O)C.[F:5][C:6]1[CH:7]=[C:8]([CH:12]=[CH:13][C:14]=1[N+:15]([O-:17])=[O:16])[C:9]([OH:11])=[O:10]>CO>[F:5][C:6]1[CH:7]=[C:8]([CH:12]=[CH:13][C:14]=1[N+:15]([O-:17])=[O:16])[C:9]([O:11][CH3:1])=[O:10]"

# RDKitの反応SMILESを読み込む
reaction = AllChem.ReactionFromSmarts(reaction_smiles)

# 反応物のSMILESを抽出
reactant_smiles = [Chem.MolToSmiles(reactant) for reactant in reaction.GetReactants()]

# 生成物のSMILESを抽出
product_smiles = [Chem.MolToSmiles(product) for product in reaction.GetProducts()]

# 結果を出力
print("反応物のSMILES:", reactant_smiles)
print("生成物のSMILES:", product_smiles)
reactant = reaction.GetReactants()
type(reactant[0])
    
'''

'\ndef get_ord_data(dataset_ids):\n    #idからディレクトリを取得する\n    if f"{dataset_ids}.csv" in "./ord_datasets/ord_datasets_csv/":\n        dir_name = dataset_ids[len(\'ord_dataset-\'):len(\'ord_dataset-\')+2]\n        pb = f"./ord_datasets/ord-data/data/{dir_name}/{dataset_ids}.pb.gz"  \n        data = message_helpers.load_message(pb, dataset_pb2.Dataset) \n        df = message_helpers.messages_to_dataframe(data.reactions,                                             drop_constant_columns=False)\n        df.to_csv(f\'./ord_datasets/ord_datasets_csv/{dataset_ids}.csv\', index=False)\n\n    else:\n        pass\n    \ndataset_ids.apply(lambda dataset_id: get_ord_data(dataset_id))\n\n    #message_helpers.find_submessages(data)\n    #reaction_smiles = message_helpers.get_reaction_smiles(pb,\n                                                        #generate_if_missing = True,\n                                                        #)\n    #df = message_helpers.messages_to_dataframe(data.reactions,

NumPyによるデータの取得

データ取得に関しては一旦やめる

In [19]:
# Turn the dataset_ids Series into an ndarray, then show its type and the
# first five IDs (one item per output line).
nIDs = np.array(dataset_ids)
print(type(nIDs), nIDs[:5], sep="\n")

<class 'numpy.ndarray'>
['ord_dataset-00005539a1e04c809a9a78647bea649c'
 'ord_dataset-018fd0e1351f4fd09b20fcddd97b4c7a'
 'ord_dataset-01dbb772c5e249108f0b191ed17a2c0c'
 'ord_dataset-02ee2261663048188cf6d85d2cc96e3f'
 'ord_dataset-0387783899c642a8b7eb4ba379bcdf5d']


In [20]:
# Would a list comprehension be faster?
#
# NOTE(review): everything below is a string literal and is never executed.
# NOTE(review): the disabled function returns `rxn_smiles`, a name it never
# assigns (the array it builds is `nd_RXNSmiles`), and the ufunc wrapper is
# built from `all_data_1`, while the function defined here is `np_alldata`
# and no `all_data_1` is defined in the notebook as shown. Both would need
# fixing before this code is re-enabled.
"""
def np_alldata(id):

    # dataset idの'ord_dataset-'後2文字を取得する
    o = len('ord_dataset-')
    dir_num = id[o:o+2]

    # dataset idのディレクトリへ移動
    # 生データを取得する
    pb = f"./ord_datasets/ord-data/data/{dir_num}/{id}.pb.gz"  
    data = message_helpers.load_message(pb, dataset_pb2.Dataset)

    # 取得した生データをndarray配列へ変換
    nd_subdata = np.array(data.reactions)
    
    # 各データセット中の各反応に対してReaction SMILESを取得する
    nd_RXNData = np.array([rxndata for rxndata in nd_subdata])
    nd_RXNSmiles = np.array([message_helpers.get_reaction_smiles(rxn,
                                                                 generate_if_missing = True,
                                                                ) for rxn in nd_RXNData])
    
    
    return rxn_smiles


df_ABY = pd.DataFrame({"A":A, "B":B, "Y":Y})

new_all_data_1 = np.frompyfunc(all_data_1, 1, 1)


nsubdataset = new_all_data_1(nIDs)
"""


'\ndef np_alldata(id):\n\n    # dataset idの\'ord_dataset-\'後2文字を取得する\n    o = len(\'ord_dataset-\')\n    dir_num = id[o:o+2]\n\n    # dataset idのディレクトリへ移動\n    # 生データを取得する\n    pb = f"./ord_datasets/ord-data/data/{dir_num}/{id}.pb.gz"  \n    data = message_helpers.load_message(pb, dataset_pb2.Dataset)\n\n    # 取得した生データをndarray配列へ変換\n    nd_subdata = np.array(data.reactions)\n    \n    # 各データセット中の各反応に対してReaction SMILESを取得する\n    nd_RXNData = np.array([rxndata for rxndata in nd_subdata])\n    nd_RXNSmiles = np.array([message_helpers.get_reaction_smiles(rxn,\n                                                                 generate_if_missing = True,\n                                                                ) for rxn in nd_RXNData])\n    \n    \n    return rxn_smiles\n\n\ndf_ABY = pd.DataFrame({"A":A, "B":B, "Y":Y})\n\nnew_all_data_1 = np.frompyfunc(all_data_1, 1, 1)\n\n\nnsubdataset = new_all_data_1(nIDs)\n'

取得したSMILESからFingerPrintを生成

こちらはNumPyで行いたい

In [9]:
# Load the de-duplicated A/B/Y SMILES table and convert it to an ndarray.
# The three SMILES columns are selected by name rather than by position
# (the former `iloc[:, 1:4]`): the positional slice is only correct when the
# csv carries an extra leading index column, and it silently drops column A
# when the file was written with `index=False`.
df_ABY2 = pd.read_csv('./ord_datasets/ord_datasets_csv/ord_datasets.csv', index_col=None)
nd_Smiles = df_ABY2.loc[:, ["A", "B", "Y"]].values
nd_Smiles.shape

(335280, 3)

In [33]:
# Convert every SMILES string into a Mol object.

# Wrap Chem.MolFromSmiles with np.frompyfunc (1 input, 1 output) so that it can
# be called on the whole SMILES array at once; the result has the same shape
# as nd_Smiles.
uf_MolFromSmiles = np.frompyfunc(Chem.MolFromSmiles, 1,1)
nd_Mol = uf_MolFromSmiles(nd_Smiles)
nd_Mol.shape

[12:58:27] Explicit valence for atom # 14 Cl, 2, is greater than permitted
[12:58:27] Explicit valence for atom # 14 Cl, 2, is greater than permitted
[12:58:35] Explicit valence for atom # 0 N, 5, is greater than permitted
[12:58:35] Explicit valence for atom # 0 Br, 2, is greater than permitted
[12:58:37] Explicit valence for atom # 0 Cl, 7, is greater than permitted
[12:58:45] Explicit valence for atom # 1 C, 5, is greater than permitted
[12:58:45] Explicit valence for atom # 4 Cl, 3, is greater than permitted
[12:58:45] Explicit valence for atom # 1 Cl, 7, is greater than permitted
[12:58:46] Explicit valence for atom # 0 N, 5, is greater than permitted
[12:58:46] Explicit valence for atom # 0 N, 5, is greater than permitted
[12:58:46] Explicit valence for atom # 0 H, 2, is greater than permitted
[12:58:47] Explicit valence for atom # 0 H, 2, is greater than permitted
[12:58:47] Explicit valence for atom # 0 N, 5, is greater than permitted
[12:58:48] Explicit valence for atom # 0 H,

(335280, 3)

KeyboardInterrupt: 

[13:00:58] Explicit valence for atom # 14 Cl, 2, is greater than permitted
[13:00:58] Explicit valence for atom # 14 Cl, 2, is greater than permitted
[13:01:05] Explicit valence for atom # 0 N, 5, is greater than permitted
[13:01:06] Explicit valence for atom # 0 Br, 2, is greater than permitted
[13:01:07] Explicit valence for atom # 0 Cl, 7, is greater than permitted
[13:01:15] Explicit valence for atom # 1 C, 5, is greater than permitted
[13:01:15] Explicit valence for atom # 4 Cl, 3, is greater than permitted
[13:01:16] Explicit valence for atom # 1 Cl, 7, is greater than permitted
[13:01:16] Explicit valence for atom # 0 N, 5, is greater than permitted
[13:01:16] Explicit valence for atom # 0 N, 5, is greater than permitted
[13:01:16] Explicit valence for atom # 0 H, 2, is greater than permitted
[13:01:18] Explicit valence for atom # 0 H, 2, is greater than permitted
[13:01:18] Explicit valence for atom # 0 N, 5, is greater than permitted
[13:01:19] Explicit valence for atom # 0 H,

(335280, 3)

In [35]:
# As the errors above show, a SMILES that fails to convert yields None instead
# of a Mol object. Remove every row that contains at least one None.
#
# A boolean row mask is used instead of rebuilding the array from a Python
# list: the result keeps its 2-D (rows, 3) shape even when no row survives,
# and the same mask (valid_rows) can index nd_Smiles to pick the matching
# original SMILES rows.
nd_Mol_copy = nd_Mol  # alias of nd_Mol (not a copy); name kept for existing references
uf_IsNone = np.frompyfunc(lambda mol: mol is None, 1, 1)
valid_rows = ~uf_IsNone(nd_Mol).astype(bool).any(axis=1)
nd_Mol2 = nd_Mol[valid_rows]
nd_Mol2.shape

(335250, 3)

In [36]:
# Convert every Mol object into a MACCS-keys fingerprint.
# Original note: a Mol may be None because of a structure error.
# NOTE(review): the input here is nd_Mol2, from which the rows containing None
# were already removed, so that case should not occur at this point.

# Wrap the fingerprint function with np.frompyfunc (1 input, 1 output) and call
# it on the whole Mol array.
u_GetMACCSFps = np.frompyfunc(AllChem.GetMACCSKeysFingerprint, 1, 1)
nd_MACCSFps = u_GetMACCSFps(nd_Mol2)
nd_MACCSFps.shape


In [42]:
# Regenerate SMILES from nd_Mol2 (the array with the invalid rows removed) so
# that the SMILES rows correspond to the rows of the MACCS fingerprints.
uf_MolToSmiles = np.frompyfunc(Chem.MolToSmiles, 1,1)
nd_Smiles2 = uf_MolToSmiles(nd_Mol2)
# Columns: A and B are the two reactants, Y is the product.
df_Smiles = pd.DataFrame(nd_Smiles2,columns=["A", "B", "Y"])
df_Smiles


Unnamed: 0,A,B,Y
0,CC(C)N1CCNCC1,CCOC(=O)c1cnc2cc(OCC)c(Br)cc2c1Nc1ccc(F)cc1F,CCOC(=O)c1cnc2cc(OCC)c(N3CCN(C(C)C)CC3)cc2c1Nc...
1,Ic1ccccc1,COC(=O)c1cc2c(ncn2C)c(F)c1N,COC(=O)c1cc2c(ncn2C)c(F)c1Nc1ccccc1
2,Nc1ccc(S(N)(=O)=O)cc1,Cc1ccc(Oc2ccnc(Cl)c2)c(C)n1,Cc1ccc(Oc2ccnc(Nc3ccc(S(N)(=O)=O)cc3)c2)c(C)n1
3,Brc1cncc(Br)c1,CC(=O)N1CCNCC1,CC(=O)N1CCN(c2cncc(Br)c2)CC1
4,Nc1ccc(-n2ccnc2)cc1,CN1Cc2ccc(Cl)nc2OC(c2ccccc2)C1,CN1Cc2ccc(Nc3ccc(-n4ccnc4)cc3)nc2OC(c2ccccc2)C1
...,...,...,...
335245,CC(C)(C)OC(=O)[N:8]1[CH2:9][C@H:10]2[CH2:11][C...,[Cl:46][c:47]1[c:48]([CH2:49][NH:50][CH:51]2[C...,[NH:8]1[CH2:9][C@H:10]2[CH2:11][C:12]([c:27]3[...
335246,CC(C)(C)OC(=O)[N:8]1[CH2:9][C@H:10]2[CH2:11][C...,[Cl:46][c:47]1[c:48]([CH2:49][NH:50][CH:51]2[C...,[NH:8]1[CH2:9][C@H:10]2[CH2:11][C:12]([c:27]3[...
335247,CC(C)(C)OC(=O)[N:8]1[CH2:9][C@H:10]2[CH2:11][C...,[CH:46]1([NH:49][CH2:50][c:51]2[c:52]([CH3:61]...,[NH:8]1[CH2:9][C@H:10]2[CH2:11][C:12]([c:27]3[...
335248,CC(C)(C)OC(=O)[N:8]1[CH2:9][C@H:10]2[CH2:11][C...,[CH:46]1([NH:49][CH2:50][c:51]2[c:52]([CH3:62]...,[NH:8]1[CH2:9][C@H:10]2[CH2:11][C:12]([c:27]3[...


In [48]:
# Put nd_MACCSFps into a DataFrame too and export it together with df_Smiles.
df_MACCSFps = pd.DataFrame(nd_MACCSFps,columns=["maccs_A", "maccs_B", "maccs_Y"])
df_SmilesMACCSFps = pd.concat([df_Smiles, df_MACCSFps], axis=1)

# The fingerprint columns hold RDKit bit-vector objects. to_csv stringifies
# each cell, which for such objects gives the generic object repr rather than
# the bits (NOTE(review): confirm on the installed RDKit version), so the
# exported file would not contain usable fingerprints. Only for the export,
# each fingerprint is replaced by its "0"/"1" bit string; the in-memory frames
# above still hold the original fingerprint objects.
maccs_as_text = {
    column: df_MACCSFps[column].map(lambda fp: fp.ToBitString())
    for column in df_MACCSFps.columns
}
df_SmilesMACCSFps.assign(**maccs_as_text).to_csv("./ord_datasets/ord_datasets_csv/ord_SmilesMACCSFps.csv")

In [39]:
# The ufunc-based approach above is overwhelmingly faster.
# The nested list-comprehension alternative below is kept only for comparison;
# it is a string literal and is not executed.
"""
nd_MACCSFps2 = np.array([[AllChem.GetMACCSKeysFingerprint(mol) for mol in row] for row in nd_Mol2])
nd_MACCSFps2
"""

KeyboardInterrupt: 

In [25]:
# Check a single SMILES that fails to parse: Chem.MolFromSmiles gives back None
# for it (printed below) and RDKit logs a SMILES parse error.
# NOTE(review): presumably the leading `[N2:1]` atom is what the parser
# rejects; confirm before correcting the source data.
smiles = "[N2:1][C:2]1=[CH:30][C:6]2=[C:5]([CH:4]=[CH:3]1)[N:9]=[CH:8][N:7]2[CH2:10][CH2:11][CH2:12][CH2:13][N:14]1[CH2:15][CH2:16][N:17]([C:20]2=[CH:21][C:22]3=[C:23]([O:24][CH2:25][CH2:26][O:27]3)[CH:28]=[CH:29]2)[CH2:18][CH2:19]1"
mol = Chem.MolFromSmiles(smiles)
print(mol)

None


[12:52:13] SMILES Parse Error: syntax error while parsing: [N2:1][C:2]1=[CH:30][C:6]2=[C:5]([CH:4]=[CH:3]1)[N:9]=[CH:8][N:7]2[CH2:10][CH2:11][CH2:12][CH2:13][N:14]1[CH2:15][CH2:16][N:17]([C:20]2=[CH:21][C:22]3=[C:23]([O:24][CH2:25][CH2:26][O:27]3)[CH:28]=[CH:29]2)[CH2:18][CH2:19]1
[12:52:13] SMILES Parse Error: Failed parsing SMILES '[N2:1][C:2]1=[CH:30][C:6]2=[C:5]([CH:4]=[CH:3]1)[N:9]=[CH:8][N:7]2[CH2:10][CH2:11][CH2:12][CH2:13][N:14]1[CH2:15][CH2:16][N:17]([C:20]2=[CH:21][C:22]3=[C:23]([O:24][CH2:25][CH2:26][O:27]3)[CH:28]=[CH:29]2)[CH2:18][CH2:19]1' for input: '[N2:1][C:2]1=[CH:30][C:6]2=[C:5]([CH:4]=[CH:3]1)[N:9]=[CH:8][N:7]2[CH2:10][CH2:11][CH2:12][CH2:13][N:14]1[CH2:15][CH2:16][N:17]([C:20]2=[CH:21][C:22]3=[C:23]([O:24][CH2:25][CH2:26][O:27]3)[CH:28]=[CH:29]2)[CH2:18][CH2:19]1'
