The construction of test-set data

In [8]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem.Descriptors import MolWt,MolLogP,HeavyAtomCount
from rdkit import RDConfig
from rdkit.Chem import ChemicalFeatures
from rdkit.Chem.Pharm2D.SigFactory import SigFactory
from rdkit.Chem.Pharm2D import Generate
from tqdm import tqdm
from multiprocessing import Pool

In [3]:
import fastparquet
from fastparquet import ParquetFile
# Path of the parquet file that serves as the source of the test set.
filename = 'data/validation-00000-of-00001-9368b7243ba1bff8.parquet'
# Handle on that parquet file; it is converted to a DataFrame via `pf.to_pandas()` in a later cell.
pf = ParquetFile(filename)

In [5]:
# Test set: materialise the parquet file as a DataFrame and display it.
dF = pf.to_pandas()
dF

Unnamed: 0,smiles
0,Cc1cccc(C[NH2+]Cc2ccc(OC(F)(F)F)cc2)c1
1,CCC(CBr)(CBr)NS(=O)(=O)c1ccc(Br)cc1Cl
2,NC(=O)N(CCCC[NH+]1CCCN(c2cccc3c2OCCCC3)CC1)c1c...
3,O=C(NCCc1nc(-c2ccccc2)cs1)C1[NH2+]CCc2[nH]cnc21
4,CCOC(CCC=CC(=O)NO)C(OC(=O)Nc1ccc(C)cc1)c1ccccc...
...,...
999991,COc1cccc(OC)c1C[NH+]1CCc2nnc(C(C)NC(=O)c3ccco3...
999992,Nc1nc2cc(Br)cnc2n1-c1cccc(Cl)c1F
999993,COc1cc2c(cc1C=C1C(=O)NC(=S)N(c3ccc(C)cc3C)C1=O...
999994,CCn1nc(C)c(Cl)c1CC([NH3+])c1ccc(Br)o1


In [9]:
# Training set: read the CSV at `df_ok_path` and display it.
# NOTE(review): `data_path` is not referenced anywhere else in the visible
# notebook — confirm whether it is still needed.
data_path = 'data/pubchem10M_filter.txt'
df_ok_path = 'data/pubchem/pubchem_filter1.csv'
df_ok = pd.read_csv(df_ok_path)
df_ok

Unnamed: 0,SMILES,STATE,MW,LogP,HeavyAtomCount
0,CN(c1ccccc1)c1ccccc1C(=O)NCC1(O)CCOCC1,OK,340.423,2.72580,25
1,CC[NH+](CC)C1CCC([NH2+]C2CC2)(C(=O)[O-])C1,OK,241.355,-2.32190,17
2,COCC(CNC(=O)c1ccc2c(c1)NC(=O)C2)OC,OK,278.308,0.57240,20
3,OCCn1cc(CNc2cccc3c2CCCC3)nn1,OK,272.352,1.76130,20
4,O=C(NCc1ccc(F)cc1)N1CC=C(c2c[nH]c3ccccc23)CC1,OK,349.409,4.30590,26
...,...,...,...,...,...
8495820,CC(=O)C(C)Cc1ccc2c(c1)NC(=O)CO2,OK,233.267,1.78510,17
8495821,O=C(Cn1cc(C=C2NC(=O)N(Cc3ccccc3F)C2=O)c2ccccc2...,OK,468.488,4.51200,35
8495822,COc1cc(C(F)(F)F)cc(n2c(C)nc(C#Cc3ccnc(Cl)c3)c2...,OK,405.807,4.96474,28
8495823,O=C(NCc1ccccc1)N1CCC2(CC1)OCCc1c2[nH]c2ccccc12,OK,375.472,3.94150,28


In [13]:
# Keep only the SMILES column of the training table and rename it to `smiles`,
# the column name used by the test-set frame `dF`.
df1 = df_ok[['SMILES']].rename(columns={'SMILES':'smiles'})
df1

Unnamed: 0,smiles
0,CN(c1ccccc1)c1ccccc1C(=O)NCC1(O)CCOCC1
1,CC[NH+](CC)C1CCC([NH2+]C2CC2)(C(=O)[O-])C1
2,COCC(CNC(=O)c1ccc2c(c1)NC(=O)C2)OC
3,OCCn1cc(CNc2cccc3c2CCCC3)nn1
4,O=C(NCc1ccc(F)cc1)N1CC=C(c2c[nH]c3ccccc23)CC1
...,...
8495820,CC(=O)C(C)Cc1ccc2c(c1)NC(=O)CO2
8495821,O=C(Cn1cc(C=C2NC(=O)N(Cc3ccccc3F)C2=O)c2ccccc2...
8495822,COc1cc(C(F)(F)F)cc(n2c(C)nc(C#Cc3ccnc(Cl)c3)c2...
8495823,O=C(NCc1ccccc1)N1CCC2(CC1)OCCc1c2[nH]c2ccccc12


In [None]:
# Remove from the test set every molecule that also occurs in the training set.
#
# The previous `pd.concat([dF, df1, df1]).drop_duplicates(keep=False)` idiom
# had two problems: a test molecule that was merely repeated inside `dF`
# (and absent from training) lost *all* of its copies, and the concat had to
# materialise the training frame twice. An explicit membership test avoids
# both; `drop_duplicates` then keeps the first copy of any repeated test
# molecule. The surviving rows keep their original `dF` index labels.
in_training = dF['smiles'].isin(df1['smiles'])
set_diff_df = dF.loc[~in_training].drop_duplicates(subset='smiles')
print(set_diff_df)

                                                   smiles
3         O=C(NCCc1nc(-c2ccccc2)cs1)C1[NH2+]CCc2[nH]cnc21
6       COc1cc(C2C(c3ccc(Cl)cc3)=C(O)C(=O)N2c2ccc(Cl)c...
9                      O=C([O-])c1cnc(-c2cc(F)ccc2F)cc1Cl
22               CC1=Cc2ccccc2C1C[SiH2]CC1C(C)=Cc2ccccc21
24      Cc1ccc2ccccc2c1COc1ccc(C=C(C#N)C(=O)Nc2ccc(F)c...
...                                                   ...
999984  C=CCOC(=O)NCC(CNC(=O)NC(CNC(=O)OCC=C)CNC(=O)N(...
999988  CC(C)c1csc(CCNC(=O)C(=O)c2c(-c3ccccc3)cc3ccccn...
999990  CC(C)(C)NC(=O)C(Cc1ccccc1)N(Cc1ccc(Br)cc1)C(=O...
999992                   Nc1nc2cc(Br)cnc2n1-c1cccc(Cl)c1F
999995    COc1cccc(-c2cc(=O)n3c4ccccc4n(Cc4ccccc4)c3n2)c1

[263316 rows x 1 columns]


In [17]:
def evaluate(smile):
    """Parse a SMILES string and compute its basic descriptors.

    Returns a five-element list ``[smile, state, MW, LogP, HeavyAtomCount]``.
    ``state`` is ``"OK"`` when ``Chem.MolFromSmiles`` yields a molecule;
    otherwise it is ``"INVALID"`` and the three descriptor slots hold the
    sentinel value ``-999``.
    """
    molecule = Chem.MolFromSmiles(smile)
    if molecule is not None:
        return [
            smile,
            "OK",
            MolWt(molecule),
            MolLogP(molecule),
            HeavyAtomCount(molecule),
        ]
    # Unparsable SMILES: flag the row and fill the descriptors with sentinels.
    return [smile, "INVALID"] + [-999] * 3

In [18]:
# Compute the descriptor row of every remaining test-set SMILES, with a progress bar.
smiles_list = set_diff_df['smiles'].to_list()
res = [evaluate(smi) for smi in tqdm(smiles_list, total=len(smiles_list))]

100%|██████████| 263316/263316 [02:54<00:00, 1510.98it/s]


In [19]:
# Turn the descriptor rows into a table (first five fields only) and name the columns.
df = pd.DataFrame(res).iloc[:, :5]
df.columns = ["SMILES", "STATE", "MW", "LogP", "HeavyAtomCount"]

# Drop the rows whose SMILES was flagged as INVALID by `evaluate`.
df = df.loc[~df['STATE'].str.contains('INVALID', na=False)]

print("num_mol_succeed:", len(df))

num_mol_succeed: 263316


In [21]:
# Keep only molecules whose descriptors fall inside the accepted windows
# (`Series.between` includes both end points by default).
mw_ok = df['MW'].between(12, 600)                 # molecular weight
logp_ok = df['LogP'].between(-7, 5)               # LogP
heavy_ok = df['HeavyAtomCount'].between(3, 50)    # heavy-atom count
df_ok_test = df[mw_ok & logp_ok & heavy_ok]
# The printed label is Chinese for "number of molecules meeting the requirements".
print("符合要求分子数：" , len(df_ok_test))
df_ok_test.head()

符合要求分子数： 114171


Unnamed: 0,SMILES,STATE,MW,LogP,HeavyAtomCount
0,O=C(NCCc1nc(-c2ccccc2)cs1)C1[NH2+]CCc2[nH]cnc21,OK,354.459,1.0527,25
2,O=C([O-])c1cnc(-c2cc(F)ccc2F)cc1Cl,OK,268.626,2.0437,18
5,COC(=O)C(CCSC)NC(=O)c1c(C)nn(-c2ccccc2)c1C,OK,361.467,2.51364,25
8,CCn1c(SC(C)C(=O)N2CCc3ccccc32)nnc1-c1cccs1,OK,384.53,4.0964,26
10,CC[NH2+]Cc1cc(C(=O)N2CC[NH+](C3CC(n4cc(-c5[nH]...,OK,556.597,0.4095,40


In [60]:
# Save the filtered test set as CSV, with a header row and without the index column.
df_ok_test.to_csv('data/transltation_test/pubchem_100k_filter.csv',index= False,header=True)

In [22]:
def getfp(smile):
    """Return ``[smile, bit, bit, ...]`` for the 2D pharmacophore fingerprint of ``smile``.

    The fingerprint is generated with the module-level ``sigFactory``, and the
    list holds the indices of its on bits after the SMILES itself.

    Returns ``None`` (so callers can filter the molecule out) when the SMILES
    cannot be parsed, when fingerprint generation raises ``IndexError``, or
    when the fingerprint has no on bits.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # Unparsable SMILES: skip it instead of handing None to the generator.
        return None
    try:
        fingerprint = Generate.Gen2DFingerprint(mol, sigFactory)
    except IndexError:
        return None
    on_bits = list(fingerprint.GetOnBits())
    if not on_bits:
        return None
    return [smile] + on_bits

# Build the signature factory used by `getfp`.
# The feature definitions come from RDKit's bundled 'BaseFeatures.fdef' file.
fdefName = os.path.join(RDConfig.RDDataDir,'BaseFeatures.fdef')
featFactory = ChemicalFeatures.BuildFeatureFactory(fdefName)
# Signatures are built from 2- and 3-point feature combinations.
sigFactory = SigFactory(featFactory,minPointCount=2,maxPointCount=3)
# Three bins: (0,2), (2,5) and (5,8) — presumably topological-distance ranges; confirm against the RDKit docs.
sigFactory.SetBins([(0,2),(2,5),(5,8)])
sigFactory.Init()
# Last expression of the cell: displays the signature size (2988 in the recorded output).
sigFactory.GetSigSize()

2988

In [23]:
# Fingerprint every filtered test-set molecule; `getfp` yields None on failure.
mols = df_ok_test['SMILES'].values.tolist()
fp = [getfp(smi) for smi in tqdm(mols, total=len(df_ok_test), desc='%d 监视进度:')]

# Discard the molecules for which no fingerprint was produced.
fp_filter_None = [entry for entry in fp if entry is not None]

# One molecule per line: the SMILES followed by its on-bit indices, every
# token followed by a single space (so each line ends with a trailing space).
with open('fp_list_20230718.txt', 'w') as f:
    for entry in fp_filter_None:
        f.write(''.join(str(token) + ' ' for token in entry))
        f.write('\n')

%d 监视进度:: 100%|██████████| 114171/114171 [11:32<00:00, 164.85it/s]


整合fp_list

In [25]:
from tqdm import tqdm

# Reload the saved fingerprint list. The file is read inside a context manager
# so the handle is released even if parsing fails part-way through.
with open('fp_list_20230718.txt', mode='r', encoding='UTF-8') as file:
    contents = file.readlines()

# Each line is "<SMILES> <bit> <bit> ... " — splitting on single spaces yields
# the SMILES first, then the bit indices as strings (plus a trailing empty
# token caused by the space that precedes the newline).
fp = []
for msg in tqdm(contents, desc='进度： %s'):
    fp.append(msg.strip('\n').split(' '))

进度： %s: 100%|██████████| 71009/71009 [00:00<00:00, 172216.53it/s]


In [None]:
def split(sm):
    '''
    function: Tokenise a SMILES string into space-separated words.
        Two-letter element symbols (Cl, Br, Si, Se, Na, ...), the charge
        tokens +2/+3/+4/-2/-3/-4 and three-character '%NN' ring-closure
        labels are kept together; every other character is its own word.
    input: A SMILES
    output: A string with space between words
    '''
    # Every two-character sequence that must stay a single token.
    two_char_tokens = {
        'Cl', 'Ca', 'Cu', 'Br', 'Be', 'Ba', 'Bi', 'Si', 'Se', 'Sr',
        'Na', 'Ni', 'Rb', 'Ra', 'Xe', 'Li', 'Al', 'As', 'Ag', 'Au',
        'Mg', 'Mn', 'Te', 'Zn', 'si', 'se', 'te', 'He', 'Kr', 'Fe',
        '+2', '+3', '+4', '-2', '-3', '-4',
    }
    tokens = []
    last = len(sm) - 1
    pos = 0
    # Scan up to (but not including) the final character, so a look-ahead of
    # one character is always available inside the loop.
    while pos < last:
        if sm[pos] == '%':
            width = 3
        elif sm[pos:pos + 2] in two_char_tokens:
            width = 2
        else:
            width = 1
        tokens.append(sm[pos:pos + width])
        pos += width
    # A final character that was not consumed by a multi-character token.
    if pos == last:
        tokens.append(sm[pos])
    return ' '.join(tokens)

In [28]:
pharm_corpus_path = 'data/transltation_test/pharm_corpus1.txt'
smi_corpus_path = 'data/transltation_test/smi_corpus1.txt'
smi_pharm_filepath = 'data/transltation_test/smi_pharm_corpus1.txt'
#from utils import split

# Write two line-aligned corpora: the fingerprint bit indices of each molecule
# (pharm corpus) and its tokenised SMILES (smi corpus).
with open(pharm_corpus_path, 'w') as f1, open(smi_corpus_path, 'w') as f2:
    for entry in tqdm(fp):
        if entry is None:
            continue
        sm = entry[0]
        # Join the bit tokens directly instead of stringifying the list and
        # stripping quotes, commas and brackets; the text is the same for the
        # bit tokens, without the risk of mangling other characters.
        word = ' '.join(str(bit) for bit in entry[1:])
        f1.write(word + '\n')
        f2.write(split(sm) + '\n')
print('Built pharm & smi corpus file!')

100%|██████████| 71009/71009 [00:03<00:00, 23448.30it/s]

Built pharm & smi corpus file!





In [53]:
# Extract the SMILES (first token of every saved fingerprint line) and write
# them one per line to 'smiles_20230718.txt'.
# NOTE: this rebinds `fp` to a plain list of SMILES strings; the cells below
# iterate over it in that form.
fp = []
# The input is opened first so a missing fingerprint file cannot leave behind
# a freshly truncated output; both handles are closed by the context manager.
with open('fp_list_20230718.txt', mode='r', encoding='UTF-8') as file, \
        open('smiles_20230718.txt', 'w') as f:
    contents = file.readlines()
    for msg in tqdm(contents):
        smiles = msg.strip('\n').split(' ')[0]
        fp.append(smiles)
        f.write(smiles + '\n')


100%|██████████| 71009/71009 [00:00<00:00, 232033.68it/s]


In [39]:
def get_inchi(smile):
    """Return ``"<smile> <InChI>"`` for a SMILES string.

    Returns ``None`` when the SMILES cannot be parsed, so that the None
    filter below (and the None check in the corpus-writing cell) can skip it
    instead of the conversion crashing on a missing molecule.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        return None
    inchi_str = Chem.MolToInchi(mol)
    return smile + ' ' + inchi_str


# `fp` holds the plain SMILES strings at this point.
inchi = [get_inchi(smi) for smi in tqdm(fp, total=len(fp), desc='%d 监视进度:')]
inchi_filter_None = [item for item in inchi if item is not None]

# One "<SMILES> <InChI>" pair per line.
with open('inchi_list_20230718.txt', 'w') as f:
    for line in inchi_filter_None:
        f.write(line + '\n')

%d 监视进度:: 100%|██████████| 71009/71009 [00:31<00:00, 2290.14it/s]


In [40]:
def split_inchi(inchi):
    '''
    function: Tokenise an InChI string into space-separated words.
        Kept together as single words:
          * the nine-character header starting with "InC" (e.g. "InChI=1S/"),
          * two-letter element symbols (Cl, Br, Si, Se, Na, ...),
          * the charge tokens +2 / +3 / +4,
          * two-digit numbers whose first digit is 1-4 (10 .. 49),
          * three-character '%NN' sequences.
        Every other character becomes its own word.
    input: A inchi
    output: A string with space between words
    '''
    # Two-character sequences that must stay a single token.
    two_char_tokens = {
        'Cl', 'Ca', 'Cu', 'Br', 'Be', 'Ba', 'Bi', 'Si', 'Se', 'Sr',
        'Na', 'Ni', 'Rb', 'Ra', 'Xe', 'Li', 'Al', 'As', 'Ag', 'Au',
        'Mg', 'Mn', 'Te', 'Zn', 'si', 'se', 'te', 'He', 'Kr', 'Fe',
        '+2', '+3', '+4',
    }
    leading_digits = '1234'
    any_digit = '0123456789'

    arr = []
    last = len(inchi) - 1
    i = 0
    # Stop one character early so inchi[i + 1] always exists inside the loop.
    while i < last:
        ch = inchi[i]
        if ch == '%':
            width = 3
        elif inchi[i:i + 2] in two_char_tokens:
            width = 2
        elif inchi.startswith('InC', i):
            # startswith() never reads past the end of the string. The former
            # inchi[i + 2] look-ahead raised IndexError for inputs ending in
            # "In", such as the indium atom "InChI=1S/In".
            width = 9
        elif ch in leading_digits and inchi[i + 1] in any_digit:
            width = 2
        else:
            width = 1
        arr.append(inchi[i:i + width])
        i += width
    # A final character that was not consumed by a multi-character token.
    if i == last:
        arr.append(inchi[i])
    return ' '.join(arr)

In [43]:
inchi_corpus_path = 'data/transltation_test/inchi_corpus1.txt'
smi_corpus_path = 'data/transltation_test/smi_corpus2.txt'
smi_inchi_filepath = 'data/transltation_test/smi_inchi_corpus1.txt'
#from utils import split

# Write two line-aligned corpora: tokenised InChI and tokenised SMILES.
with open(inchi_corpus_path, 'w') as f1, open(smi_corpus_path, 'w') as f2:
    for entry in tqdm(inchi):
        if entry is None:
            continue
        if isinstance(entry, str):
            # `get_inchi` returns one "<SMILES> <InChI>" string per molecule.
            # Indexing that string with [0] / [1] selected its first two
            # *characters*, so it is split on the first space instead.
            sm, _, word = entry.partition(' ')
        else:
            # Also accept an already separated (smiles, inchi) pair.
            sm, word = entry[0], entry[1]

        f1.write(split_inchi(word) + '\n')
        f2.write(split(sm) + '\n')
print('Built inchi & smi corpus file!')

100%|██████████| 71009/71009 [00:10<00:00, 7093.23it/s]

Built inchi & smi corpus file!





In [50]:
from pubchemfp import GetPubChemFPs
def GetPubChemFpBits(smiles):
    """Return ``[smiles, bit, bit, ...]`` with the on-bit indices of the PubChem fingerprint.

    The molecule is parsed from ``smiles``, hydrogens are added, and the first
    881 positions of the ``GetPubChemFPs`` result are scanned for set bits.

    Returns ``None`` when the SMILES cannot be parsed or when
    ``GetPubChemFPs`` raises, so callers can filter the molecule out.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparsable SMILES: nothing to fingerprint.
        return None
    mol2 = Chem.AddHs(mol)
    try:
        result = GetPubChemFPs(mol2)
    except Exception:
        # Best-effort: skip molecules the fingerprinter cannot handle, without
        # also swallowing KeyboardInterrupt / SystemExit as a bare except would.
        return None
    # `== True` is kept deliberately so the comparison semantics of whatever
    # array-like GetPubChemFPs returns stay unchanged.
    bit = [i for i in range(881) if result[i] == True]
    return [smiles] + bit

In [None]:
# PubChem fingerprint of every SMILES in `fp`; failures come back as None.
pubfp = list(tqdm(map(GetPubChemFpBits, fp), total=len(fp), desc='%d 监视进度:'))

fp_filter_None = [item for item in pubfp if item is not None]

# Write the file once, after filtering. The `with` block used to be nested
# inside the filtering loop, which re-opened and rewrote the entire file on
# every iteration (quadratic work for the same final content).
with open('pubfp_list_20230718.txt', 'w') as f:
    for entry in fp_filter_None:
        # SMILES followed by its bit indices, each token followed by a space.
        f.write(''.join(str(token) + ' ' for token in entry))
        f.write('\n')

In [55]:
pubchem_corpus_path = 'data/transltation_test/pubchemfp_corpus1.txt'
smi_corpus_path = 'data/transltation_test/smi_corpus3.txt'
smi_pubchem_filepath = 'data/transltation_test/smi_pubchemfp_corpus1.txt'
#from utils import split

# Write two line-aligned corpora: the PubChem fingerprint bit indices of each
# molecule and its tokenised SMILES.
with open(pubchem_corpus_path, 'w') as f1, open(smi_corpus_path, 'w') as f2:
    for entry in tqdm(pubfp):
        if entry is None:
            continue
        sm = entry[0]
        # Join the bit indices directly instead of stringifying the list and
        # stripping quotes, commas and brackets.
        word = ' '.join(str(bit) for bit in entry[1:])
        f1.write(word + '\n')
        f2.write(split(sm) + '\n')
# The message used to say "pharm", copied from the pharmacophore cell.
print('Built pubchemfp & smi corpus file!')

100%|██████████| 71009/71009 [00:03<00:00, 21856.72it/s]

Built pharm & smi corpus file!





In [56]:
pharm_corpus_path = 'data/transltation_test/pharm_corpus1.txt'
smi_corpus_path = 'data/transltation_test/smi_corpus1.txt'
smi_pharm_filepath = 'data/transltation_test/smi_pharm_inchi_corpus1.txt'
smi_inchi_filepath = 'data/transltation_test/smi_inchi_corpus1.txt'
pubchem_corpus_path = 'data/transltation_test/pubchemfp_corpus1.txt'
# Defined here explicitly: the cell used to depend on this name leaking from
# the InChI corpus cell, where it has the same value.
inchi_corpus_path = 'data/transltation_test/inchi_corpus1.txt'
from tqdm import tqdm
import pandas as pd


def _read_corpus_lines(path):
    """Return the lines of the text file at ``path`` with newline characters stripped from both ends."""
    with open(path, 'r') as handle:
        return [line.strip('\n') for line in tqdm(handle)]


# Load the four line-aligned corpora (one molecule per line in each).
s = _read_corpus_lines(smi_corpus_path)
pubfp = _read_corpus_lines(pubchem_corpus_path)
fp = _read_corpus_lines(pharm_corpus_path)
inchi = _read_corpus_lines(inchi_corpus_path)

# The columns are combined purely by row position, so corpora of different
# lengths would silently pair a molecule with another molecule's data.
corpus_sizes = {
    'smiles': len(s),
    'pharmfp': len(fp),
    'inchi': len(inchi),
    'pubchemfp': len(pubfp),
}
if len(set(corpus_sizes.values())) != 1:
    raise ValueError('corpus files are not line-aligned: %r' % (corpus_sizes,))

dfs = pd.DataFrame(s)
dffp = pd.DataFrame(fp)
dfinchi = pd.DataFrame(inchi)
dfpub = pd.DataFrame(pubfp)

# Side-by-side table: SMILES | pharmacophore fp | InChI | PubChem fp.
data = pd.concat([dfs, dffp, dfinchi, dfpub], axis=1, ignore_index=True)
data.columns = ['smiles', 'pharmfp', 'inchi', 'pubchemfp']
# Tab-separated, no header row and no index column.
data.to_csv(smi_pharm_filepath, index=False, sep='\t', header=False)
print('Built smi_pharm_inchi translation corpus file!')

71009it [00:00, 1438162.24it/s]
71009it [00:00, 581176.77it/s]
71009it [00:00, 757425.48it/s]
71009it [00:00, 1190338.21it/s]


Built smi_pharm_inchi translation corpus file!


In [57]:
# Display the combined table (columns: smiles, pharmfp, inchi, pubchemfp).
data

Unnamed: 0,smiles,pharmfp,inchi,pubchemfp
0,O = C ( N C C c 1 n c ( - c 2 c c c c c 2 ) c ...,1 2 4 5 7 8 10 11 13 19 20 24 26 28 29 30 31 3...,InChI=1S/ C 18 H 19 N 5 O S / c 24 - 18 ( 17 -...,0 1 2 9 10 14 15 18 143 145 146 150 152 153 17...
1,C O C ( = O ) C ( C C S C ) N C ( = O ) c 1 c ...,1 2 4 5 7 10 11 13 14 24 28 29 31 32 33 49 53 ...,InChI=1S/ C 18 H 23 N 3 O 3 S / c 1 - 12 - 16 ...,0 1 2 9 10 11 14 18 19 33 143 145 146 178 179 ...
2,C C n 1 c ( S C ( C ) C ( = O ) N 2 C C c 3 c ...,0 2 4 5 10 11 13 14 24 26 30 31 32 33 35 64 65...,InChI=1S/ C 19 H 20 N 4 O S 2 / c 1 - 3 - 22 -...,0 1 2 9 10 14 18 33 143 145 146 150 153 157 17...
3,C C [ N H 2 + ] C c 1 c c ( C ( = O ) N 2 C C ...,1 2 4 7 8 10 19 20 28 29 31 40 41 46 47 49 50 ...,InChI=1S/ C 26 H 29 F 3 N 10 O / c 1 - 2 - 30 ...,0 1 2 9 10 11 14 15 18 23 24 129 130 131 132 1...
4,O = c 1 c c c c ( Cl ) n 1 C c 1 n n n n 1 - c...,0 1 2 3 4 5 13 14 24 25 33 34 115 129 130 135 ...,InChI=1S/ C 13 H 10 Cl N 5 O / c 14 - 11 - 7 -...,0 1 18 37 143 145 146 178 179 180 181 185 255 ...
...,...,...,...,...
71004,C C ( = O ) N n 1 o c ( = O ) c ( - c 2 c c c ...,1 2 3 4 5 7 10 11 13 14 16 22 24 25 27 28 29 3...,InChI=1S/ C 19 H 17 N 3 O 6 / c 1 - 9 - 16 ( 1...,0 1 2 9 10 14 18 19 20 115 116 117 118 143 145...
71005,N c 1 c c c c ( - c 2 n c ( N ) c c ( N c 3 c ...,1 4 5 7 8 10 11 13 14 24 25 26 27 28 29 30 31 ...,InChI=1S/ C 16 H 14 Br N 5 / c 17 - 11 - 4 - 2...,0 1 14 15 43 178 179 180 181 185 186 192 255 2...
71006,C C ( = O ) c 1 s c ( - c 2 c c c s 2 ) c c 1 N,4 5 7 10 11 13 14 24 27 28 30 31 33 49 51 52 6...,InChI=1S/ C 10 H 9 N O S 2 / c 1 - 6 ( 12 ) 10...,0 1 9 14 18 143 146 150 153 255 256 257 258 28...
71007,C C ( C ) c 1 c s c ( C C N C ( = O ) C ( = O ...,1 2 4 5 7 10 11 13 14 24 25 26 28 29 30 31 32 ...,InChI=1S/ C 24 H 23 N 3 O 2 S / c 1 - 16 ( 2 )...,0 1 2 9 10 14 18 19 143 145 146 150 152 153 17...
