In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit import RDLogger    
RDLogger.DisableLog('rdApp.info')
from rdkit import Chem
from rdkit.Chem.Descriptors import MolWt,MolLogP,HeavyAtomCount
from rdkit import RDConfig
from rdkit.Chem import ChemicalFeatures
from rdkit.Chem.Pharm2D.SigFactory import SigFactory
from rdkit.Chem.Pharm2D import Generate
from tqdm import tqdm
from multiprocessing import Pool

In [2]:
# Input: raw SMILES file, read below as tab-separated text without a header row.
data_path = 'data/pubchem-10M.txt'
# Output: CSV that receives the molecules passing the property filter.
df_ok_path = 'data/pubchem/pubchem_filter1.csv'

In [3]:
# Load the raw file (no header row) and name its single column 'smiles'.
smiles = pd.read_csv(data_path,sep='\t',header=None)
smiles.columns = ['smiles']

In [4]:
def evaluate(smile):
    """Parse one SMILES string and compute its basic descriptors.

    Returns a five-item list ``[smile, state, MolWt, MolLogP, HeavyAtomCount]``.
    ``state`` is "OK" when RDKit parsed the string; otherwise it is "INVALID"
    and the three descriptor slots hold the sentinel value -999.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is not None:
        return [smile, "OK", MolWt(mol), MolLogP(mol), HeavyAtomCount(mol)]
    # Unparsable SMILES: keep the row shape and flag it for the later filter.
    return [smile, "INVALID", -999, -999, -999]

In [5]:
# Evaluate every SMILES sequentially; tqdm wraps the lazy map() to show progress,
# and list() drives the iteration.
smiles_list = smiles['smiles'].to_list()
res = list(tqdm(map(evaluate, smiles_list),total= len(smiles_list)))

100%|██████████| 9988594/9988594 [1:29:44<00:00, 1855.16it/s] 


In [6]:
# Build the descriptor table from the per-molecule result lists.
df = pd.DataFrame(res)
df = df.iloc[:,:5]  # keep the first five columns only

df.columns=["SMILES", "STATE","MW", "LogP","HeavyAtomCount"]
# Drop the rows whose STATE contains 'INVALID' (set by evaluate() for unparsable SMILES).
df=df[ ~ df['STATE'].str.contains('INVALID', na=False)]
print("num_mol_succeed:",len(df))

num_mol_succeed: 9988594


In [7]:
# Keep only the molecules inside all three property windows (bounds inclusive):
#   12 <= MW <= 600,  -7 <= LogP <= 5,  3 <= HeavyAtomCount <= 50
df_ok = df[
    df.MW.between(12, 600)
    & df.LogP.between(-7, 5)
    & df.HeavyAtomCount.between(3, 50)
]
print("符合要求分子数：" , len(df_ok))

# Create the destination folder if needed so to_csv does not fail on a fresh checkout.
out_dir = os.path.dirname(df_ok_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df_ok.to_csv(df_ok_path,index= False,header=True)

# Preview as the final expression of the cell so that it is actually rendered.
df_ok.head()

符合要求分子数： 8495825


In [3]:
# Reload the filtered molecules from the CSV written by the filter cell
# (presumably so the notebook can resume here after a kernel restart - confirm).
df_ok = pd.read_csv(df_ok_path)
df_ok

Unnamed: 0,SMILES,STATE,MW,LogP,HeavyAtomCount
0,CN(c1ccccc1)c1ccccc1C(=O)NCC1(O)CCOCC1,OK,340.423,2.72580,25
1,CC[NH+](CC)C1CCC([NH2+]C2CC2)(C(=O)[O-])C1,OK,241.355,-2.32190,17
2,COCC(CNC(=O)c1ccc2c(c1)NC(=O)C2)OC,OK,278.308,0.57240,20
3,OCCn1cc(CNc2cccc3c2CCCC3)nn1,OK,272.352,1.76130,20
4,O=C(NCc1ccc(F)cc1)N1CC=C(c2c[nH]c3ccccc23)CC1,OK,349.409,4.30590,26
...,...,...,...,...,...
8495820,CC(=O)C(C)Cc1ccc2c(c1)NC(=O)CO2,OK,233.267,1.78510,17
8495821,O=C(Cn1cc(C=C2NC(=O)N(Cc3ccccc3F)C2=O)c2ccccc2...,OK,468.488,4.51200,35
8495822,COc1cc(C(F)(F)F)cc(n2c(C)nc(C#Cc3ccnc(Cl)c3)c2...,OK,405.807,4.96474,28
8495823,O=C(NCc1ccccc1)N1CCC2(CC1)OCCc1c2[nH]c2ccccc12,OK,375.472,3.94150,28


In [4]:
# NOTE(review): `smiles` is rebound here from the raw SMILES frame to the filtered frame.
smiles = df_ok
len(smiles)  # bare mid-cell expression: evaluated but not displayed
# Split the rows into 10 roughly equal chunks; the fingerprint loop handles one chunk at a time.
data_split= np.array_split(smiles, 10)
len(data_split[0])  # size of the first chunk (shown as the cell output)

849583

In [None]:
#2D pharmacophore fingerprint
def getfp(smile):
    """Return ``[smile, bit, bit, ...]`` for the 2D pharmacophore fingerprint.

    The fingerprint is generated with the module-level ``sigFactory``; the
    result is the SMILES followed by the indices of the bits that are set.

    Returns ``None`` (the calling cell filters these out) when
      * RDKit cannot parse ``smile``,
      * fingerprint generation raises ``IndexError``, or
      * no bit is set.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # A parse failure yields None; skip the molecule instead of handing
        # None to the fingerprint generator (not covered by the IndexError
        # handler below).
        return None
    try:
        fp = Generate.Gen2DFingerprint(mol,sigFactory)
    except IndexError:
        return None
    on_bits = list(fp.GetOnBits())
    if not on_bits:
        return None
    return [smile] + on_bits

# Feature definition file BaseFeatures.fdef from RDKit's data directory.
fdefName = os.path.join(RDConfig.RDDataDir,'BaseFeatures.fdef')
featFactory = ChemicalFeatures.BuildFeatureFactory(fdefName)
# Signature factory read by getfp(): 2- to 3-point signatures over three distance bins.
sigFactory = SigFactory(featFactory,minPointCount=2,maxPointCount=3)
sigFactory.SetBins([(0,2),(2,5),(5,8)])
sigFactory.Init()
sigFactory.GetSigSize()  # signature size; displayed as the cell output

2988

In [None]:
#save the molecules with 2D pharmacophore fingerprint
# For each of the 10 chunks: fingerprint every SMILES, drop the molecules for
# which getfp() returned None, and write one text file per chunk.
# Line format: "<SMILES> <bit> <bit> ... " - every field, including the last
# one, is followed by a single space.
for i in range(10):
    mols = data_split[i]['SMILES'].values.tolist()
    fp = list(tqdm(map(getfp,mols),total=len(data_split[i]), desc='%d 监视进度:' %(i,)))

    fp_filter_None = [item for item in fp if item is not None]

    # The with-block closes the file on exit; the inner loops use their own
    # names so the chunk index `i` is no longer shadowed.
    with open('fp_list_1_%d.txt' %i,'w') as f:
        for row in fp_filter_None:
            f.write(''.join(str(field) + ' ' for field in row))
            f.write('\n')

0 监视进度:: 100%|██████████| 849583/849583 [1:06:26<00:00, 213.12it/s]
1 监视进度:: 100%|██████████| 849583/849583 [1:06:26<00:00, 213.09it/s]
2 监视进度::  39%|███▉      | 334654/849583 [26:07<46:22, 185.03it/s]  

In [None]:
# read the molecules
# Rebuild `fp` from the ten chunk files: one list of string fields per line,
# with the SMILES as the first field.
fp=[]
for i in range(10):
    # `with` guarantees the file is closed even if reading is interrupted.
    with open('fp_list_1_%d.txt' %i, mode='r',encoding='UTF-8') as file:
        contents = file.readlines()
    for msg in tqdm(contents,desc='进度： %s' %i):
        # Only the newline is stripped: a line ending in a space keeps an
        # empty string as its last field after the split.
        adm = msg.strip('\n').split(' ')
        fp.append(adm)

进度： 0: 100%|██████████| 700800/700800 [00:04<00:00, 145756.26it/s]
进度： 1: 100%|██████████| 700682/700682 [00:04<00:00, 149527.70it/s]
进度： 2: 100%|██████████| 701006/701006 [00:06<00:00, 113806.18it/s]
进度： 3: 100%|██████████| 700999/700999 [00:07<00:00, 93174.06it/s] 
进度： 4: 100%|██████████| 701072/701072 [00:43<00:00, 16205.68it/s] 
进度： 5: 100%|██████████| 700529/700529 [02:00<00:00, 5801.52it/s]  
进度： 6: 100%|██████████| 701666/701666 [03:57<00:00, 2950.21it/s]  
进度： 7: 100%|██████████| 700331/700331 [00:04<00:00, 143180.70it/s]
进度： 8: 100%|██████████| 700276/700276 [06:12<00:00, 1880.47it/s]  
进度： 9: 100%|██████████| 700724/700724 [00:04<00:00, 168621.91it/s]


2D pharmacophore fingerprint

In [None]:
# Two-character tokens that split() keeps together: element symbols,
# lower-case si/se/te, and the charges +2..+4 / -2..-4.
_SMILES_PAIR_TOKENS = frozenset([
    'Cl', 'Ca', 'Cu',
    'Br', 'Be', 'Ba', 'Bi',
    'Si', 'Se', 'Sr',
    'Na', 'Ni',
    'Rb', 'Ra',
    'Xe', 'Li',
    'Al', 'As', 'Ag', 'Au',
    'Mg', 'Mn',
    'Te', 'Zn',
    'si', 'se', 'te',
    'He',
    '+2', '+3', '+4',
    '-2', '-3', '-4',
    'Kr', 'Fe',
])


def split(sm):
    '''
    function: Tokenise a SMILES string into space-separated words.
              Known two-character tokens (Cl, Br, Si, Se, Na, +2, ...) stay
              together, '%' is emitted together with the two characters that
              follow it, and every other character becomes its own word.
    input: A SMILES
    output: A string with space between words
    '''
    tokens = []
    last = len(sm) - 1
    pos = 0
    # The scan stops one character early so a two-character lookahead is
    # always available; a left-over final character is handled afterwards.
    while pos < last:
        if sm[pos] == '%':
            width = 3
        elif sm[pos:pos + 2] in _SMILES_PAIR_TOKENS:
            width = 2
        else:
            width = 1
        tokens.append(sm[pos:pos + width])
        pos += width
    if pos == last:
        tokens.append(sm[pos])
    return ' '.join(tokens)

In [None]:
pharm_corpus_path = 'data/pubchem/pharm_corpus1.txt'
smi_corpus_path = 'data/pubchem/smi_corpus1.txt'
smi_pharm_filepath = 'data/pubchem/smi_pharm_corpus1.txt'
# Write two line-aligned corpora from `fp` (rows of [SMILES, bit, bit, ...]):
#   f1 <- the fingerprint fields joined by single spaces
#   f2 <- the SMILES tokenised by split()
with open(pharm_corpus_path, 'w') as f1, open(smi_corpus_path, 'w') as f2:
    for row in tqdm(fp):
        if row is None:
            continue
        sm = row[0]
        # Join the fields directly. This gives the same text as rendering the
        # list with str() and stripping quotes, commas and brackets, without
        # depending on how Python prints a list.
        word = ' '.join(map(str, row[1:]))
        f1.write(word + '\n')
        f2.write(split(sm) +'\n')
print('Built pharm & smi corpus file!')

100%|██████████| 7008085/7008085 [06:19<00:00, 18448.21it/s]


Built pharm & smi corpus file!


InChI

In [None]:
def get_inchi(smile):
    """Return the string ``"<smile> <InChI>"`` for one SMILES.

    Returns ``None`` when RDKit cannot parse ``smile``; the calling cell
    drops ``None`` entries before writing the list to disk.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # Skip an unparsable SMILES instead of letting the InChI conversion
        # fail on None and abort the whole map() run.
        return None
    inchi = Chem.MolToInchi(mol)
    return smile +' ' + inchi

In [None]:
from tqdm import tqdm
# Collect the SMILES (first field of every fingerprint line) from the ten
# chunk files, both in memory (`fp`) and on disk (smiles.txt).
# NOTE(review): the fingerprint cell writes 'fp_list_1_%d.txt' to the working
# directory, while this cell reads from data/pubchem/fp_list_1/ - presumably
# the files were moved by hand; confirm.
fp=[]
with open ('smiles.txt' ,'w')as f:
    for i in range(10):
        # `with` closes each chunk file even if the loop is interrupted.
        with open('data/pubchem/fp_list_1/fp_list_1_%d.txt' %i, mode='r',encoding='UTF-8') as file:
            contents = file.readlines()
        for msg in tqdm(contents,desc='进度： %s' %i):
            adm = msg.strip('\n').split(' ')
            fp.append(adm[0])
            f.write(adm[0]+'\n')

进度： 0: 100%|██████████| 700800/700800 [00:02<00:00, 299875.01it/s]
进度： 1: 100%|██████████| 700682/700682 [00:02<00:00, 314870.35it/s]
进度： 2: 100%|██████████| 701006/701006 [00:02<00:00, 321122.57it/s]
进度： 3: 100%|██████████| 700999/700999 [00:02<00:00, 327897.92it/s]
进度： 4: 100%|██████████| 701072/701072 [00:02<00:00, 328174.16it/s]
进度： 5: 100%|██████████| 700529/700529 [00:02<00:00, 332551.92it/s]
进度： 6: 100%|██████████| 701666/701666 [00:02<00:00, 328758.11it/s]
进度： 7: 100%|██████████| 700331/700331 [00:02<00:00, 328092.88it/s]
进度： 8: 100%|██████████| 700276/700276 [00:02<00:00, 322529.97it/s]
进度： 9: 100%|██████████| 700724/700724 [00:02<00:00, 326977.33it/s]


In [5]:
# Convert every SMILES in `fp` with get_inchi() and write the non-None results
# to inchi_list.txt, one per line.
# The progress label used to contain a '%d' placeholder that was never
# formatted, so it was printed literally; it has been removed.
inchi = list(tqdm(map(get_inchi,fp),total=len(fp), desc='监视进度:'))
inchi_filter_None = [item for item in inchi if item is not None]

# The with-block closes the file; no explicit close() is required.
with open('inchi_list.txt','w') as f:
    for line in inchi_filter_None:
        f.write(line)
        f.write('\n')

%d 监视进度:: 100%|██████████| 7008085/7008085 [45:59<00:00, 2539.28it/s]


In [2]:
# Load inchi_list.txt as a space-separated table without a header row, then display it.
inchi_df = pd.read_csv('inchi_list.txt', sep=' ', header = None)
inchi_df

Unnamed: 0,0,1
0,CN(c1ccccc1)c1ccccc1C(=O)NCC1(O)CCOCC1,InChI=1S/C20H24N2O3/c1-22(16-7-3-2-4-8-16)18-1...
1,CC[NH+](CC)C1CCC([NH2+]C2CC2)(C(=O)[O-])C1,InChI=1S/C13H24N2O2/c1-3-15(4-2)11-7-8-13(9-11...
2,COCC(CNC(=O)c1ccc2c(c1)NC(=O)C2)OC,InChI=1S/C14H18N2O4/c1-19-8-11(20-2)7-15-14(18...
3,OCCn1cc(CNc2cccc3c2CCCC3)nn1,InChI=1S/C15H20N4O/c20-9-8-19-11-13(17-18-19)1...
4,O=C(NCc1ccc(F)cc1)N1CC=C(c2c[nH]c3ccccc23)CC1,InChI=1S/C21H20FN3O/c22-17-7-5-15(6-8-17)13-24...
...,...,...
7008080,Cc1cc(C(=O)NCC([NH3+])C2CC2)ccc1n1cncn1,InChI=1S/C15H19N5O/c1-10-6-12(4-5-14(10)20-9-1...
7008081,CCCn1ncc(NC(CC)CC)c(Cl)c1=O,InChI=1S/C12H20ClN3O/c1-4-7-16-12(17)11(13)10(...
7008082,O=C(Cn1cc(C=C2NC(=O)N(Cc3ccccc3F)C2=O)c2ccccc2...,InChI=1S/C27H21FN4O3/c28-22-12-6-4-8-18(22)16-...
7008083,COc1cc(C(F)(F)F)cc(n2c(C)nc(C#Cc3ccnc(Cl)c3)c2...,InChI=1S/C20H15ClF3N3O/c1-12-18(5-4-14-6-7-25-...


In [12]:
# Two-character tokens that split_inchi() keeps together. Unlike the SMILES
# tokeniser in this notebook, the negative charges (-2, -3, -4) are not merged.
_INCHI_PAIR_TOKENS = frozenset([
    'Cl', 'Ca', 'Cu',
    'Br', 'Be', 'Ba', 'Bi',
    'Si', 'Se', 'Sr',
    'Na', 'Ni',
    'Rb', 'Ra',
    'Xe', 'Li',
    'Al', 'As', 'Ag', 'Au',
    'Mg', 'Mn',
    'Te', 'Zn',
    'si', 'se', 'te',
    'He',
    '+2', '+3', '+4',
    'Kr', 'Fe',
])


def split_inchi(inchi):
    '''
    function: Split inchi into words. Care for Cl, Br, Si, Se, Na etc.
              Tokenisation rules, checked left to right:
                * '%' is emitted together with the two characters after it;
                * a run starting with 'InC' is emitted as one 9-character
                  word (the length of a leading 'InChI=1S/');
                * a digit 1-4 followed by any digit forms a two-digit word
                  (10..49); all other digits are emitted one by one;
                * known two-character tokens (Cl, Br, +2, ...) stay together;
                * every other character is its own word.
    input: A inchi
    output: A string with space between words
    '''
    tokens = []
    last = len(inchi) - 1
    pos = 0
    # Stop one character early so inchi[pos + 1] always exists; a left-over
    # final character is appended after the loop.
    while pos < last:
        ch = inchi[pos]
        nxt = inchi[pos + 1]
        if ch == '%':
            width = 3
        elif inchi[pos:pos + 3] == 'InC':
            # Slice comparison instead of indexing pos + 2: a string ending
            # in "In" no longer raises IndexError.
            width = 9
        elif ch in '1234' and nxt in '0123456789':
            width = 2
        elif ch + nxt in _INCHI_PAIR_TOKENS:
            width = 2
        else:
            width = 1
        tokens.append(inchi[pos:pos + width])
        pos += width
    if pos == last:
        tokens.append(inchi[pos])
    return ' '.join(tokens)

In [17]:
# Read inchi_list.txt back into `inchi`: one list of space-separated fields
# per line (only the newline is stripped).
with open('inchi_list.txt','r') as file:
    contents = file.readlines()
inchi = [msg.strip('\n').split(' ') for msg in tqdm(contents)]


100%|██████████| 7008085/7008085 [00:09<00:00, 753110.71it/s] 


In [None]:
inchi_corpus_path = 'data/pubchem/inchi_corpus1.txt'
smi_corpus_path = 'data/pubchem/smi_corpus2.txt'
smi_inchi_filepath = 'data/pubchem/smi_inchi_corpus1.txt'
# Write two line-aligned corpora from `inchi` (rows whose first field is the
# SMILES and whose second field is the InChI):
#   f1 <- the InChI tokenised by split_inchi()
#   f2 <- the SMILES tokenised by split()
with open(inchi_corpus_path, 'w') as f1, open(smi_corpus_path, 'w') as f2:
    for pair in tqdm(inchi):
        if pair is not None:
            sm, word = pair[0], pair[1]
            f1.write(split_inchi(word) + '\n')
            f2.write(split(sm) +'\n')
print('Built inchi & smi corpus file!')

100%|██████████| 7008085/7008085 [15:40<00:00, 7452.84it/s]

Built inchi & smi corpus file!





PubChem

In [None]:
from pubchemfp import GetPubChemFPs
import numpy as np
from rdkit import Chem
from tqdm import tqdm

[18:30:21] Enabling RDKit 2019.09.3 jupyter extensions


In [None]:
def GetPubChemFpBits(smiles):
    """Return ``[smiles, bit, bit, ...]`` for the PubChem fingerprint.

    The molecule is parsed, hydrogens are added, and ``GetPubChemFPs`` is
    evaluated; the result is the SMILES followed by the indices (0..880) of
    the positions whose value equals 1/True.

    Returns ``None`` (the calling cell filters these out) when RDKit cannot
    parse ``smiles`` or when ``GetPubChemFPs`` raises.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparsable SMILES: skip it instead of failing inside AddHs.
        return None
    mol2 = Chem.AddHs(mol)
    try:
        result = GetPubChemFPs(mol2)
    except Exception:
        # Best-effort skip of molecules the fingerprinter cannot handle.
        # Unlike a bare `except:`, KeyboardInterrupt / SystemExit still
        # propagate, so a long run can be stopped.
        return None
    # `== 1` matches exactly the values that `== True` matches.
    return [smiles] + [idx for idx in range(881) if result[idx] == 1]

In [None]:

# Collect the SMILES (first field of every fingerprint line) from the ten
# chunk files, both in memory (`fp`) and on disk (smiles.txt).
fp=[]
with open ('smiles.txt' ,'w')as f:
    for i in range(10):
        # `with` closes each chunk file even if the loop is interrupted.
        with open('data/pubchem/fp_list_1/fp_list_1_%d.txt' %i, mode='r',encoding='UTF-8') as file:
            contents = file.readlines()
        for msg in tqdm(contents,desc='进度： %s' %i):
            adm = msg.strip('\n').split(' ')
            fp.append(adm[0])
            f.write(adm[0]+'\n')

进度： 0: 100%|██████████| 700800/700800 [00:02<00:00, 336358.61it/s]
进度： 1: 100%|██████████| 700682/700682 [00:02<00:00, 321683.42it/s]
进度： 2: 100%|██████████| 701006/701006 [00:02<00:00, 341673.25it/s]
进度： 3: 100%|██████████| 700999/700999 [00:02<00:00, 331333.40it/s]
进度： 4: 100%|██████████| 701072/701072 [00:02<00:00, 344526.32it/s]
进度： 5: 100%|██████████| 700529/700529 [00:02<00:00, 333365.28it/s]
进度： 6: 100%|██████████| 701666/701666 [00:02<00:00, 341385.47it/s]
进度： 7: 100%|██████████| 700331/700331 [00:02<00:00, 340691.68it/s]
进度： 8: 100%|██████████| 700276/700276 [00:02<00:00, 338551.56it/s]
进度： 9: 100%|██████████| 700724/700724 [00:02<00:00, 338742.61it/s]


In [None]:
# NOTE(review): `smiles` is rebound once more, now to the list of SMILES strings held in `fp`.
smiles = fp
len(smiles)  # bare mid-cell expression: evaluated but not displayed
# Split the list into 10 roughly equal chunks for the fingerprint loop.
data_split= np.array_split(smiles, 10)
len(data_split[0])  # size of the first chunk (shown as the cell output)

700809

In [None]:
# For each of the 10 chunks: compute the PubChem fingerprint bits of every
# SMILES, drop the molecules for which GetPubChemFpBits() returned None, and
# write one text file per chunk.
# Line format: "<SMILES> <bit> <bit> ... " - every field, including the last
# one, is followed by a single space.
for i in range(10):
    mols = data_split[i].tolist()
    fp = list(tqdm(map(GetPubChemFpBits,mols),total=len(data_split[i]), desc='%d 监视进度:' %(i,)))

    fp_filter_None = [item for item in fp if item is not None]

    # The with-block closes the file on exit; the inner loops use their own
    # names so the chunk index `i` is no longer shadowed.
    with open('pubfp/fp_list_%d.txt' %i,'w') as f:
        for row in fp_filter_None:
            f.write(''.join(str(field) + ' ' for field in row))
            f.write('\n')

0 监视进度:: 100%|██████████| 700809/700809 [1:09:52<00:00, 167.15it/s]
1 监视进度:: 100%|██████████| 700809/700809 [1:10:05<00:00, 166.66it/s]
2 监视进度:: 100%|██████████| 700809/700809 [1:09:02<00:00, 169.16it/s]
3 监视进度:: 100%|██████████| 700809/700809 [1:07:58<00:00, 171.84it/s]
4 监视进度::  37%|███▋      | 259332/700809 [25:12<42:48, 171.87it/s]  

In [None]:
# Rebuild `pub_fp` from the ten chunk files: one list of string fields per
# line, with the SMILES as the first field.
pub_fp=[]
for i in range(10):
    # `with` guarantees the file is closed even if reading is interrupted.
    with open('pubfp/fp_list_%d.txt' %i, mode='r',encoding='UTF-8') as file:
        contents = file.readlines()
    for msg in tqdm(contents,desc='进度： %s' %i):
        # Only the newline is stripped: a line ending in a space keeps an
        # empty string as its last field after the split.
        adm = msg.strip('\n').split(' ')
        pub_fp.append(adm)

进度： 0: 100%|██████████| 700809/700809 [00:07<00:00, 95878.39it/s] 
进度： 1: 100%|██████████| 700809/700809 [00:24<00:00, 28257.70it/s] 
进度： 2: 100%|██████████| 700809/700809 [01:08<00:00, 10250.12it/s] 
进度： 3: 100%|██████████| 700809/700809 [02:16<00:00, 5120.08it/s]  
进度： 4: 100%|██████████| 700809/700809 [03:45<00:00, 3112.25it/s]  
进度： 5: 100%|██████████| 700808/700808 [06:25<00:00, 1815.85it/s] 
进度： 6: 100%|██████████| 700808/700808 [09:21<00:00, 1247.57it/s]  
进度： 7: 100%|██████████| 700808/700808 [00:09<00:00, 76758.33it/s] 
进度： 8: 100%|██████████| 700808/700808 [15:38<00:00, 746.41it/s]   
进度： 9: 100%|██████████| 700808/700808 [00:10<00:00, 67696.48it/s] 


In [None]:
pubchem_corpus_path = 'data/pubchem/pubchemfp_corpus1.txt'
smi_corpus_path = 'data/pubchem/smi_corpus3.txt'
smi_pubchem_filepath = 'data/pubchem/smi_pubchemfp_corpus1.txt'
# Write two line-aligned corpora from `pub_fp` (rows of [SMILES, bit, bit, ...]):
#   f1 <- the fingerprint fields joined by single spaces
#   f2 <- the SMILES tokenised by split()
with open(pubchem_corpus_path, 'w') as f1, open(smi_corpus_path, 'w') as f2:
    for row in tqdm(pub_fp):
        if row is None:
            continue
        sm = row[0]
        # Join the fields directly. This gives the same text as rendering the
        # list with str() and stripping quotes, commas and brackets, without
        # depending on how Python prints a list.
        word = ' '.join(map(str, row[1:]))
        f1.write(word + '\n')
        f2.write(split(sm) +'\n')
# The message used to say "pharm" (copied from the pharmacophore cell).
print('Built pubchem & smi corpus file!')

100%|██████████| 7008085/7008085 [11:50<00:00, 9865.77it/s] 

Built pharm & smi corpus file!





Build smi_pharm_inchi_pubchem_corpus

In [None]:
pharm_corpus_path = 'data/pubchem/pharm_corpus1.txt'
smi_corpus_path = 'data/pubchem/smi_corpus1.txt'
smi_pharm_filepath = 'data/pubchem/smi_pharm_inchi_pubchem_corpus2.txt'
# NOTE(review): the InChI corpus cell of this notebook writes
# inchi_corpus1.txt, while this cell reads inchi_corpus2.txt - confirm which
# file is intended.
inchi_corpus_path = 'data/pubchem/inchi_corpus2.txt'
pubchem_corpus_path = 'data/pubchem/pubchemfp_corpus1.txt'


def _read_corpus(path):
    """Return the lines of ``path`` with the trailing newline removed."""
    with open(path, 'r') as handle:
        return [line.strip('\n') for line in tqdm(handle)]


s = _read_corpus(smi_corpus_path)
pubfp = _read_corpus(pubchem_corpus_path)
fp = _read_corpus(pharm_corpus_path)
inchi = _read_corpus(inchi_corpus_path)

# The four corpora are combined row by row, so they must have the same number
# of lines; otherwise the columns would be silently padded and misaligned.
line_counts = {
    smi_corpus_path: len(s),
    pharm_corpus_path: len(fp),
    inchi_corpus_path: len(inchi),
    pubchem_corpus_path: len(pubfp),
}
if len(set(line_counts.values())) != 1:
    raise ValueError('Corpus files differ in length: %r' % (line_counts,))

# One row per molecule; written tab-separated without index or header.
data = pd.DataFrame(
    {'smiles': s, 'pharmfp': fp, 'inchi': inchi, 'pubchemfp': pubfp},
    columns=['smiles','pharmfp','inchi','pubchemfp'],
)
data.to_csv(smi_pharm_filepath,index=False,sep='\t',header=False)
print('Built smi_pharm_inchi translation corpus file!')

7008085it [00:04, 1528573.34it/s]
7008085it [00:15, 438007.71it/s]
7008085it [00:09, 712091.74it/s]
7008085it [00:07, 910260.66it/s]


Built smi_pharm_inchi translation corpus file!


In [8]:
import BuildVocab

# Output files for the two vocabularies.
pharm_vocab_path = 'data/pubchem/pharm_vocab1.pkl'
smi_vocab_path = 'data/pubchem/smi_vocab1.pkl'

# NOTE(review): pharm_corpus_path and smi_corpus_path are not assigned in this
# cell; they hold whatever an earlier cell last set (smi_corpus_path is
# assigned in several cells) - confirm the intended corpus before running.
BuildVocab.built_vocab(pharm_corpus_path,pharm_vocab_path)
BuildVocab.built_vocab(smi_corpus_path,smi_vocab_path)

Building Vocab
VOCAB SIZE: 2260
Vocab build done! 
Building Vocab
VOCAB SIZE: 87
Vocab build done! 


In [2]:
# Build the InChI vocabulary from the tokenised InChI corpus.
# Relies on BuildVocab having been imported by an earlier cell.
inchi_vocab_path = 'data/pubchem/inchi_vocab1.pkl'
inchi_corpus_path = 'data/pubchem/inchi_corpus1.txt'
BuildVocab.built_vocab(inchi_corpus_path,inchi_vocab_path)

Building Vocab
VOCAB SIZE: 124
Vocab build done! 


In [None]:

# Build the PubChem-fingerprint vocabulary from its corpus file.
# Relies on BuildVocab having been imported by an earlier cell.
pubfp_vocab_path = 'data/pubchem/pubfp_vocab1.pkl'
pubfp_corpus_path = 'data/pubchem/pubchemfp_corpus1.txt'
BuildVocab.built_vocab(pubfp_corpus_path,pubfp_vocab_path)

Building Vocab


In [None]:
from BuildVocab import WordVocab 
# Reload the saved PubChem-fingerprint vocabulary and display its size.
pubfp_vocab_path = 'data/pubchem/pubfp_vocab1.pkl'
src_vocab = WordVocab.load_vocab(pubfp_vocab_path)
len(src_vocab)

842