In [9]:
import copy
import importlib
import pickle
import tempfile
import urllib
import warnings
from collections import defaultdict
from imp import reload
from time import time

import numpy as np
import pandas as pd
from progiter import ProgIter

from mymodule import process_smiles, make_combo_fp, predict_synergy

warnings.simplefilter('error', FutureWarning)

In [2]:
# Load the dose table, keeping only the drug_row / drug_col columns and their
# *_cid counterparts.

df_name = '/tf/notebooks/code_for_pub/input_files/doses_CssSyn2020_1.csv'
col_names = ['drug_row', 'drug_col', 'drug_row_cid', 'drug_col_cid']

# The file is pipe-delimited; the remaining options pin the C parser, the
# newline record terminator and double-quote quoting explicitly.
df = pd.read_csv(
    df_name,
    sep='|',
    usecols=col_names,
    engine='c',
    lineterminator='\n',
    quotechar='"',
    low_memory=False,
)

In [3]:
# Load the DrugComb drug export (all DrugComb drugs as of October 2020).

filename = '/tf/notebooks/code_for_pub/smiles_files/drugcomb_drugs_export_OCT2020.csv'
names = ['dname', 'id', 'smiles', 'cid']

# header=0 combined with names= replaces the file's own header row with the
# labels above; the 'id' column becomes the index.
smiles = pd.read_csv(
    filename,
    sep=',',
    skiprows=0,
    header=0,
    names=names,
    index_col='id',
)

In [4]:
# Collect the unique CIDs of the drugs that take part in a combination in the data file.
# DrugComb-specific convention: drug_col is NaN when a row is not a combo,
# so only rows with a non-null drug_col are kept.

drugs_in_combo_by_cid = (
    df.loc[df['drug_col'].notna(), ['drug_row_cid', 'drug_col_cid']]
    .melt(value_name='cid')            # stack both CID columns into one 'cid' column
    .drop_duplicates(subset=['cid'])
    .drop(columns=['variable'])        # 'variable' only records the source column name
    .sort_values(by='cid')
    .reset_index(drop=True)
    .iloc[:, 0]                        # single remaining column -> Series
)

In [5]:
# Look up the SMILES of those drugs. The result is a pandas Series of SMILES
# strings indexed by cid (not a dict).

smiles_by_cid = (
    smiles.loc[smiles['cid'].isin(drugs_in_combo_by_cid)]
    .set_index('cid')['smiles']
)

In [29]:
# ChEMBL 26 SMILES are read from a local CSV. The commented-out lines are kept
# as a record of an alternative that downloads the release from the EBI FTP site.

# Download ChEMBL_26
#fd = tempfile.NamedTemporaryFile()
#url = 'http://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_26/chembl_26_chemreps.txt.gz'
#urllib.request.urlretrieve(url, fd.name)

# reading in Chembl26 dataset
#df = pd.read_table(fd.name, compression='gzip')

# remove na's. There is a single one CHEMBL1201364
#df = df[~df['canonical_smiles'].isna()]

# NOTE: this rebinds `df`, which until now held the dose table. Cells above
# that read `df` will not work if re-run after this one without reloading it.
df = pd.read_csv('/tf/notebooks/code_for_pub/input_files/ChEMBL26.csv')

# Build a fresh Series with a 0..n-1 index instead of resetting the index in
# place on the column object pulled out of `df`.
smiles_ch = df['canonical_smiles'].reset_index(drop=True)

In [32]:
# Combine the ChEMBL 26 and DrugComb SMILES into one de-duplicated Series.
# pd.concat is used instead of Series.append: append is deprecated since
# pandas 1.4 (its FutureWarning is turned into an error by the warnings filter
# set in the first cell) and was removed in pandas 2.0.
smiles_all = pd.concat([smiles_ch, smiles_by_cid], ignore_index=True).drop_duplicates()

In [34]:
# Re-import process_smiles so that edits to its source file take effect without
# restarting the kernel. importlib.reload is used instead of imp.reload: the
# imp module is deprecated and was removed in Python 3.12.
importlib.reload(process_smiles)

<module 'mymodule.process_smiles' from '/tf/notebooks/code_for_pub/mymodule/process_smiles.py'>

In [35]:
# Standardize the combined SMILES and remove strings that are too long or too
# short, using the 8-140 cutoff.

processor = process_smiles.DataPrep(smiles_all)

# fin() runs the preprocessing; duplicate values in its result are then dropped.
# NOTE(review): smiles_all was built with ignore_index=True, so the index of
# `final` is presumably positional rather than cid — confirm against DataPrep.fin().
final = processor.fin().drop_duplicates()

---------------
num_SMILES: 1944338, size cutoffs: [8, 140] 
----start of wash----
----start chop----
remove 52550 SMILES with cut-off [8, 140]
---------------
----final num SMILES: 1891788
---------------


In [None]:
#with open('/tf/notebooks/code_for_pub/smiles_files/smiles_drugcombANDchembl26.pickle', 'wb') as f:
#    pickle.dump(final, f)

In [41]:
# Display the raw DrugComb SMILES Series (indexed by cid) that is preprocessed in the next cell.
smiles_by_cid

cid
3385                                     C1=C(C(=O)NC(=O)N1)F
11960529                  CC1(CCCN1)C2=NC3=C(C=CC=C3N2)C(=O)N
24856436    CC(C)(C1=NC(=CC=C1)N2C3=NC(=NC=C3C(=O)N2CC=C)N...
11977753    CC(C)(C#N)C1=CC=C(C=C1)N2C3=C4C=C(C=CC4=NC=C3N...
387447      B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...
                                  ...                        
9888590     CC(=O)N(C1=CC=CC=C1C=CC2=CC=[N+](C=C2)[O-])S(=...
25182616    CNC(=O)C1=C(C=CC=C1F)NC2=NC(=NC3=C2C=CN3)NC4=C...
53302361    CCN1CCN(CC1)CC2=C(C=C(C=C2)NC(=O)C3=CC(=C(C=C3...
53392493    COC(=O)C=CC(=O)N(CCCCNCC1=CC=C(C=C1)COC(=O)NC2...
57519531    CC1(CN(CC1(C)CO)C2=NC=C(C(=C2)C(=O)NC3=CC4=C(C...
Name: smiles, Length: 4197, dtype: object

In [42]:
# Run the same preprocessing on the DrugComb-only SMILES, which are indexed by cid.
processor_dc = process_smiles.DataPrep(smiles_by_cid)

#####
# Unlike `final`, this result is not passed through drop_duplicates().
final_dc = processor_dc.fin() # per the original note: outputs a Series with cid as index and the standardized SMILES as value
#####


---------------
num_SMILES: 4197, size cutoffs: [8, 140] 
----start of wash----
----start chop----
remove 44 SMILES with cut-off [8, 140]
---------------
----final num SMILES: 4153
---------------


In [49]:
#with open('/tf/notebooks/code_for_pub/smiles_files/smiles_drugcomb_BY_cid_duplicated.pickle', 'wb') as f:
#    pickle.dump(final_dc, f)