## Get Pubchem drug feautures

1. finding corresponding Pubchem ids for the drugs 
2. call Pubchem to get chemical properties of the drugs
3. Preprocess text Drug description from the original datasets
4. Preprocess some text characteristics from PubChem properties

In [46]:
import pandas as pd
import numpy as np
import os
# pip install PubChemPy
import pubchempy as pcp
import re
from pubchempy import Compound
import warnings
warnings.filterwarnings("ignore")
import time
from tqdm import tqdm
import pickle

_FOLDER = "data/"

In [47]:
os.listdir(_FOLDER)

['drugs_gdsc2.csv',
 'Cell_Line_Features_PANCAN_simple_MOBEM.tsv',
 'Cell_Lines_Details.csv',
 'drugs_gdsc1.csv',
 'Cell_Line_Features_PANCAN_simple_MOBEM.xlsx',
 'Drug_Features.csv',
 'normalised_dose_response_data.csv']

In [69]:
drug_features = pd.read_csv(_FOLDER + "Drug_Features.csv").rename(columns={"Drug ID": "DRUG_ID", 
                                                                           "Drug Name": "Drug_Name",
                                                                          "Target Pathway": "Target_Pathway"})
#drug_features.set_index("DRUG_ID", inplace= True)
df = pd.read_csv(_FOLDER+'Drug_Features.csv')
ind_wrong_name = df[df["Drug Name"]=="Lestauritinib"].index
df.loc[ind_wrong_name, "Drug Name"] = "Lestaurtinib"

drug_features.head()

Unnamed: 0,DRUG_ID,Drug_Name,Synonyms,Target,Target_Pathway
0,1,Erlotinib,"Tarceva, RG-1415, CP-358774, OSI-774, Ro-50823...",EGFR,EGFR signaling
1,3,Rapamycin,"AY-22989, Sirolimus, WY-090217, Torisel, Rapamune",MTORC1,PI3K/MTOR signaling
2,5,Sunitinib,"Sutent, Sunitinib Malate, SU-11248","PDGFR, KIT, VEGFR, FLT3, RET, CSF1R",RTK signaling
3,6,PHA-665752,"PHA665752, PHA 665752",MET,RTK signaling
4,9,MG-132,"LLL cpd, MG 132, MG132","Proteasome, CAPN1",Protein stability and degradation


## Part 1: Get PubChem ids

In [70]:
#the most recent data on drugs properties
#https://www.cancerrxgene.org/downloads/anova?screening_set=GDSC1

drugs_1 = pd.read_csv(_FOLDER+'drugs_gdsc1.csv')

In [71]:
def GetPubChemId(df, gdsc, new_column_name = "pubchem"):
    """Take pubchem id for all the drugs"""
    
    manual_corrections_dict= {
    #Camptothecin == SN38, pubchem id: 104842 
     "Camptothecin" : "104842",
    #from gdsc2: WIKI4	WIKI-4, WIKI 4	WNT signaling	TNKS1, TNKS2	2984337
     "WIKI4": "2984337",
    #from gdsc2: SB505124	SB 505124, SB505124	RTK signaling	TGFBR1, ACVR1B, ACVR1C	9858940
        "SB-505124" : "9858940",
    #from gdsc1: drug_name	synonyms	pathway_name	targets	pubchem
    #Wee1 Inhibitor	681640, Wee1 Inhibitor	Cell cycle	WEE1, CHEK1	10384072
        "681640": "10384072",
     #BX796 https://mpegs-1.com/index.php/2017/12/18/Celecoxib/ 73051434
       #???? "BX796" : "73051434",
    #https://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot1/combo/GDSC_drugs
    # matches BX796 to BX795 with pubchemid 10077147 https://github.com/ECP-CANDLE/Benchmarks
    "BX796" : "10077147"
    }
    
    not_found = []
    for ind in df["Drug_Name"].index:
        name = df.loc[ind, "Drug_Name"]
        drug = gdsc[gdsc["drug_name"]== name]
        if drug.shape[0]==1:
            df.loc[ind, new_column_name]= drug["pubchem"].values[0]
        elif drug.shape[0]==0:
            not_found.append(name)
        else:
            if drug["pubchem"].values[0] != "-":
                df.loc[ind, new_column_name]= drug["pubchem"].values[0]
            else: 
                df.loc[ind, new_column_name]= sorted(drug["pubchem"].values, reverse=True)[0]
    not_found2 = []
    for name in not_found:
        if name in manual_corrections_dict:
            df.loc[ind, new_column_name]= manual_corrections_dict[name]
        else:
            not_found2.append(name)
    
    return df, not_found2

In [72]:
drug_features, not_found = GetPubChemId(drug_features, drugs_1, new_column_name = "pubchem_id")
drug_features[drug_features["pubchem_id"]=="several"]

Unnamed: 0,DRUG_ID,Drug_Name,Synonyms,Target,Target_Pathway,pubchem_id
6,17,Cyclopamine,-,SMO,Other,several
50,153,Midostaurin,"PKC412, benzoylstaurosporine, CGP-41251","PKC, PPK, FLT1, c-FGR, others",Other,several
76,190,Bleomycin,-,dsDNA break induction,DNA replication,several


### Preprocess drugs with several pubchem ids

In [73]:

# Cyclopamine (DRUG_ID=17) => 423 profiles
# Midostaurin (DRUG_ID=153) => 941 profiles
# Bleomycin (DRUG_ID=190) => 932 profiles

#manual corrections:
correct_dict = {"Bleomycine": {"pubchem_id" : 72467,
                               "reference" : "https://www.drugbank.ca/drugs/DB00290"},
                "Bleomycin (50 uM)": {"pubchem_id" :5460769},
                
                "Cyclopamine":  {"pubchem_id": 442972,
                                 "reference" : "https://www.sciencedirect.com/science/article/abs/pii/S0014299915300625"
                                },
                "Midostaurin":  {"pubchem_id": 9829523,
                                 "reference" : "https://link.springer.com/article/10.1007/s13205-019-1594-y"
                                }
               }

for drug in correct_dict:
    drug_features.loc[drug_features["Drug_Name"] == drug, "pubchem_id"] = correct_dict[drug]["pubchem_id"]

In [74]:
drug_features["pubchem_id"] = drug_features["pubchem_id"].astype("int64") 

ValueError: invalid literal for int() with base 10: 'none'

### Preprocess drugs with pubchem ids == none

In [75]:
drug_features[drug_features["pubchem_id"]=="none"].shape

(24, 6)

In [76]:
drug_features[drug_features["pubchem_id"]=="none"]["Drug_Name"]

58                 JQ12
92             TL-2-105
98     Genentech Cpd 10
103                 FMK
104           QL-XII-47
108              WZ3105
109            XMD14-99
112           JW-7-24-1
113       NPK76-II-72-1
116             TL-1-85
134          KIN001-236
158            QL-XI-92
159             XMD13-2
160            QL-X-138
161            XMD15-27
164            THZ-2-49
166         THZ-2-102-1
225         HG-5-113-01
226          HG-5-88-01
228           XMD11-85h
229               ZG-10
231          QL-VIII-58
237           QL-XII-61
248              rTRAIL
Name: Drug_Name, dtype: object

In [77]:
drug_features[drug_features["Drug_Name"] == "Obatoclax Mesylate"]

Unnamed: 0,DRUG_ID,Drug_Name,Synonyms,Target,Target_Pathway,pubchem_id
72,182,Obatoclax Mesylate,"GX15-070MS, Obatoclax, GX15-070","BCL2, BCL-XL, BCL-W, MCL1",Apoptosis regulation,11404337


In [78]:
# Manual matching for drugs with missing or mutiple data

def RunManualCorrections(df_drugs, drop_not_found_drugs = False):
    new_synonyms = {"Y-39983": {"Synonyms": "Y-33075",
                           "reference": ["https://www.medchemexpress.com/Y-33075.html",
                            "https://www.nature.com/articles/s41467-019-13781-3"]}}

    manual_corrections = {
  
    "WZ-1-84": {"pubchem_id" : 49821040,
               "reference" : "http://lincs.hms.harvard.edu/db/datasets/20119/smallmolecules"},
    
    "Bleomycine": {"pubchem_id" : 72467,
               "reference" : "https://www.drugbank.ca/drugs/DB00290"},
    
    "Y-39983": {"pubchem_id" : 20601328,
               "reference" : "https://www.medchemexpress.com/Y-33075.html"},
    
    "JW-7-52-1": {"pubchem_id" : 20822503,
               "reference" : "https://pharmacodb.ca/drugs/392"},
    
    "VNLG/124": { "pubchem_id": 24894414, 
                  "reference": "https://www.cancerrxgene.org/compounds" },
    
    "PDK1 inhibitor 7": { "pubchem_id": 56965967, 
                         "reference": "https://www.cancerrxgene.org/compounds"},
    
    "KIN001-260": {"pubchem_id": 10451420, 
                   "reference": "https://www.cancerrxgene.org/compounds"},
    
    "SB52334": {"pubchem_id": 9967941, 
                "reference": "https://www.cancerrxgene.org/compounds"},

    
    "THZ-2-102-1" : {"pubchem_id": 146011539, 
                   "reference": "Katjusa Koler's suggestion"},
    
    "THZ-2-49" : {"pubchem_id": 78357763 , 
                   "reference": ["https://www.cancerrxgene.org/compounds", 
                                "https://www.medchemexpress.com/THZ2.html",
                                "https://pubchem.ncbi.nlm.nih.gov/compound/78357763"]},
    
    "QL-XII-47": {"pubchem_id": 71748056, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10077-101-1/"},
    
    "BMS-345541" : {"pubchem_id": 9813758, 
                   "reference": ""},
    
    "SB590885" : {"pubchem_id": 135398506, 
                   "reference": "https://pubchem.ncbi.nlm.nih.gov/#query=SB590885"},
    
    "WZ3105" : {"pubchem_id": 42628507, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10084-101/"},
    
    "NPK76-II-72-1" : {"pubchem_id": 46843648, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10070-101/"},
    
    "JW-7-24-1" : {"pubchem_id": 69923936, 
                   "reference": "https://lincs.hms.harvard.edu/db/sm/10019-101/"},
    "Bryostatin 1" : {"pubchem_id": 6435419, 
                   "reference": "https://pubchem.ncbi.nlm.nih.gov/#query=Bryostatin%201"},
    "QL-XI-92": {"pubchem_id": 73265214,
                 "reference": "Katjusa Koler's & Dennis Wang's database"},
    
    "SL0101": {"pubchem_id": 10459196,
                 "reference": "https://www.cancerrxgene.org/compounds"}, 
    "Z-LLNle-CHO": {"pubchem_id": 16760646  ,
                 "reference": "https://www.cancerrxgene.org/compounds"}, 
    "JNK-9L": {"pubchem_id": 25222038  ,
                 "reference": "https://www.cancerrxgene.org/compounds"}, 
    "KIN001-244": {"pubchem_id": 56965967  ,
                 "reference": "https://www.cancerrxgene.org/compounds"},
    "RO-3306":  {"pubchem_id": 44450571  ,
                 "reference": "https://www.cancerrxgene.org/compounds"},
    "EHT-1864": {"pubchem_id": 9938202  ,
                 "reference": "https://www.cancerrxgene.org/compounds"},  
    }
    
    corrections_pubchem_id = {
    "Temsirolimus": 6918289,
    "Vinorelbine": 5311497,
    "Y-39983": 9810884,
    "GW441756": 9943465, 
    "Vinblastine": 6710780,
    "Bryostatin 1": 5280757,
    "Parthenolide": 7251185,
    "Obatoclax Mesylate": 11404337,
    "Bleomycin (50 uM)": 5460769,
    "SB590885": 11316960,
    "Paclitaxel" :36314,
    "BMS-345541": 9813758,
    "YM201636" :  9956222, 
    }
    
    if "DRUG_ID" in df_drugs.columns:
        df_drugs.set_index("DRUG_ID", inplace=True)
    
    not_identified_drugs = {}

    for ind in df_drugs[df_drugs["pubchem_id"]=="none"].index:
            drug_name = df_drugs.loc[ind, "Drug_Name"]
    
            if drug_name in corrections_pubchem_id:
                df_drugs.loc[ind, "pubchem_id"] = corrections_pubchem_id[drug_name]
            elif drug_name in manual_corrections:
                df_drugs.loc[ind, "pubchem_id"]  = manual_corrections[drug_name]["pubchem_id"]
            else:
                not_identified_drugs[ind] = drug_name
                
    print("Total number of drugs:", df_drugs["Drug_Name"].nunique())
    print("Number of not found drugs:", len(not_identified_drugs))
    
    if drop_not_found_drugs:
        df_drugs.drop(list(not_identified_drugs.keys(), axis=1, inplace=True))
        
    return df_drugs, not_identified_drugs

In [79]:
drug_features, not_identified_drugs = RunManualCorrections(drug_features, drop_not_found_drugs = False)

Total number of drugs: 250
Number of not found drugs: 17


## Part 2 :Getting properties by PubChem API

In [7]:
def GetPubchemProperties(df_drug_properties):
    """df_drug_properties must be with index = drug_id
    pubchem_ids_dict should have structure:
    pubchem_ids_dict[drug_id]["PubChem_ID"] """
    
    potential_drug_to_drop =[]
    for drug_id in tqdm(pubchem_ids_dict):
        try:
            PubChem_id = int(pubchem_ids_dict[drug_id]["PubChem_ID"][0])
            c = Compound.from_cid(PubChem_id)
        
            df_drug_properties.loc[drug_id, "molecular_weight"] = c.molecular_weight
   
            df_drug_properties.loc[drug_id, "elements"] = str(set(c.elements)).strip("{").strip("}")
        
            bonds = [int(str(i).split(",")[-1].strip(")")) for i in c.bonds]
            df_drug_properties.loc[drug_id, "2bonds"] = bonds.count(2)
            df_drug_properties.loc[drug_id, "3bonds"] = bonds.count(3)

            df_drug_properties.loc[drug_id, "xlogp"] = c.xlogp
            df_drug_properties.loc[drug_id, "formal_charge"] = c.charge
    
            df_drug_properties.loc[drug_id, "surface_area"] = c.tpsa

            df_drug_properties.loc[drug_id, "complexity"] = c.complexity

            df_drug_properties.loc[drug_id, "h_bond_donor_count"] = c.h_bond_donor_count

            df_drug_properties.loc[drug_id, "h_bond_acceptor_count"] = c.h_bond_acceptor_count

            df_drug_properties.loc[drug_id, "rotatable_bond_count"] = c.rotatable_bond_count

            df_drug_properties.loc[drug_id, "heavy_atom_count"] = c.heavy_atom_count

            df_drug_properties.loc[drug_id, "atom_stereo_count"] = c.atom_stereo_count

            df_drug_properties.loc[drug_id, "defined_atom_stereo_count"] = c.defined_atom_stereo_count

            df_drug_properties.loc[drug_id, "undefined_atom_stereo_count"] = c.undefined_atom_stereo_count

            df_drug_properties.loc[drug_id, "bond_stereo_count"] = c.bond_stereo_count

            df_drug_properties.loc[drug_id, "covalent_unit_count"] = c.covalent_unit_count
            df_drug_properties.loc[drug_id, "molecular_formula"] = c.molecular_formula

            df_drug_properties.loc[drug_id, "canonical_smiles"] = c.canonical_smiles

            df_drug_properties.loc[drug_id, "inchi_string"] = c.inchi

            df_drug_properties.loc[drug_id, "inchi_key"] = c.inchikey
        except:
#             print("Error with drug:", drug_id)
            potential_drug_to_drop.append(drug_id)
            pass
        return df_drug_properties, drugs_to_drop

In [9]:
%%time
# not working....
drug_features2, drugs_to_drop = GetPubchemProperties(drug_features, drugs_with_pubchem_id)
drugs_to_drop

  0%|          | 0/250 [00:00<?, ?it/s]

CPU times: user 22.5 ms, sys: 285 µs, total: 22.8 ms
Wall time: 1.36 s





In [10]:
%%time
drugs_to_drop = []
for drug_id in tqdm(drugs_with_pubchem_id):
    try:
        PubChem_id = int(drugs_with_pubchem_id[drug_id]["PubChem_ID"][0])
        c = Compound.from_cid(PubChem_id)
        
        drug_features.loc[drug_id, "molecular_weight"] = c.molecular_weight
   
        drug_features.loc[drug_id, "elements"] = str(set(c.elements)).strip("{").strip("}")
        
        bonds = [int(str(i).split(",")[-1].strip(")")) for i in c.bonds]
        drug_features.loc[drug_id, "2bonds"] = bonds.count(2)
        drug_features.loc[drug_id, "3bonds"] = bonds.count(3)

        drug_features.loc[drug_id, "xlogp"] = c.xlogp
        drug_features.loc[drug_id, "formal_charge"] = c.charge
    
        drug_features.loc[drug_id, "surface_area"] = c.tpsa

        drug_features.loc[drug_id, "complexity"] = c.complexity

        drug_features.loc[drug_id, "h_bond_donor_count"] = c.h_bond_donor_count

        drug_features.loc[drug_id, "h_bond_acceptor_count"] = c.h_bond_acceptor_count

        drug_features.loc[drug_id, "rotatable_bond_count"] = c.rotatable_bond_count

        drug_features.loc[drug_id, "heavy_atom_count"] = c.heavy_atom_count

        drug_features.loc[drug_id, "atom_stereo_count"] = c.atom_stereo_count

        drug_features.loc[drug_id, "defined_atom_stereo_count"] = c.defined_atom_stereo_count

        drug_features.loc[drug_id, "undefined_atom_stereo_count"] = c.undefined_atom_stereo_count

        drug_features.loc[drug_id, "bond_stereo_count"] = c.bond_stereo_count

        drug_features.loc[drug_id, "covalent_unit_count"] = c.covalent_unit_count
        drug_features.loc[drug_id, "molecular_formula"] = c.molecular_formula

        drug_features.loc[drug_id, "canonical_smiles"] = c.canonical_smiles

        drug_features.loc[drug_id, "inchi_string"] = c.inchi

        drug_features.loc[drug_id, "inchi_key"] = c.inchikey
    except:
        print("Error with drug:", drug_id)
        drugs_to_drop.append(drug_id)
#         drug_features.drop(drug_id, inplace = True)
        pass

 10%|▉         | 24/250 [01:17<39:42, 10.54s/it]

Error with drug: 60


 14%|█▍        | 36/250 [02:51<41:11, 11.55s/it]

Error with drug: 104


100%|██████████| 250/250 [12:07<00:00,  1.53s/it]

CPU times: user 5.44 s, sys: 272 ms, total: 5.71 s
Wall time: 12min 7s





In [30]:
for drug in drugs_to_drop:
    print(drug, all_drugs_names[drug])
drug_features.loc[drugs_to_drop, :]

60 BI-2536
104 Bortezomib


Unnamed: 0_level_0,Drug_Name,Synonyms,Target,Target_Pathway,molecular_weight,elements,2bonds,3bonds,xlogp,formal_charge,...,heavy_atom_count,atom_stereo_count,defined_atom_stereo_count,undefined_atom_stereo_count,bond_stereo_count,covalent_unit_count,molecular_formula,canonical_smiles,inchi_string,inchi_key
DRUG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60,BI-2536,-,"PLK1, PLK2, PLK3",Cell cycle,521.7,"'H', 'O', 'N', 'C'",8.0,0.0,3.7,0.0,...,38.0,1.0,1.0,0.0,0.0,1.0,C28H39N7O3,CCC1C(=O)N(C2=CN=C(N=C2N1C3CCCC3)NC4=C(C=C(C=C...,InChI=1S/C28H39N7O3/c1-5-22-27(37)34(3)23-17-2...,XQVVPGYIWAGRNI-JOCHJYFZSA-N
104,Bortezomib,"PS-341, LDP-341, Velcade",Proteasome,Protein stability and degradation,384.2,"'N', 'H', 'B', 'C', 'O'",8.0,0.0,,0.0,...,28.0,2.0,2.0,0.0,0.0,1.0,C19H25BN4O4,B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN...,InChI=1S/C19H25BN4O4/c1-13(2)10-17(20(27)28)24...,GXJABQQUPOEUTA-RDJZCZTQSA-N


In [31]:
potential_drug_to_drop = []
for drug_id in drug_features[drug_features["molecular_weight"].isnull()].index:
    try:
        PubChem_id= drugs_with_pubchem_id[drug_id]["PubChem_ID"]
        print(all_drugs_names[drug_id], PubChem_id)
        c = Compound.from_cid(PubChem_id)
        drug_features.loc[drug_id, "molecular_weight"] = c.molecular_weight
   
        drug_features.loc[drug_id, "elements"] = str(set(c.elements)).strip("{").strip("}")
        
        bonds = [int(str(i).split(",")[-1].strip(")")) for i in c.bonds]
        drug_features.loc[drug_id, "2bonds"] = bonds.count(2)
        drug_features.loc[drug_id, "3bonds"] = bonds.count(3)

        drug_features.loc[drug_id, "xlogp"] = c.xlogp
        drug_features.loc[drug_id, "formal_charge"] = c.charge
    
        drug_features.loc[drug_id, "surface_area"] = c.tpsa

        drug_features.loc[drug_id, "complexity"] = c.complexity

        drug_features.loc[drug_id, "h_bond_donor_count"] = c.h_bond_donor_count

        drug_features.loc[drug_id, "h_bond_acceptor_count"] = c.h_bond_acceptor_count

        drug_features.loc[drug_id, "rotatable_bond_count"] = c.rotatable_bond_count

        drug_features.loc[drug_id, "heavy_atom_count"] = c.heavy_atom_count

        drug_features.loc[drug_id, "atom_stereo_count"] = c.atom_stereo_count

        drug_features.loc[drug_id, "defined_atom_stereo_count"] = c.defined_atom_stereo_count

        drug_features.loc[drug_id, "undefined_atom_stereo_count"] = c.undefined_atom_stereo_count

        drug_features.loc[drug_id, "bond_stereo_count"] = c.bond_stereo_count

        drug_features.loc[drug_id, "covalent_unit_count"] = c.covalent_unit_count
        drug_features.loc[drug_id, "molecular_formula"] = c.molecular_formula

        drug_features.loc[drug_id, "canonical_smiles"] = c.canonical_smiles

        drug_features.loc[drug_id, "inchi_string"] = c.inchi

        drug_features.loc[drug_id, "inchi_key"] = c.inchikey
    except:
#         print("Error with drug:", drug_id )
        potential_drug_to_drop.append(drug_id)
        pass

In [33]:
drug_to_drop = [drug for drug in potential_drug_to_drop if drug not in drugs_with_pubchem_id]
len(potential_drug_to_drop), len(drug_to_drop)

(15, 15)

In [48]:
drug_features.drop(drug_to_drop, axis=0).to_csv(_FOLDER_2+ "drug_features_raw.csv")

## End of Part 2: Read prepared data

In [97]:
drug_features = pd.read_csv(_FOLDER_2+ "drug_features_raw.csv").set_index("DRUG_ID")

In [98]:
# pubchem_id is none

drug_features[drug_features["molecular_weight"].isnull()].shape[0]

0

In [100]:
drug_features.columns

Index(['Drug_Name', 'Synonyms', 'Target', 'Target_Pathway', 'molecular_weight',
       'elements', '2bonds', '3bonds', 'xlogp', 'formal_charge',
       'surface_area', 'complexity', 'h_bond_donor_count',
       'h_bond_acceptor_count', 'rotatable_bond_count', 'heavy_atom_count',
       'atom_stereo_count', 'defined_atom_stereo_count',
       'undefined_atom_stereo_count', 'bond_stereo_count',
       'covalent_unit_count', 'molecular_formula', 'canonical_smiles',
       'inchi_string', 'inchi_key'],
      dtype='object')

In [102]:
int_columns = ['2bonds', '3bonds', 'h_bond_donor_count',
       'h_bond_acceptor_count', 'rotatable_bond_count', 'heavy_atom_count',
       'atom_stereo_count', 'defined_atom_stereo_count',
       'undefined_atom_stereo_count', 'bond_stereo_count',
       'covalent_unit_count']
for col in int_columns:
    drug_features[col] = np.int16(drug_features[col])

## Part 3: Preprocessing Text PubChem characteristics

### Presence of some elements (11 elements)

In [103]:
%%time

all_elements = list(set(drug_features["elements"].str.split(",", expand=True).fillna(0).values.flatten())- set([0," 'C'", "'C'", " 'H'"]))
all_elements

elements_in_drugs= list(set([atom.strip(" ").strip("'") for atom in all_elements]))
exceptions =[]
for drug_index in drug_features.index:
    compound_elements = drug_features.loc[drug_index, "elements"]
    print(compound_elements)
    try:
        for i, atom in list(enumerate(elements_in_drugs)):
            if atom in compound_elements:
                drug_features.loc[drug_index, atom] = 1
                print(atom, "Yes")
            else:
                drug_features.loc[drug_index, atom] = 0
                print(atom, "No")
    except:
        exceptions.append(drug_index)
        drug_features.loc[drug_index, atom] = 0
        
for col in ['B', 'I', 'Br', 'Cl', 'O', 'N', 'F', 'P', 'S', 'Pt']:
    drug_features[col]= np.int16(drug_features[col])
        
print("Exceptions:", drug_features.loc[exceptions, :].shape[0])
print("Elements in drugs:", len(elements_in_drugs), elements_in_drugs)

'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'N', 'H', 'C', 'O', 'F'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F Yes
'N', 'H', 'S', 'Cl', 'C', 'O'
N Yes
P No
H Yes
B No
S Yes
Cl Yes
Pt No
I No
Br No
O Yes
F No
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'N', 'H', 'Cl', 'C', 'O', 'F'
N Yes
P No
H Yes
B No
S No
Cl Yes
Pt No
I No
Br No
O Yes
F Yes
'N', 'H', 'S', 'C', 'O'
N Yes
P No
H Yes
B No
S Yes
Cl No
Pt No
I No
Br No
O Yes
F No
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'N', 'H', 'S', 'Cl', 'C', 'O'
N Yes
P No
H Yes
B No
S Yes
Cl Yes
Pt No
I No
Br No


B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'N', 'H', 'C', 'O', 'F'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F Yes
'N', 'H', 'C', 'O', 'F'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F Yes
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'N', 'H', 'C', 'O', 'F'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F Yes
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'N', 'H', 'C', 'O', 'F'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F Yes
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'N', 'H', 'S', 'C', 'O'
N Yes
P No
H Yes
B No
S Yes
Cl No
Pt No
I No
Br No
O Yes
F No
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'N', 'H', 'C', 'O', 'F'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F Yes
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'H', 'S', 'N', 'C'
N Yes
P No
H Yes
B No

O Yes
F No
'N', 'H', 'Cl', 'C', 'O'
N Yes
P No
H Yes
B No
S No
Cl Yes
Pt No
I No
Br No
O Yes
F No
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'N', 'H', 'S', 'C', 'O'
N Yes
P No
H Yes
B No
S Yes
Cl No
Pt No
I No
Br No
O Yes
F No
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'N', 'H', 'S', 'C', 'O'
N Yes
P No
H Yes
B No
S Yes
Cl No
Pt No
I No
Br No
O Yes
F No
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'N', 'I', 'H', 'C', 'O', 'F'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I Yes
Br No
O Yes
F Yes
'H', 'O', 'N', 'C'
N Yes
P No
H Yes
B No
S No
Cl No
Pt No
I No
Br No
O Yes
F No
'N', 'H', 'Cl', 'C', 'Br', 'O', 'F'
N Yes
P No
H Yes
B Yes
S No
Cl Yes
Pt No
I

In [104]:
drug_features["Br"].value_counts()

0    243
1      7
Name: Br, dtype: int64

In [105]:
drug_features.to_csv(_FOLDER_2 + "drug_features_with_pubchem_properties.csv")

PubChem_features = ["molecular_weight","2bonds", "3bonds", "xlogp", "formal_charge", 
    "surface_area", "complexity", "h_bond_donor_count", 
    "h_bond_acceptor_count", "rotatable_bond_count",
    "heavy_atom_count", "atom_stereo_count", "defined_atom_stereo_count",
    "undefined_atom_stereo_count", "bond_stereo_count", "covalent_unit_count",
    'B', 'I', 'Br', 'Cl', 'O', 'N', 'F', 'P', 'S', 'Pt']

with open(_FOLDER_2 + "X_PubChem_properties.txt", 'w') as f:
    for s in PubChem_features:
        f.write(str(s) + '\n')

print("Number of PubChem features:", len(PubChem_features))

Number of PubChem features: 26


## End of Part 3: Read Prepared Data

In [106]:
drug_features = pd.read_csv(_FOLDER_2+ "drug_features_with_pubchem_properties.csv").set_index("DRUG_ID")

with open(_FOLDER_2 + "X_PubChem_properties.txt", 'r') as f:
    X_PubChem_properties = [line.rstrip('\n') for line in f]

## Part 4: Preprocessing Drugs description from original data

In this section, we are going to have some dumnies columns for Target and Target_Pathway

Converting of Target Pathway resulted in 26 new columns

It is also worth considering elements columns and that deleting columns with C and H which are present in all the compounds

### Dumnies for Target (229) and Target_Pathway (23)

In [107]:
drug_features.head(3)

Unnamed: 0_level_0,Drug_Name,Synonyms,Target,Target_Pathway,molecular_weight,elements,2bonds,3bonds,xlogp,formal_charge,...,P,H,B,S,Cl,Pt,I,Br,O,F
DRUG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Erlotinib,"Tarceva, RG-1415, CP-358774, OSI-774, Ro-50823...",EGFR,EGFR signaling,393.4,"'H', 'O', 'N', 'C'",8,1,3.3,0.0,...,0,1.0,0,0,0,0,0,0,1,0
3,Rapamycin,"AY-22989, Sirolimus, WY-090217, Torisel, Rapamune",MTORC1,PI3K/MTOR signaling,914.2,"'H', 'O', 'N', 'C'",9,0,6.0,0.0,...,0,1.0,0,0,0,0,0,0,1,0
5,Sunitinib,"Sutent, Sunitinib Malate, SU-11248","PDGFR, KIT, VEGFR, FLT3, RET, CSF1R",RTK signaling,398.5,"'N', 'H', 'C', 'O', 'F'",8,0,2.6,0.0,...,0,1.0,0,0,0,0,0,0,1,1


In [108]:
targets = ""
for x in drug_features["Target"].values:
    targets = targets + ", " + x
targets = list(set(targets.split(", ")[1:]))
print("Number of targets:", len(targets))

df_target = pd.DataFrame(data = np.int32(np.zeros([drug_features.shape[0], len(targets)])), 
                         index = drug_features.index, 
                         columns = targets)

Number of targets: 212


In [109]:
for index in drug_features.index:
    targets_i = drug_features.loc[index, "Target"].split(", ")
    df_target.loc[index, targets_i]=1
df_target.shape

(250, 212)

In [110]:
print("Number of unique pathways:", drug_features["Target_Pathway"].nunique())

df_target_target_pathway = pd.concat([df_target, pd.get_dummies(drug_features["Target_Pathway"])], axis=1)
df_target_target_pathway.shape

Number of unique pathways: 23


(250, 235)

In [111]:
df_final = pd.concat([drug_features.drop(["Target_Pathway"], axis=1), df_target_target_pathway], axis=1)
df_final.shape

(250, 270)

In [112]:
df_final.to_csv(_FOLDER_2 + "drug_features_with_pubchem_properties_final.csv")

with open(_FOLDER_2 + "X_features_Targets.txt", 'w') as f:
    for s in targets:
        f.write(str(s) + '\n')
        
with open(_FOLDER_2 + "X_features_Target_Pathway.txt", 'w') as f:
    for s in drug_features["Target_Pathway"].unique():
        f.write(str(s) + '\n')   

## End of Part 4: Read Prepared Data

In [120]:
drug_features = pd.read_csv(_FOLDER_2 + "drug_features_with_pubchem_properties_final.csv")

with open(_FOLDER_2 + "X_features_Targets.txt", 'r') as f:
    X_features_Targets = [line.rstrip('\n') for line in f]
    
with open(_FOLDER_2 + "X_features_Target_Pathway.txt", 'r') as f:
    X_features_Target_Pathway = [line.rstrip('\n') for line in f]

In [121]:
drug_features.head()

Unnamed: 0,DRUG_ID,Drug_Name,Synonyms,Target,molecular_weight,elements,2bonds,3bonds,xlogp,formal_charge,...,JNK and p38 signaling,Metabolism,Mitosis,Other,"Other, kinases",PI3K/MTOR signaling,Protein stability and degradation,RTK signaling,WNT signaling,p53 pathway
0,1,Erlotinib,"Tarceva, RG-1415, CP-358774, OSI-774, Ro-50823...",EGFR,393.4,"'H', 'O', 'N', 'C'",8,1,3.3,0.0,...,0,0,0,0,0,0,0,0,0,0
1,3,Rapamycin,"AY-22989, Sirolimus, WY-090217, Torisel, Rapamune",MTORC1,914.2,"'H', 'O', 'N', 'C'",9,0,6.0,0.0,...,0,0,0,0,0,1,0,0,0,0
2,5,Sunitinib,"Sutent, Sunitinib Malate, SU-11248","PDGFR, KIT, VEGFR, FLT3, RET, CSF1R",398.5,"'N', 'H', 'C', 'O', 'F'",8,0,2.6,0.0,...,0,0,0,0,0,0,0,1,0,0
3,6,PHA-665752,"PHA665752, PHA 665752",MET,641.6,"'N', 'H', 'S', 'Cl', 'C', 'O'",13,0,5.0,0.0,...,0,0,0,0,0,0,0,1,0,0
4,9,MG-132,"LLL cpd, MG 132, MG132","Proteasome, CAPN1",475.6,"'H', 'O', 'N', 'C'",7,0,4.8,0.0,...,0,0,0,0,0,0,1,0,0,0


In [122]:
with open(_FOLDER_2+"X_features_cancer_cell_lines.txt", 'r') as f:
    X_cancer_cell_lines = [line.rstrip('\n') for line in f]

In [123]:
print("Final Features: \n")
print("Cell lines (CCL) features:", len(X_cancer_cell_lines))
print("PubChem drug features:", len(PubChem_features))
print("Drug description features - Targets: %d, Target_Pathway: %d" % (len(X_features_Targets), len(X_features_Target_Pathway)))

Final Features: 

Cell lines (CCL) features: 1073
PubChem drug features: 26
Drug description features - Targets: 212, Target_Pathway: 23


In [124]:
all_elements

["'H'",
 " 'Cl'",
 " 'N'",
 " 'B'",
 " 'I'",
 "'N'",
 " 'P'",
 " 'O'",
 " 'Pt'",
 " 'Br'",
 "'O'",
 " 'F'",
 " 'S'"]