## Import libraries

In [146]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import chardet

## Load Data

#### Load TTD Drug to interactions file

In [206]:
df = pd.read_table('Input/TTDDRUGID_Indication.txt')

In [207]:
df.head()

Unnamed: 0,TTDDRUGID,LNM,Indication,ICD9,ICD10
0,DAP000001,Quetiapine,Schizophrenia,"295, 710.0","F20, M32"
1,DAP000002,Theophylline,Chronic obstructive pulmonary disease,"490-492, 494-496","J40-J44, J47"
2,DAP000003,Risperidone,Schizophrenia,"295, 710.0","F20, M32"
3,DAP000004,Dasatinib,Chronic myelogenous leukemia,"205.1, 208.9","C91-C95, C92.1"
4,DAP000004,Dasatinib,Solid tumours; Multiple myeloma,"140-199, 203.0, 210-229","C00-C75, C7A, C7B, C90.0, D10-D36, D3A"


In [208]:
df.shape

(20180, 5)

#### Load Meddra Indictations

In [209]:
meddra_indict = pd.read_table('Input/meddra_all_indications.tsv', names = ['STITCH ID','UMLS Concept ID Label','Method Detection','Concept Name','Concept Type','UMLS Concept ID MedDRA', 'MedDRA Concept Name'])

In [210]:
meddra_indict.head()

Unnamed: 0,STITCH ID,UMLS Concept ID Label,Method Detection,Concept Name,Concept Type,UMLS Concept ID MedDRA,MedDRA Concept Name
0,CID100000085,C0015544,text_mention,Failure to Thrive,LLT,C0015544,Failure to thrive
1,CID100000085,C0015544,text_mention,Failure to Thrive,PT,C0015544,Failure to thrive
2,CID100000085,C0020615,text_mention,Hypoglycemia,LLT,C0020615,Hypoglycaemia
3,CID100000085,C0020615,text_mention,Hypoglycemia,PT,C0020615,Hypoglycaemia
4,CID100000085,C0022661,NLP_indication,"Kidney Failure, Chronic",LLT,C0022661,Renal failure chronic


#### Load LINCS Small Molecules

In [211]:
lincs = pd.read_csv('Input/LINCS_SmallMolecules.csv', encoding = 'ISO-8859-1')

## Split up Indication column in DF

In [212]:
how_many = 0
appended_df = []
indict_index = np.where(df.columns.values=='Indication')[0][0]

for index, row in df.iterrows():
    indict_group = row.loc['Indication']
    if '; ' in indict_group:
        indict_split = indict_group.split('; ')
        for i in indict_split:
            row_as_list = row.values.tolist()
            row_as_list[indict_index] = i
            appended_df.append(row_as_list)
        df.drop(index, inplace = True)
        how_many += 1

print(len(appended_df))
print(how_many)

1216
538


In [213]:
columnnames = list(df.columns.values)
fix_gene_df = pd.DataFrame(appended_df,columns = columnnames)

In [214]:
fix_gene_df.head()

Unnamed: 0,TTDDRUGID,LNM,Indication,ICD9,ICD10
0,DAP000004,Dasatinib,Solid tumours,"140-199, 203.0, 210-229","C00-C75, C7A, C7B, C90.0, D10-D36, D3A"
1,DAP000004,Dasatinib,Multiple myeloma,"140-199, 203.0, 210-229","C00-C75, C7A, C7B, C90.0, D10-D36, D3A"
2,DAP000006,Sorafenib,Hepatocellular carcinoma,"155, 162, 172","C22.0, C33, C34, C43"
3,DAP000006,Sorafenib,NSCLC,"155, 162, 172","C22.0, C33, C34, C43"
4,DAP000006,Sorafenib,Melanoma,"155, 162, 172","C22.0, C33, C34, C43"


In [215]:
df = df.append(fix_gene_df)

## Map Indications to their CUI from MedDRA table

In [216]:
meddra_indict = meddra_indict[['MedDRA Concept Name', 'UMLS Concept ID MedDRA']]

In [217]:
meddra_indict.head()

Unnamed: 0,MedDRA Concept Name,UMLS Concept ID MedDRA
0,Failure to thrive,C0015544
1,Failure to thrive,C0015544
2,Hypoglycaemia,C0020615
3,Hypoglycaemia,C0020615
4,Renal failure chronic,C0022661


In [218]:
meddra_indict.set_index('MedDRA Concept Name', inplace = True)

In [219]:
meddra_indict=meddra_indict.drop_duplicates()

In [220]:
df['Concept ID'] = None
df.head()

Unnamed: 0,TTDDRUGID,LNM,Indication,ICD9,ICD10,Concept ID
0,DAP000001,Quetiapine,Schizophrenia,"295, 710.0","F20, M32",
1,DAP000002,Theophylline,Chronic obstructive pulmonary disease,"490-492, 494-496","J40-J44, J47",
2,DAP000003,Risperidone,Schizophrenia,"295, 710.0","F20, M32",
3,DAP000004,Dasatinib,Chronic myelogenous leukemia,"205.1, 208.9","C91-C95, C92.1",
5,DAP000005,Sunitinib,Advanced renal cell carcinoma,189,C64,


In [221]:
for index,row in df.iterrows():
    indict = row.loc['Indication']
    if indict in list(meddra_indict.index):
        df["Concept ID"][index] = meddra_indict['UMLS Concept ID MedDRA'][indict]

In [225]:
df.head()

Unnamed: 0,TTDDRUGID,LNM,Indication,ICD9,ICD10,Concept ID
0,DAP000001,Quetiapine,Schizophrenia,"295, 710.0","F20, M32",C0036341
1,DAP000002,Theophylline,Chronic obstructive pulmonary disease,"490-492, 494-496","J40-J44, J47",C0024117
2,DAP000003,Risperidone,Schizophrenia,"295, 710.0","F20, M32",C2239176
10,DAP000007,Dexamethasone,Rheumatold arthritis,714,M05-M06,C0036983
11,DAP000008,Eletriptan,Migraine,346,G43,C0149931


In [223]:
df.shape

(20858, 6)

In [224]:
df = df.dropna(subset=['Concept ID']) 

## Get Pubchem ID and Map to Drug Name

In [194]:
# namesdf = df['LNM']
# namesdf = namesdf.drop_duplicates()
# namesdf.shape

In [195]:
# nameslist = namesdf.tolist()
# failed_to_get_CID = 0
# CID_dict = {}

# for name in nameslist:
#     name = name.replace(' ','%20')
#     url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/' + name + '/cids/JSON'
#     response = requests.get(url)
#     try:
#         response.json()
#     except ValueError:
#         continue
#     if 'IdentifierList' in response.json().keys():
#         CID = response.json()['IdentifierList']['CID'][0]
#         name = name.replace('%20', ' ')
#         CID_dict[name] = CID
#     else:
#         failed_to_get_CID += 1
        
#     if len(CID_dict) % 100 == 0:
#         name = name.replace('%20',' ')
#         print(nameslist.index(name))

# print(failed_to_get_CID)
# print(len(CID_dict))
# print(len(nameslist))

In [226]:
df['CIDs'] = None
for index,row in df.iterrows():
    drugname = row.loc['LNM']
    if drugname in CID_dict:
        df["CIDs"][index] = int(CID_dict[drugname])

In [227]:
df.head()

Unnamed: 0,TTDDRUGID,LNM,Indication,ICD9,ICD10,Concept ID,CIDs
0,DAP000001,Quetiapine,Schizophrenia,"295, 710.0","F20, M32",C0036341,3062316
1,DAP000002,Theophylline,Chronic obstructive pulmonary disease,"490-492, 494-496","J40-J44, J47",C0024117,3062316
2,DAP000003,Risperidone,Schizophrenia,"295, 710.0","F20, M32",C2239176,216239
10,DAP000007,Dexamethasone,Rheumatold arthritis,714,M05-M06,C0036983,439260
11,DAP000008,Eletriptan,Migraine,346,G43,C0149931,439260


In [228]:
df = df.dropna(subset=['CIDs']) 
df = df.dropna(subset=['Concept ID']) 

In [229]:
df.shape

(4376, 7)

In [230]:
df = df[['Concept ID','CIDs']]
df = df.reset_index()
df.head()

Unnamed: 0,index,Concept ID,CIDs
0,0,C0036341,3062316
1,1,C0024117,3062316
2,2,C2239176,216239
3,10,C0036983,439260
4,11,C0149931,439260


## Filter by LINCS Approved Small Molecules

In [231]:
count = 0
for index, row in df.iterrows():
    CID = row.loc['CIDs']
    if CID not in lincs['SM_PubChem_CID'].values:
        df.drop(index, inplace = True)
        count += 1
        
print(count)

2945


## Make Binary Matrix

In [232]:
grouped_df = df.groupby(['Concept ID'])['CIDs'].apply(lambda x: ','.join(x.astype(str))).reset_index()
grouped_df.set_index('Concept ID', inplace=True)
grouped_df = grouped_df.sort_index()
len(df['Concept ID'].unique())

200

In [233]:
grouped_matrix = grouped_df.iloc[:,0].str.get_dummies(sep=',')
grouped_matrix.head()

Unnamed: 0_level_0,10071196,10090485,10117987,10127622,10182969,10201696,10253143,10288191,10295295,10296883,...,9915743,9924495,9930049,9931954,9933475,9935681,9941444,9952884,9953599,9960285
Concept ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001144,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0001206,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0002170,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0002438,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0002622,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [234]:
grouped_matrix.shape

(200, 746)

## Save Binary Matrix

In [235]:
filename = 'Output/TTD_Indications_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
grouped_matrix.to_csv(filename, sep='\t', compression='gzip')

## Convert Binary Matrix to GMT and Save as GMT

In [236]:
drugset_library = []
for index, row in grouped_matrix.iterrows():
    drugset = []
    drugset.append(index)
    drugset.append('')
    for i in range(grouped_matrix.shape[1]):
        if row.iloc[i]==1:
            drugset.append(row.index[i])
    if len(drugset) >= 7:
        drugset_library.append(drugset)
dsl = np.array(drugset_library)
len(max(dsl, key=len))

49

In [237]:
filename = 'Output/TTD_Indications_DrugSetLibrary_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')
with open(filename,'w',encoding='utf-8') as f:
    for row in dsl:
        np.savetxt(f, [row], fmt = '%s',delimiter = '\t')