## Import Libraries

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import chardet

## Load Data

#### Load DrugBank Data

In [None]:
# DrugBank drug-target records. Later cells rely on the 'Name', 'Type' and
# 'UniProt ID' columns of this table.
df = pd.read_csv('Input/drugbank_targets.csv')

In [None]:
# Preview the first rows of the DrugBank table.
df.head()

In [None]:
# (n_rows, n_columns) of the freshly loaded DrugBank table.
df.shape

In [None]:
# Distinct values found in the 'Type' column.
df['Type'].unique()

In [None]:
# Number of distinct drug names (a missing name, if any, counts as one value).
df['Name'].nunique(dropna=False)

#### Load UniProt ID table

In [None]:
# Tab-delimited lookup table (pd.read_table defaults to sep='\t'). Its
# 'UniProt ID' column is the join key used further down.
# NOTE(review): the 'Approved Symbol' column renamed later presumably comes
# from this file -- confirm against the input.
uniprot = pd.read_table('Input/gene_to_uniprot.txt')

In [None]:
# Preview the first rows of the UniProt lookup table.
uniprot.head()

In [None]:
# (n_rows, n_columns) of the UniProt lookup table at this point.
uniprot.shape

In [None]:
# Discard lookup rows that have no UniProt ID; they cannot take part in the join.
uniprot = uniprot.dropna(subset=['UniProt ID'])

#### Load Targets Mapping File

In [None]:
# Two-column mapping from an old target symbol to its updated symbol.
# Column labels come from `names=`, so the first line of the file is read
# as data rather than as a header.
# NOTE(review): the reason for engine='python' is not visible here.
target_update = pd.read_table('Input/mappingFile_2017.txt', names = ['Old Targets','Updated Targets'], engine='python')

In [None]:
# Preview the first rows of the target-symbol mapping.
target_update.head()

In [None]:
# Key the mapping by the old symbol so it can later be joined on 'Old Targets'.
target_update = target_update.set_index('Old Targets')

#### Load LINCS Small Molecules

In [None]:
# LINCS small-molecule table, decoded as ISO-8859-1 (Latin-1) instead of the
# default UTF-8. Only its 'SM_PubChem_CID' column is used later on.
lincs = pd.read_csv('Input/LINCS_SmallMolecules.csv',encoding='ISO-8859-1')

In [None]:
# Preview the first rows of the LINCS small-molecule table.
lincs.head()

## Map UniProt ID to Gene

In [None]:
# Index both tables by UniProt ID ahead of the join.
df = df.set_index('UniProt ID')
uniprot = uniprot.set_index('UniProt ID')

In [None]:
# Preview the lookup table now that 'UniProt ID' is its index.
uniprot.head()

In [None]:
# Left join: every DrugBank row is kept, and rows whose UniProt ID has no
# match in the lookup table get missing values in the added columns.
df = df.merge(uniprot, how='left', on='UniProt ID')

In [None]:
# Rename 'Approved Symbol' to 'Old Targets', the key used by the target-update
# mapping. `index=str` additionally converts every index label to a string.
df = df.rename(index=str, columns = {'Approved Symbol':'Old Targets'})

In [None]:
# Turn the current index back into an ordinary column.
df = df.reset_index()

In [None]:
# Preview the first four rows after the merge and index reset.
df.head(4)

In [None]:
# Keep only the rows that received a gene symbol from the UniProt lookup.
df = df[df['Old Targets'].notna()]

## Update Target Names

In [None]:
# Index by the old symbol so the frame can be joined with the target-update mapping.
df = df.set_index('Old Targets')

In [None]:
# Preview the table now that 'Old Targets' is the index.
df.head()

In [None]:
# Left join on the old symbol: rows without an entry in the mapping are kept
# and get a missing 'Updated Targets' value.
df = df.merge(target_update, how='left', on='Old Targets')

In [None]:
# Turn the current index back into an ordinary column.
df = df.reset_index()

In [None]:
# 'Updated Targets' becomes the canonical 'Targets' column used from here on.
# `index=str` additionally converts every index label to a string.
df = df.rename(index=str, columns = {'Updated Targets':'Targets'})

In [None]:
# Keep only the rows whose old symbol had an updated target in the mapping.
df = df[df['Targets'].notna()]

In [None]:
# (n_rows, n_columns) after rows without an updated target were removed.
df.shape

In [None]:
# Display only: sort_index() returns a sorted copy that is not assigned,
# so `df` itself keeps its current row order.
df.sort_index()

## Get PubChemID and Map to drug name

In [None]:
# One entry per distinct drug name, keeping the first occurrence of each.
namesdf = df['Name'].drop_duplicates()

In [None]:
# Resolve every distinct drug name to a PubChem compound ID (CID) through the
# PUG REST name lookup. Results:
#   CID_dict           -- drug name (exactly as it appears in df['Name']) -> first CID returned
#   failed_to_get_CID  -- number of names for which no CID was obtained
from urllib.parse import quote

PUBCHEM_NAME_URL = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/%s/cids/JSON'

nameslist = namesdf.tolist()
failed_to_get_CID = 0
CID_dict = {}

# One session for all requests so the HTTPS connection is reused.
with requests.Session() as session:
    for position, name in enumerate(nameslist):
        # Progress marker: printed once per 100 names processed.
        if position % 100 == 0:
            print(position)

        # Percent-encode the whole name (not only spaces) so characters such
        # as '/', '#', '?' or '%' cannot change the structure of the URL.
        url = PUBCHEM_NAME_URL % quote(name, safe='')

        # Without a timeout a single stalled connection would block forever.
        response = session.get(url, timeout=30)

        # Parse the body once; a non-JSON body is counted as a failed lookup
        # instead of aborting the whole run.
        try:
            payload = response.json()
        except ValueError:
            payload = {}

        cids = payload.get('IdentifierList', {}).get('CID', [])
        if cids:
            CID_dict[name] = cids[0]
        else:
            failed_to_get_CID += 1

print(failed_to_get_CID)
print(len(CID_dict))
print(len(nameslist))


In [None]:
# Keep only the rows whose drug name was resolved to a PubChem CID and attach
# that CID as an integer 'CIDs' column.
# This is done with one boolean mask instead of dropping rows from `df` while
# iterating over it: the row-by-row drop is quadratic and would remove every
# row sharing an index label if labels were ever duplicated, leaving the
# collected CIDs misaligned with the surviving rows.
has_cid = df['Name'].isin(list(CID_dict))
df = df[has_cid].copy()

CIDs = [int(CID_dict[drugname]) for drugname in df['Name']]
df['CIDs'] = np.array(CIDs, dtype=int)

In [None]:
# Preview the table with the new 'CIDs' column.
df.head()

In [None]:
# (n_rows, n_columns) after removing drugs without a PubChem CID.
df.shape

## Filter by LINCS approved Small Molecules

In [None]:
# Keep only the rows whose CID appears in the LINCS small-molecule table;
# `count` is the number of rows removed.
# A single membership mask replaces the per-row scan of the LINCS column and
# avoids dropping rows from `df` while iterating over it.
in_lincs = df['CIDs'].isin(lincs['SM_PubChem_CID'])
count = int((~in_lincs).sum())
df = df[in_lincs].copy()

print(count)


In [None]:
# Preview the rows that remain after the LINCS filter.
df.head()

In [None]:
# (n_rows, n_columns) after the LINCS filter.
df.shape

## Make Binary Matrix

In [None]:
def _cids_to_csv(cid_series):
    """Join the CIDs of one target into a single comma-separated string."""
    return ','.join(cid_series.astype(str))


# One row per target, holding all of that target's CIDs as one string.
cids_by_target = df.groupby(['Targets'])['CIDs']
grouped_df = cids_by_target.apply(_cids_to_csv).reset_index()

In [None]:
# Index by target symbol and order the rows alphabetically by that index.
grouped_df = grouped_df.set_index('Targets').sort_index()

In [None]:
# Number of distinct targets (a missing value, if any, counts as one).
df['Targets'].nunique(dropna=False)

In [None]:
# Expand each comma-separated CID string into 0/1 indicator columns:
# rows are targets, columns are CIDs (as strings).
cid_strings = grouped_df['CIDs']
grouped_matrix = cid_strings.str.get_dummies(sep=',')

In [None]:
# Preview the first rows of the target-by-CID indicator matrix.
grouped_matrix.head()

In [None]:
# (n_targets, n_CIDs) of the indicator matrix.
grouped_matrix.shape

## Save Binary Matrix

In [None]:
# grouped_matrix.to_csv('Output/DrugBank_Targets.csv')

In [None]:
# Write the target-by-CID matrix as a tab-separated file, stamped YYYY_MM.
# The file name ends in '.zip', so the content is written as a ZIP archive;
# writing gzip data under a '.zip' name produces a file that ZIP tools (and
# pandas' extension-based compression inference) cannot open.
# NOTE(review): any consumer that reads this file with an explicit
# compression='gzip' must be switched to 'zip' (or to the default 'infer').
stamp = datetime.date.today().strftime('%Y_%m')
filename = 'Output/DrugBank_Targets_%s.tsv.zip' % stamp
grouped_matrix.to_csv(filename, sep='\t', compression='zip')

## Convert Binary Matrix to GMT

In [None]:
# Build one GMT row per target: [target, '', drug, drug, ...].
# The second field is the GMT description column, left empty here.
# Only targets with at least MIN_DRUGS_PER_SET drugs are kept; this is the
# same cutoff as the former `len(drugset) >= 7` (two leading fields + five drugs).
MIN_DRUGS_PER_SET = 5

drugset_library = []
for target, membership in grouped_matrix.iterrows():
    # Column labels (CIDs) whose indicator is 1 for this target.
    drugs = grouped_matrix.columns[membership.values == 1].tolist()
    if len(drugs) >= MIN_DRUGS_PER_SET:
        drugset_library.append([target, ''] + drugs)

# Rows have different lengths; dtype=object is required because recent NumPy
# releases raise ValueError when asked to build an array from ragged lists
# without it.
dsl = np.array(drugset_library, dtype=object)

In [None]:
# Length of the longest row in the drug-set library.
max(len(drugset) for drugset in dsl)

In [None]:
# Write the drug-set library as a GMT file: one tab-separated line per row,
# stamped with the current year and month (YYYY_MM).
stamp = datetime.date.today().strftime('%Y_%m')
filename = 'Output/DrugBank_Targets_DrugSetLibrary_%s.gmt' % stamp
with open(filename, 'w', encoding='utf-8') as f:
    for drugset in dsl:
        f.write('\t'.join(str(field) for field in drugset) + '\n')