## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import chardet

## Load Data

#### Load Matador file

In [8]:
raw_df = pd.read_table('Input/matador.tsv')

In [9]:
raw_df.head()

Unnamed: 0,chemical_id,chemical_name,atc,protein_id,protein_name,mesh_id,uniprot_id,protein_score,protein_annotation,mesh_score,mesh_annotation,matador_score,matador_annotation
0,11954269,everolimus,L04AA18,9606.ENSP00000354587,FRAP1,,Q9Y4I3 Q96QW8_HUMAN Q96QG3 Q6LE87 Q5TER3_HUMAN...,950,DIRECT,0,,950,DIRECT
1,11954225,gold sodium thiomalate,M01CB01,9606.ENSP00000255040,APCS,D000209,P02743,0,,207,INDIRECT,207,INDIRECT
2,11954225,gold sodium thiomalate,M01CB01,9606.ENSP00000273550,FTH1,D000209,Q3SWW1 P02794 Q6NZ44_HUMAN,0,,207,INDIRECT,207,INDIRECT
3,11954225,gold sodium thiomalate,M01CB01,9606.ENSP00000336829,FGG,D000209,P04470 P04469 P02679 Q9UC63_HUMAN Q9UC62_HUMAN...,0,,207,INDIRECT,207,INDIRECT
4,11954225,gold sodium thiomalate,M01CB01,9606.ENSP00000348068,SERPINA1,D000209,P01009 Q9UCM3_HUMAN Q9UCE6_HUMAN Q9P1P0 Q96ES1...,0,,207,INDIRECT,207,INDIRECT


#### Load Targets Mapping file

In [4]:
target_update = pd.read_table('Input/mappingFile_2017.txt', names = ['Old Targets','Updated Targets'], engine='python')

In [5]:
target_update.head()

Unnamed: 0,Old Targets,Updated Targets
0,A1BG,A1BG
1,A1BG-AS1,A1BG-AS1
2,NCRNA00181,A1BG-AS1
3,A1BGAS,A1BG-AS1
4,A1BG-AS,A1BG-AS1


#### Load LINCS Small Molecules

In [6]:
lincs = pd.read_csv('Input/LINCS_SmallMolecules.csv',encoding='ISO-8859-1')

## Make DF of PCID and protein ID

In [10]:
df = raw_df[['chemical_id','protein_id']]

In [11]:
df = df.rename(index=str, columns = {'chemical_id':'PCID'})

In [12]:
df.head()

Unnamed: 0,PCID,protein_id
0,11954269,9606.ENSP00000354587
1,11954225,9606.ENSP00000255040
2,11954225,9606.ENSP00000273550
3,11954225,9606.ENSP00000336829
4,11954225,9606.ENSP00000348068


## Get Gene Name from Stitch API and make dictionary

In [13]:
prot_ID_df = df['protein_id']
prot_ID_df = prot_ID_df.drop_duplicates()
prot_ID_df.shape

(2901,)

In [14]:
prot_ID_list = prot_ID_df.tolist()
failed_to_get_name = 0
gene_name_dict = {}

for prot_ID in prot_ID_list:
    url = 'http://stitch.embl.de/api/json/resolve?identifier=' + prot_ID
    response = requests.get(url)
    try:
        response.json()
    except ValueError:
        continue
    if 'preferredName' in response.json()[0].keys():
        gene_name = response.json()[0]['preferredName']
        gene_name_dict[prot_ID] = gene_name
    else:
        failed_to_get_name +=1
    
    if len(gene_name_dict) % 100 == 0:
        print(prot_ID_list.index(prot_ID))
    

print(failed_to_get_name)
print(len(gene_name_dict))
print(len(prot_ID_list))

167
329
545
730
954
1154
1320
1510
1700
1925
2127
2319
2524
2707
0
1486
2901


In [16]:
gene_names = []
for index, row in df.iterrows():
    prot_ID = row.loc['protein_id']
    if prot_ID not in gene_name_dict:
        df.drop(index, inplace = True)
    else: 
        gene_names.append((gene_name_dict[prot_ID]))

df.loc[:,'Old Targets'] = pd.Series(np.array(gene_names), index=df.index)

## Update Target Names

In [17]:
df.set_index('Old Targets', inplace = True)
target_update.set_index('Old Targets',inplace = True)
df = pd.merge(df, target_update, how= 'left', on = 'Old Targets')
df.shape

(8455, 3)

In [18]:
df.reset_index(inplace=True)
df = df.rename(index=str, columns = {'Updated Targets':'Targets'})
df = df.dropna(subset=['Targets']) 
df.shape

(8402, 4)

## Filter by LINCS Approved Small Molecules

In [19]:
count = 0
for index, row in df.iterrows():
    CID = row.loc['PCID']
    if CID not in lincs['SM_PubChem_CID'].values:
        df.drop(index, inplace = True)
        count += 1
        
print(count)

2956


## Make Binary Matrix

In [20]:
grouped_df = df.groupby(['Targets'])['PCID'].apply(lambda x: ','.join(x.astype(str))).reset_index()

In [21]:
grouped_df.set_index('Targets', inplace=True)
grouped_df = grouped_df.sort_index()
len(df['Targets'].unique())

1244

In [22]:
grouped_matrix = grouped_df.iloc[:,0].str.get_dummies(sep=',')

In [23]:
grouped_matrix.head()

Unnamed: 0_level_0,104741,10783,110635,119607,1234,123631,124087,12555,128919,1302,...,68844,71158,71329,7510,7741,77999,91270,92400,9801,9904
Targets,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
AADAC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AADAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AANAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
grouped_matrix.shape

(1244, 402)

## Save Targets Binary Matrix

In [25]:
filename = 'Output/Matador_Targets_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
grouped_matrix.to_csv(filename, sep='\t', compression='gzip')

## Convert Binary Matrix to GMT and Save as GMT

In [26]:
drugset_library = []
for index, row in grouped_matrix.iterrows():
    drugset = []
    drugset.append(index)
    drugset.append('')
    for i in range(grouped_matrix.shape[1]):
        if row.iloc[i]==1:
            drugset.append(row.index[i])
    if len(drugset) >= 7:
        drugset_library.append(drugset)
dsl = np.array(drugset_library)

In [27]:
len(max(dsl, key=len))

59

In [28]:
filename = 'Output/Matador_Targets_DrugSetLibrary_%s.gmt'% str(datetime.date.today())[0:7].replace('-', '_')
with open(filename,'w',encoding='utf-8') as f:
    for row in dsl:
        np.savetxt(f, [row], fmt = '%s',delimiter = '\t')