## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests
import chardet

## Load Data

#### Load TTD to UniProt

In [41]:
# Tab-separated table of TTD targets and their UniProt accessions.
ttd_uni = pd.read_csv('Input/TTD_uniprot_all.txt', sep='\t', engine='python')

In [42]:
# Preview the first five rows of the freshly loaded TTD table.
ttd_uni.head(n=5)

Unnamed: 0,TTD Target ID,Target Name,Target Type,Uniprot ID
0,TTDS00002,Muscarinic acetylcholine receptor M1,Successful target,P11229
1,TTDS00003,Muscarinic acetylcholine receptor M2,Successful target,P08172
2,TTDS00004,Muscarinic acetylcholine receptor M3,Successful target,P20309
3,TTDS00005,Muscarinic acetylcholine receptor M4,Successful target,P08173
4,TTDS00006,Muscarinic acetylcholine receptor M5,Successful target,P08912


#### Load UniProt to Gene

In [43]:
# Tab-separated lookup of approved gene symbols and their UniProt accessions.
uniprot = pd.read_csv('Input/gene_to_uniprot.txt', sep='\t')

In [44]:
# Preview the first five rows; some symbols have no UniProt ID (shown blank).
uniprot.head(n=5)

Unnamed: 0,Approved Symbol,UniProt ID
0,A1BG,P04217
1,A1BG-AS1,
2,A1CF,Q9NQ94
3,A2M,P01023
4,A2M-AS1,


In [45]:
# Shorten the symbol column's name to 'Gene' and convert the row labels to str.
uniprot = (
    uniprot
    .rename(columns={'Approved Symbol': 'Gene'})
    .rename(index=str)
)

## Fix TTD UniProt Column: one row per UniProt ID

In [46]:
# Some TTD targets list several UniProt accessions in one cell, separated by
# '; '. Expand each such row into one row per accession (collected in
# `appended_ttd`) and remove the original combined rows from `ttd_uni`.
#
# NOTE: this cell mutates `ttd_uni` in place, so re-running it on its own finds
# no combined rows and leaves `appended_ttd` empty.

# Position of the accession column within a row converted to a plain list.
uni_index = list(ttd_uni.columns).index('Uniprot ID')

appended_ttd = []      # expanded rows, one per individual accession
multi_id_labels = []   # index labels of the combined rows to remove afterwards

for index, row in ttd_uni.iterrows():
    uni_group = row.loc['Uniprot ID']
    if '; ' not in uni_group:
        continue
    multi_id_labels.append(index)
    for accession in uni_group.split('; '):
        row_as_list = row.values.tolist()
        row_as_list[uni_index] = accession
        appended_ttd.append(row_as_list)

# Drop the combined rows once, after iteration has finished: modifying a frame
# while iterrows() is walking it is fragile, and a drop per hit re-copies the
# whole frame every time.
ttd_uni.drop(multi_id_labels, inplace=True)
how_many = len(multi_id_labels)

print(len(appended_ttd))
print(how_many)

386
120


In [47]:
# Wrap the expanded rows in a frame that uses the same column layout as ttd_uni.
columnnames = ttd_uni.columns.tolist()
fix_ttd_uni = pd.DataFrame(data=appended_ttd, columns=columnnames)

In [48]:
# Each formerly combined target now appears once per UniProt accession.
fix_ttd_uni.head(n=5)

Unnamed: 0,TTD Target ID,Target Name,Target Type,Uniprot ID
0,TTDS00059,"Dihydroorotate dehydrogenase, mitochondrial",Successful target,Q02127
1,TTDS00059,"Dihydroorotate dehydrogenase, mitochondrial",Successful target,Q08210
2,TTDS00077,Glycoprotein IIb/IIIa receptor,Successful target,P05106
3,TTDS00077,Glycoprotein IIb/IIIa receptor,Successful target,P08514
4,TTDS00139,Interleukin-2 receptor,Successful target,P01589


In [49]:
# Add the expanded single-accession rows back to the main table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0; with default arguments the result is the same (original row
# labels are kept, so labels may repeat).
ttd_uni = pd.concat([ttd_uni, fix_ttd_uni])

In [55]:
# NOTE(review): the saved output of this cell shows 'UniProt ID' and 'Gene'
# columns, which only exist after the rename and merge cells further down, so
# it was evidently executed out of order. Re-run the notebook top to bottom to
# refresh it.
ttd_uni.head(n=5)

Unnamed: 0,UniProt ID,TTD Target ID,Target Name,Target Type,Gene
0,P11229,TTDS00002,Muscarinic acetylcholine receptor M1,Successful target,CHRM1
1,P08172,TTDS00003,Muscarinic acetylcholine receptor M2,Successful target,CHRM2
2,P20309,TTDS00004,Muscarinic acetylcholine receptor M3,Successful target,CHRM3
3,P08173,TTDS00005,Muscarinic acetylcholine receptor M4,Successful target,CHRM4
4,P08912,TTDS00006,Muscarinic acetylcholine receptor M5,Successful target,CHRM5


In [51]:
# Match the column spelling used by the gene lookup table ('UniProt ID') and
# convert the row labels to str.
ttd_uni = (
    ttd_uni
    .rename(columns={'Uniprot ID': 'UniProt ID'})
    .rename(index=str)
)

## Map TTD target to Gene

In [52]:
# Key both tables by UniProt accession ahead of the join.
ttd_uni = ttd_uni.set_index('UniProt ID')
uniprot = uniprot.set_index('UniProt ID')

In [53]:
# Left join on the 'UniProt ID' index level: every TTD row is kept, and rows
# whose accession has no entry in the lookup get a missing 'Gene'.
ttd_uni = ttd_uni.merge(uniprot, how='left', on='UniProt ID')
ttd_uni.shape

(3535, 4)

In [54]:
# Move 'UniProt ID' from the index back into an ordinary column.
ttd_uni = ttd_uni.reset_index()

In [56]:
# Keep only the targets that were matched to a gene symbol.
has_gene = ttd_uni['Gene'].notna()
ttd_uni = ttd_uni[has_gene]

In [57]:
# Rows remaining after removing targets without a gene symbol.
ttd_uni.shape

(3039, 5)

## Save as a compressed TSV

In [59]:
# The output name carries the current year and month as YYYY_MM.
filename = 'TTD_to_Gene_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')

# The name ends in '.zip', so write an actual ZIP archive. The previous
# compression='gzip' produced gzip data under a '.zip' name, which tools that
# infer the format from the extension cannot open.
# The DataFrame index is written as an unnamed first column (to_csv default).
ttd_uni.to_csv(filename, sep='\t', compression='zip')