In [1]:
### Generates kinase-substrate .gmt output from Phospho.ELM data, with and without phosphosites

import pandas as pd
import fileinput

In [2]:
# Load the Phospho.ELM vertebrate dump. It also contains mouse and rat data;
# only the human rows are used for now.
df = pd.read_csv('input/phosphoELM_vertebrate_2015-04.dump', sep='\t')
df = df.dropna(subset=['kinases'])  # keep only rows that name a kinase
# .copy() detaches the filtered rows from the parent frame, so adding the
# 'psites' column below does not trigger SettingWithCopyWarning.
df = df[df['species'] == 'Homo sapiens'].copy()
# Phosphosite label = 'code' value followed by 'position', e.g. 'Y204'.
df['psites'] = df['code'].map(str) + df['position'].map(str)

df.loc[:, ['kinases', 'psites']].head(10)

Unnamed: 0,kinases,psites
14,Lck,Y204
16,Lck,Y221
18,PKB_group,S824
20,PKB_group,S227
21,SRC,Y707
27,IKK_group,S740
28,IKK_group,S675
29,IKK_group,S689
30,IKK_group,S705
31,IKK_group,S682


In [3]:
# File pairing each approved gene symbol with a UniProt ID.
df1 = pd.read_csv('input/genesymboltouniprot_hgnc.txt', sep='\t')
df1 = df1.dropna()  # drop rows with a missing value in any column

# Map each UniProt ID to the list of approved symbols recorded for it.
di = {}  # {uniprot id: [approved symbol(s)]}
for symbol, uniprot_id in zip(df1['Approved symbol'], df1['UniProt ID(supplied by UniProt)']):
    # setdefault creates the list on first sight of an ID, then appends.
    di.setdefault(uniprot_id, []).append(symbol)

In [4]:
# Read the list of human kinases: one entry per line of the file.
# The context manager guarantees the file handle is closed after reading.
with open('input/kinases_human.txt') as kinase_file:
    # Keep everything before the first newline, i.e. the line without its '\n'.
    pkin = [line.split('\n')[0] for line in kinase_file]


In [5]:
# Preview the substrate accession, kinase and phosphosite columns.
df[['acc', 'kinases', 'psites']].head()

Unnamed: 0,acc,kinases,psites
14,O14543,Lck,Y204
16,O14543,Lck,Y221
18,O14746,PKB_group,S824
20,O14746,PKB_group,S227
21,O14746,SRC,Y707


In [6]:
# Collect kinase-substrate interactions. Rows are kept only when the kinase is
# listed in pkin, because some entries in the 'kinases' column are kinase groups.
d = {}   # kinase -> {'<symbol>_<psite>', ...}
d1 = {}  # kinase -> {substrate symbol, ...}
rows = zip(df['acc'], df['kinases'], df['psites'])  # acc is the substrate's UniProt ID
for accession, kinase, site in rows:
    # Skip substrates with no gene-symbol mapping and kinases outside pkin.
    if accession not in di or kinase not in pkin:
        continue
    for symbol in di[accession]:
        d.setdefault(kinase, set()).add('{0}_{1}'.format(symbol, site))
        d1.setdefault(kinase, set()).add(symbol)
print(len(d))

77


In [7]:
# Write the kinase -> substrate sets as .gmt files and compute summary counts.
#   output_phosphoelm_all_human.gmt                  every kinase, substrate symbols
#   output_phosphoelm_fourplusinteractions_human.gmt kinases with >= MIN_SUBSTRATES substrates
#   output-withphosphosite_phosphoelm_all_human.gmt  every kinase, symbol_phosphosite entries
# NOTE: no four-plus file with phosphosites is written; that output was
# disabled in the notebook.
MIN_SUBSTRATES = 4  # threshold for the "fourplusinteractions" file and the stats

uSubs = set()  # unique substrates over the kinases that meet the threshold
numKSI = 0     # kinase-substrate pairs over those kinases
numKins = 0    # number of kinases that meet the threshold

# The with-block closes all three files even if a write fails.
with open('output_phosphoelm_all_human.gmt', 'w') as allnp, \
        open('output_phosphoelm_fourplusinteractions_human.gmt', 'w') as fnp, \
        open('output-withphosphosite_phosphoelm_all_human.gmt', 'w') as allwp:
    for k in d:
        label = '{0}_phosphoelm_human\t'.format(k)
        # Sort so the files are identical from run to run (set order is not).
        substrates = sorted(d1[k])
        if len(substrates) >= MIN_SUBSTRATES:
            numKSI += len(substrates)
            numKins += 1
            uSubs.update(substrates)
            fnp.write(label + '\t'.join(substrates) + '\n')
        # The "all" files receive every kinase, not only those below the
        # threshold, so these writes sit outside the if.
        allnp.write(label + '\t'.join(substrates) + '\n')
        allwp.write(label + '\t'.join(sorted(d[k])) + '\n')

print('{0}\t#kins: {1}\t#ksi: {2}\t#usubs: {3}'.format('human', numKins, numKSI, len(uSubs)))


human	#kins: 27	#ksi: 416	#usubs: 312
