In [1]:
import pandas as pd
import fileinput

In [2]:
### PREPROCESSING - build the table used to map ENSP ids to gene ids
# The STRING info file pairs each ENSP with a preferred name that is either a
# gene id *or* an ENSG id.
sup1 = pd.read_csv('input/9606.protein.info.v11.0.txt', sep='\t')
# Remove the '9606.' prefix so only the bare ENSP accession remains.
sup1['protein_external_id'] = [
    full_id.replace('9606.', '') for full_id in sup1['protein_external_id']
]
# Keep only the first two columns (protein id, preferred name).
sup1 = sup1.iloc[:, [0, 1]]
sup1.head(10)


Unnamed: 0,protein_external_id,preferred_name
0,ENSP00000000233,ARF5
1,ENSP00000000412,M6PR
2,ENSP00000001008,FKBP4
3,ENSP00000001146,CYP26B1
4,ENSP00000002125,NDUFAF7
5,ENSP00000002165,FUCA2
6,ENSP00000002596,HS3ST1
7,ENSP00000002829,SEMA3F
8,ENSP00000003084,CFTR
9,ENSP00000003100,CYP51A1


In [3]:
# Tab-separated lookup file pairing approved gene symbols with ENSG ids.
sup2 = pd.read_csv('input/ensg_to_geneid_hgnc.txt', delimiter='\t')
sup2.head(5)


Unnamed: 0,Approved symbol,Ensembl gene ID
0,A1BG,ENSG00000121410
1,A1BG-AS1,ENSG00000268895
2,A1CF,ENSG00000148584
3,A2M,ENSG00000175899
4,A2M-AS1,ENSG00000245105


In [4]:
# Left join keeps every protein row; 'Approved symbol' is populated only where
# preferred_name equals an 'Ensembl gene ID' present in the lookup table.
temp = pd.merge(
    sup1,
    sup2,
    how='left',
    left_on='preferred_name',
    right_on='Ensembl gene ID',
)
temp.head()



Unnamed: 0,protein_external_id,preferred_name,Approved symbol,Ensembl gene ID
0,ENSP00000000233,ARF5,,
1,ENSP00000000412,M6PR,,
2,ENSP00000001008,FKBP4,,
3,ENSP00000001146,CYP26B1,,
4,ENSP00000002125,NDUFAF7,,


In [5]:
# Keep only rows with no missing values (i.e. the lookup join succeeded), then
# reduce to the protein id plus the matched symbol, renamed to ENSG_gene.
temp1 = (
    temp
    .dropna()
    .drop(columns=['preferred_name', 'Ensembl gene ID'])
    .rename(columns={'Approved symbol': 'ENSG_gene'})
)
temp1.head()

Unnamed: 0,protein_external_id,ENSG_gene
1304,ENSP00000243314,MAGEA9B
1684,ENSP00000252490,APOC4-APOC2
1825,ENSP00000254627,OC90
2405,ENSP00000261789,TM9SF1
2500,ENSP00000262283,OC90


In [6]:
# Left-join temp1 back onto temp, matching preferred_name against ENSG_gene.
combo = pd.merge(
    temp,
    temp1,
    how='left',
    left_on='preferred_name',
    right_on='ENSG_gene',
)
# Preview only: the trimmed frame is displayed but not assigned, so `combo`
# itself still carries every merged column in its original order.
combo.drop(
    columns=['Approved symbol', 'Ensembl gene ID', 'protein_external_id_y']
).head()

Unnamed: 0,protein_external_id_x,preferred_name,ENSG_gene
0,ENSP00000000233,ARF5,
1,ENSP00000000412,M6PR,
2,ENSP00000001008,FKBP4,
3,ENSP00000001146,CYP26B1,
4,ENSP00000002125,NDUFAF7,


In [7]:
# Build the ENSP -> gene id mapping.
#
# Columns are addressed by name rather than by position: `combo` still holds
# every merged column, and the third positional column is 'Approved symbol'
# (the symbol looked up for an ENSG-style preferred_name), not 'ENSG_gene'.
ensp_gene = {}  # maps ENSP to gene ids
for pid, name, symbol in zip(
    combo['protein_external_id_x'],
    combo['preferred_name'],
    combo['Approved symbol'],
):
    if 'ENSG' not in name:
        # preferred_name is already a gene id; use it directly
        ensp_gene[pid] = name
    elif pd.notna(symbol):
        # preferred_name is an ENSG id that the ENSG lookup file could resolve
        ensp_gene[pid] = symbol
    # otherwise: ENSG id with no match in the lookup file -> left unmapped

In [8]:
# Sanity check: every mapped gene id should be a string. For any entry that is
# not, print the type of the offending value together with its ENSP key.
for ensp_id, gene in ensp_gene.items():
    if not isinstance(gene, str):
        print('{0}\t{1}'.format(type(gene), ensp_id))

In [None]:
### read in input data
# Space-separated protein links file; later cells read its 'protein1' and
# 'protein2' columns.
df = pd.read_csv('input/9606.protein.links.v11.0.txt', delimiter=' ')



In [None]:
# Quick look at the first rows of the links table.
df.head(5)

In [None]:
# Keep the second '.'-separated field of each id (the part after the prefix).
protein1 = [full_id.split('.')[1] for full_id in df['protein1']]
protein2 = [full_id.split('.')[1] for full_id in df['protein2']]

In [None]:
# Translate both interaction partners from ENSP to gene ids. A pair is kept
# only when *both* proteins have an entry in ensp_gene, so out1 and out2 stay
# aligned index-by-index.
out1, out2 = [], []
for ensp_a, ensp_b in zip(protein1, protein2):
    if ensp_a not in ensp_gene or ensp_b not in ensp_gene:
        continue
    out1.append(ensp_gene[ensp_a])
    out2.append(ensp_gene[ensp_b])


In [None]:
# read in list of human kinases (each line of the file is one entry)
# The file is read inside a `with` block so the handle is closed even if
# reading fails part-way through.
pkin = []
with open('input/kinases_human.txt') as inp:
    for line in inp:
        kinase = line.rstrip('\n')
        if kinase:  # skip blank lines so '' is never treated as a kinase name
            pkin.append(kinase)

In [None]:
# find KSIs in input data
# d maps each kinase to the set of gene ids it is paired with in the input.
# Membership is tested against a set so each lookup is O(1) instead of a scan
# over the whole kinase list for every interaction pair.
kinase_set = set(pkin)
d = {}
for p1, p2 in zip(out1, out2):
    # Each pair is checked in both directions: either side may be the kinase.
    if p1 in kinase_set:
        d.setdefault(p1, set()).add(p2)
    if p2 in kinase_set:
        d.setdefault(p2, set()).add(p1)


In [249]:
# write to file
# allnp receives one line per kinase; fnp receives only kinases with at least
# four partners. Summary counters cover the fnp subset only.
uSubs = set()   # unique partners across kinases written to fnp
numKSI = 0      # total kinase-partner pairs written to fnp
numKins = 0     # number of kinases written to fnp

# `with` guarantees both files are flushed and closed even if a write fails.
with open('output_string_all_human.gmt', 'w+') as allnp, \
        open('output_string_fourplusinteractions_human.gmt', 'w+') as fnp:
    for k in d:
        # Sort the members so the output is identical from run to run
        # (iteration order of a set of strings is not stable across sessions).
        row = '{0}_string_human\t'.format(k) + '\t'.join(sorted(d[k])) + '\n'
        allnp.write(row)
        if len(d[k]) >= 4:
            numKSI += len(d[k])
            numKins += 1
            uSubs.update(d[k])
            fnp.write(row)

print('{0}\t#kins: {1}\t#ksi: {2}\t#usubs: {3}'.format('human', numKins, numKSI, len(uSubs)))


human	#kins: 323	#ksi: 398579	#usubs: 17698
