In [1]:
import pandas as pd
import re

In [8]:
def biogrid_reformat1(o, filename, orgInteractor):
    # get list of kinases for o
    pkin = open('input/kinases_{0}.txt'.format(o), 'r')
    pkinases = [] # all kinases for o listed in pkinfam.txt (uniprot)
    for line in pkin:
        pkinases.append(line.split('\n')[0])
    #print(pkinases)
    
    # biogrid data for o
    df = pd.read_csv(filename, delimiter = '\t', low_memory = False)
    lst1 = df.loc[:,'Official Symbol Interactor A']
    lst2 = df.loc[:,'Official Symbol Interactor B']
    org1 = df.loc[:, 'Organism Interactor A']
    org2 = df.loc[:, 'Organism Interactor B']
    d = dict((i, set()) for i in pkinases) # dictionary (kinase : set of substrates)
    substrates = set() # unique substrates
    totalNumInteractions = 0 # total number of kinase - substrate interactions
    
    # find kinase interactions
    for p1, p2, o1, o2 in zip(lst1, lst2, org1, org2):        
        if o1 == orgInteractor and o2 == orgInteractor: # ensures human - human, mouse - mouse interactions
            #print('{0}, {1}'.format(p1, p2))
            if p1 in pkinases:
                temp = d[p1].copy()
                temp.add(p2)
                d[p1] = temp
                #substrates.add(p2)
                #totalNumInteractions += 1
            if p2 in pkinases:
                temp = d[p2].copy()
                temp.add(p1)
                d[p2] = temp
                #substrates.add(p1)
                #totalNumInteractions += 1

    # write to file
    fileOutput = open('output_biogrid_all_{0}.gmt'.format(o), 'w+') # includes all kinases - substrate interactions
    fileOutput1 = open('output_biogrid_fourplusinteractions_{0}.gmt'.format(o), 'w+') # only includes kinases w/ >=4 substrate interactions
    numKinases = 0 # number of (unique) kinases
    for k in d:
        if len(d[k]) > 4:
            numKinases += 1
            temp = '{0}_biogrid_{1}	'.format(k, o)
            temp1 = '{0}_biogrid_{1}	'.format(k, o)
            for s in d[k]:
                temp = '{0}	{1}'.format(temp, s)
                temp1 = '{0}	{1}'.format(temp1, s)
                substrates.add(s)
                totalNumInteractions += 1
            fileOutput.write('{0}\n'.format(temp))
            fileOutput1.write('{0}\n'.format(temp1))
        elif len(d[k]) > 0:
            temp = '{0}_biogrid_{1}	'.format(k, o)
            for s in d[k]:
                temp = '{0}	{1}'.format(temp, s)
            fileOutput.write('{0}\n'.format(temp))
    
    
    print('{0}'.format(o))
    print('# unique kinases: {0}'.format(numKinases)) # number of kinases
    print('# ksi (total): {0}'.format(totalNumInteractions)) # total number of ksi
    print('# unique substrates: {0}'.format(len(substrates))) # number of unique substrates
    #print('avg # ksi (per k): {0}\n'.format(int(float(len(substrates)) / float(numKinases)))) # avg number of kinase - substrate interactions per kinase
    print('avg # ksi (per k): {0}\n'.format(int(float(totalNumInteractions) / float(numKinases)))) # avg number of kinase - substrate interactions per kinase


In [9]:
# testing:
biogrid_reformat1('human', 'input/BIOGRID-ORGANISM-3.5.173.tab2/BIOGRID-ORGANISM-Homo_sapiens-3.5.173.tab2.txt', 9606)
biogrid_reformat1('mouse', 'input/BIOGRID-ORGANISM-3.5.173.tab2/BIOGRID-ORGANISM-Mus_musculus-3.5.173.tab2.txt', 10090)

human
# unique kinases: 296
# ksi (total): 23220
# unique substrates: 7391
avg # ksi (per k): 78

mouse
# unique kinases: 64
# ksi (total): 1180
# unique substrates: 760
avg # ksi (per k): 18

