# Combined .GMT File
This notebook concatenates the .gmt files created for seven databases (RegPhos, NetworKIN, Swiss-Prot, PhosphoELM, PhosphoSite, HPRD, and MINT) into a single .gmt file. It also plots the distribution of the number of targets per kinase in the combined file. In the reference column of the combined file, the PubMed ID is shown whenever the source data provides one; otherwise, a citation to the paper describing the corresponding database is given instead.

## Import and Run Necessary Files

In [140]:
#Run the shared KEA3 initialisation script before anything else.
#NOTE(review): presumably this script is what provides pd, np and plt used below - confirm.
#NOTE(review): the path is absolute and machine-specific, so the notebook will not run elsewhere as-is.
%run /home/maayanlab/Desktop/Projects/KEA3/Scripts/init.ipy

## Retrieving and Downloading all .GMT files

In [141]:
def _read_uncondensed(path):
    """Read one header-less, tab-delimited 'Uncondensed' file into a dataframe."""
    return pd.read_table(path, header = None)


#One dataframe per source file (RegPhos and Swiss-Prot each contribute two files)
HPRD = _read_uncondensed("~/Desktop/Projects/KEA3/HPRD/Uncondensed_HPRD.txt")
MINT = _read_uncondensed("~/Desktop/Projects/KEA3/MINT/Uncondensed_MINT.txt")
NetworKIN = _read_uncondensed("~/Desktop/Projects/KEA3/NetworKIN/Uncondensed_NetworKIN.txt")
PhosphoELM = _read_uncondensed("~/Desktop/Projects/KEA3/PhosphoELM/Uncondensed_PhosphoELM.txt")
PhosphoSite = _read_uncondensed("~/Desktop/Projects/KEA3/PhosphoSite/Uncondensed_PhosphoSite.txt")
RP_Hum = _read_uncondensed("~/Desktop/Projects/KEA3/RegPhos/Uncondensed_RegPhos_human.txt")
RP_Mouse = _read_uncondensed("~/Desktop/Projects/KEA3/RegPhos/Uncondensed_RegPhos_mouse.txt")
SP_E = _read_uncondensed("~/Desktop/Projects/KEA3/SwissProt/Uncondensed_SwissProtE.txt")
SP_P = _read_uncondensed("~/Desktop/Projects/KEA3/SwissProt/Uncondensed_SwissProt_Put.txt")

In [None]:
#Create a list of all of the dataframes loaded above.
#PhosphoSite must be listed here: the list previously repeated PhosphoELM,
#which silently left every PhosphoSite interaction out of the combined file.
c = [HPRD, MINT, NetworKIN, PhosphoELM, PhosphoSite, RP_Hum, RP_Mouse, SP_E, SP_P]

#Concatenate all of the rows in the dataframes
combined = pd.concat(c)

#Assign column names (each input file is expected to have exactly these three columns)
combined.columns = ['kinase_organism', 'substrate', 'reference']

#Drop incomplete rows first, then duplicate kinase-substrate pairs.
#Doing it in this order means a pair is not lost just because its first
#occurrence had a missing value while a later duplicate was complete.
combined = (
    combined
    .dropna()
    .drop_duplicates(subset = ['kinase_organism', 'substrate'])
)

In [None]:
#View dataframe of combined kinase-substrate interactions
#(one row per kinase-substrate pair, with columns 'kinase_organism', 'substrate' and 'reference')
combined

In [None]:
#Create a dataframe copy of 'combined' known as 'df'; later cells use it
#as the flat (one row per interaction) table
df = combined.copy()

#Combine rows that share the same 'kinase_organism': each kinase becomes the
#index of 'kin', and its substrates / references are collected into tuples.
#Grouping by the column directly (instead of calling set_index in place on
#'combined') leaves 'combined' untouched, so this cell can be re-run safely.
kin = df.groupby('kinase_organism').agg(lambda values: tuple(values))

In [None]:
#Preview the first rows of 'df', the copy of the combined interactions table
df.head()

In [None]:
#View new dataframe 'kin': one row per kinase, with the grouped
#substrates and references held as tuples
kin

In [None]:
# Create column representing counts of protein targets per kinase.
# Each entry of 'substrate' is a tuple, so its length is the number of targets.
# (Series.iteritems() no longer exists in pandas 2.x, hence map(len).)
kin['kinase_substrate_num'] = kin['substrate'].map(len)

# Sort kinases from max to min according to number of protein targets each has
kin = kin.sort_values(by = 'kinase_substrate_num', ascending = False)

# View dataframe
kin.head()

In [None]:
# Create histogram displaying the distribution of the number of
# targets per kinase.
# The count column is selected explicitly: passing it through `by=` asks
# pandas to group the frame by that column rather than to plot it.
ax = kin['kinase_substrate_num'].plot.hist(bins = 100)
ax.set(
    title = 'Distribution of substrates per kinase',
    xlabel = 'Number of substrates',
    ylabel = 'Number of kinases',
)

#Show histogram
plt.show()

In [None]:
#Bounds (inclusive) on the number of substrates a kinase must have to be kept
MIN_SUBSTRATES = 5
MAX_SUBSTRATES = 2000

#True for kinases whose substrate count lies within [MIN_SUBSTRATES, MAX_SUBSTRATES]
keep = kin['kinase_substrate_num'].between(MIN_SUBSTRATES, MAX_SUBSTRATES)

#Kinase names that are filtered out: fewer than five or more than 2000 substrates
sig_key = kin.index[~keep].tolist()

#Kinase names that are retained, in the current row order of 'kin'
indices = kin.index[keep].tolist()

#Create dataframe holding every interaction of the retained kinases,
#then turn the kinase index back into a regular column
sig = df.set_index('kinase_organism').loc[indices].reset_index()

#Rename columns of this dataframe to match .sig format
colnames = ['Source', 'Substrate', 'Reference']
sig.columns = colnames

In [None]:
#The .sig layout has four placeholder columns directly after the source;
#each one is filled with the text form of NaN
placeholder = str(np.nan)
for position, label in enumerate(('NA-1', 'NA-2', 'NA-3', 'NA-4'), start = 1):
    sig.insert(position, label, placeholder)

#Sign of the interaction is unknown, so it is recorded as '?'
sig.insert(6, 'Sign', '?')

#Every interaction in this file is a phosphorylation event
sig.insert(7, 'Interaction', 'Phosphorylation')

#Preview the resulting layout
sig.head()

In [None]:
#Create dictionary 'sigd' mapping each row's index label to its tab-separated line.
#Every cell is converted to text individually: calling str() on the whole row
#yields the row's printed representation, and joining that string puts a tab
#between every single character instead of between the fields.
sigd = {
    index: '\t'.join(map(str, row))
    for index, row in zip(sig.index, sig.itertuples(index = False, name = None))
}

#Transfer tab-separated info into a new txt file (one interaction per line, no header)
#Make sure to include in ReadMe corresponding column names
with open('Combinedsig.txt', 'w') as openfile:
    for line in sigd.values():
        openfile.write(line + '\n')

In [None]:
#Remove kinases with fewer than five substrates or 
#more than 2000 substrates in 'kin'
#('indices' is the list of retained kinase names built in the filtering cell above)
kin = kin.loc[indices]

In [None]:
#Reset index of the dataframe so 'kinase_organism' is a regular column again
kin = kin.reset_index()

#Create column 'substrates_merged' in which all substrates of a kinase
#are joined by a \t symbol
kin['substrates_merged'] = ['\t'.join(map(str, substrates)) for substrates in kin['substrate']]

#Drop the now-unnecessary column 'substrate' and the
#data-exploratory column 'kinase_substrate_num'
kin = kin.drop(columns = ['substrate', 'kinase_substrate_num'])

#Create dictionary 'Combined_num' mapping each row's index number to its .gmt line:
#kinase name, then a single description field, then the substrates.
#'reference' holds a tuple per kinase, so it cannot be passed to '\t'.join as-is;
#its distinct values are collapsed (first-seen order) into one field.
#NOTE(review): '|' is used as the separator inside the description field - confirm
#this suits downstream consumers of the .gmt file.
Combined_num = {}
for index, kinase, references, substrates in zip(
        kin.index, kin['kinase_organism'], kin['reference'], kin['substrates_merged']):
    unique_refs = dict.fromkeys(str(ref) for ref in references)
    Combined_num[index] = '\t'.join([str(kinase), '|'.join(unique_refs), substrates])

In [None]:
#Write every prepared line to the .gmt file, one kinase per line,
#in the order the lines were added to 'Combined_num'
with open('Combined.gmt', 'w') as openfile:
    openfile.writelines(str(line) + '\n' for line in Combined_num.values())