# KEA Data Formatting to SIG and GMT Files

# Formatting Data of Human Samples

Import necessary modules

In [None]:
import numpy as np 
import pandas as pd
import urllib.request 
import requests, io
import urllib, re, string
import sys, datetime

Configure pandas for nicer table formatting when printing

In [None]:
pd.set_option('display.notebook_repr_html', True)

def _repr_latex_(self):
    return "\centering{%s}" % self.to_latex()

pd.DataFrame._repr_latex_ = _repr_latex_  # monkey patch pandas DataFrame

In [None]:
def concatliketerms(df):
    """Merge rows that describe the same protein pair, in place.

    Rows are sorted by ``'Protein A (gene name)'`` and
    ``'Protein B (gene name)'``.  Every run of rows sharing both values is
    collapsed into its first row, whose ``'PubMed ID'`` and
    ``'Source databases'`` become the ``'|'``-joined values of the whole run
    (in sorted row order).  Rows without a duplicate are left untouched, so
    their ``'PubMed ID'`` keeps its original type.  Any other column keeps
    the value held by the first row of the run.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four columns named above.  Modified in place; on
        return it is sorted by the two protein columns and carries a fresh
        ``0..n-1`` index.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If a ``'Source databases'`` value that has to be merged is not a
        string.
    """
    key_cols = ['Protein A (gene name)', 'Protein B (gene name)']

    df.sort_values(by=key_cols, inplace=True)
    df.reset_index(drop=True, inplace=True)

    # After sorting, duplicates are adjacent: a row continues the current run
    # exactly when both keys equal those of the row directly above it.
    # NaN never compares equal, so rows with a missing key are never merged.
    keys = df[key_cols]
    repeats_previous = (keys == keys.shift()).all(axis=1).tolist()

    pubmed = df['PubMed ID'].tolist()
    sources = df['Source databases'].tolist()
    redundant = []  # positions (== index labels) of rows folded into another

    total = len(repeats_previous)
    run_start = 0
    while run_start < total:
        run_end = run_start + 1
        while run_end < total and repeats_previous[run_end]:
            run_end += 1
        if run_end - run_start > 1:
            pubmed[run_start] = '|'.join(str(value) for value in pubmed[run_start:run_end])
            sources[run_start] = '|'.join(sources[run_start:run_end])
            redundant.extend(range(run_start + 1, run_end))
        run_start = run_end

    if not redundant:
        return

    df['PubMed ID'] = pubmed
    df['Source databases'] = sources
    df.drop(redundant, axis=0, inplace=True)
    df.reset_index(drop=True, inplace=True)

In [None]:
kea_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Input/KEA/kinase-protein_interactions.csv', sep=',', header=None)

In [None]:
kea_df.head()

In [None]:
kea_ph_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Input/KEA/phosphorylation_reactions.csv', sep=',', header=None)

In [None]:
kea_ph_df.head()

In [None]:
kea_df = pd.concat([kea_df, kea_ph_df])

In [None]:
kea_df.shape

In [None]:
kea_df.drop_duplicates(inplace=True)

In [None]:
kea_df.shape

In [None]:
# Renumber the rows 0..n-1 and discard the old index labels in one step.
kea_df.reset_index(drop=True, inplace=True)

In [None]:
# Use '|' instead of ';' as the separator inside columns 4 and 5 (renamed
# later to 'PubMed ID' and 'Source databases').  ';' has no special meaning
# in a regular expression, so the result is the same whether pandas treats
# the pattern as a regex or as a literal.
for column in (4, 5):
    kea_df[column] = kea_df[column].str.replace(';', '|')

In [None]:
# Keep only the two gene-name columns, the PubMed IDs and the source
# databases.  Copy explicitly so the in-place rename/dropna/drop_duplicates
# that follow operate on an independent frame rather than on a slice.
kea_df = kea_df[[2, 3, 4, 5]].copy()

In [None]:
kea_df.rename(columns={2: 'Protein A (gene name)', 3:'Protein B (gene name)', 4:'PubMed ID', 5:'Source databases'}, inplace=True)

In [None]:
# Drop every row that has a missing value in any column, i.e. rows for which
# no gene name (or Ensembl ID) is provided.
kea_df.dropna(how='any', inplace=True, axis=0)

In [None]:
kea_df.drop_duplicates(inplace=True)

In [None]:
kea_uf_df = kea_df.copy()

In [None]:
concatliketerms(kea_uf_df)

In [None]:
# Write the unfiltered network as a tab-separated file stamped with today's
# date (YYYY_MM_DD).
today_stamp = datetime.date.today().isoformat().replace('-', '_')
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Unfiltered/ppi_network_kea_human_unfiltered_%s.tsv' % today_stamp
kea_uf_df.to_csv(filename, sep='\t', index=False)

In [None]:
# Write the same unfiltered network again, gzip-compressed.
today_stamp = datetime.date.today().isoformat().replace('-', '_')
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Unfiltered/ppi_network_kea_human_unfiltered_%s.tsv.gz' % today_stamp
kea_uf_df.to_csv(filename, sep='\t', index=False, compression='gzip')

In [None]:
# Drop every row whose 'PubMed ID' value is shared by more than 10 rows,
# i.e. data published with more than 10 PPIs per publication.
# Counting once and mapping the counts back onto the rows avoids rescanning
# the whole frame for each distinct PubMed ID.
rows_per_publication = kea_df['PubMed ID'].map(kea_df['PubMed ID'].value_counts())
kea_df.drop(kea_df.index[(rows_per_publication > 10).values], inplace=True)

In [None]:
kea_df.dropna(inplace=True)

In [None]:
concatliketerms(kea_df)

In [None]:
# Write the filtered network as a tab-separated file stamped with today's
# date (YYYY_MM_DD).
today_stamp = datetime.date.today().isoformat().replace('-', '_')
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Filtered/ppi_network_kea_human_filtered_%s.tsv' % today_stamp
kea_df.to_csv(filename, sep='\t', index=False)

In [None]:
# Write the same filtered network again, gzip-compressed.
today_stamp = datetime.date.today().isoformat().replace('-', '_')
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Filtered/ppi_network_kea_human_filtered_%s.tsv.gz' % today_stamp
kea_df.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create SIG File

In [None]:
# Column names of the SIG table, in order.  Only 'Source Name', 'Target Name'
# and 'PubMed IDs' are populated from the KEA data below; every other column
# is filled with 'NA' before the table is written.
sig_col = ['Source Name', 'Source Human Accession', 'Source Mouse Accession',
          'Source Type', 'Source Location', 'Target Name',
          'Target Human Accession', 'Target Mouse Accession', 'Target Type',
          'Target Location', 'Effect', 'Type of Interaction',
          'PubMed IDs']

In [None]:
path = '~/./Documents/Harmonizome/PPI Library/Output/IndividualResources/Filtered/'

In [None]:
# HOMOLOGENE
mapping_path = '~/./Documents/Harmonizome/PPI Library/Doc and Mapping/HomoloGene.tsv'
mapping = pd.read_csv(mapping_path, sep='\t', header=None)

In [None]:
mapping.set_index([0, 1], inplace=True)

In [None]:
mapping.head()

In [None]:
kea_human = pd.read_csv(path+'ppi_network_kea_human_filtered_2017_02_16.tsv', sep='\t')

In [None]:
kea = kea_human.copy()

In [None]:
# Key every row by its unordered protein pair so that A-B and B-A are
# treated as the same interaction.
kea['ppi'] = [
    tuple(sorted(pair))
    for pair in zip(kea['Protein A (gene name)'], kea['Protein B (gene name)'])
]

# Collect the PubMed IDs of each pair in row order.  The values are joined
# with str.join, so they must already be strings.
refs_by_pair = {}
for pair, pubmed_id in zip(kea['ppi'], kea['PubMed ID']):
    refs_by_pair.setdefault(pair, []).append(pubmed_id)
merged_refs = {pair: '|'.join(refs) for pair, refs in refs_by_pair.items()}

# Give every row the merged reference string of its pair, then keep only the
# first row of each pair (original index labels are preserved).
kea['PubMed ID'] = [merged_refs[pair] for pair in kea['ppi']]
kea.drop_duplicates(subset=['ppi'], keep='first', inplace=True)

In [None]:
# Start from an empty table with the SIG columns, then copy over the three
# columns that KEA provides.
kea_sig_df = pd.DataFrame(columns=sig_col)
for sig_name, kea_name in (
    ('Source Name', 'Protein A (gene name)'),
    ('Target Name', 'Protein B (gene name)'),
    ('PubMed IDs', 'PubMed ID'),
):
    kea_sig_df[sig_name] = kea[kea_name]

# Everything that was not filled in is written as the literal 'NA'.
kea_sig_df.replace(np.nan, 'NA', inplace=True)

# Tab-separated output without header or index, stamped with today's date.
today_stamp = str(datetime.date.today()).replace('-', '_')
kea_ppiSIG = path+'kea_ppi_%s.sig' % today_stamp
kea_sig_df.to_csv(kea_ppiSIG, index=None, header=None, sep='\t')

# Create GMT File

In [None]:
# Define function 'sig_to_gmt' that converts a SIG table to GMT lines
def sig_to_gmt(sig_df, min_size=5, max_size=2000):
    """Convert a SIG table into GMT gene-set lines.

    Each interaction is treated as undirected: for every protein, the gene
    set is the collection of distinct proteins it appears with, whether as
    source or as target.  Sets are ordered from most to fewest interaction
    partners (ties by protein name) and only those with
    ``min_size <= partners < max_size`` are kept.

    Parameters
    ----------
    sig_df : pandas.DataFrame
        SIG table.  The source protein is read from the first column and the
        target protein from the sixth column *by position*, so the frame may
        have named columns or the integer labels produced by
        ``read_csv(..., header=None)``.  Protein names must be strings.
    min_size : int, optional
        Smallest number of partners a gene set may have (inclusive).
    max_size : int, optional
        Upper bound on the number of partners (exclusive).

    Returns
    -------
    dict
        Maps ``0, 1, 2, ...`` to one GMT line each:
        ``protein<TAB>No Description<TAB>partner1<TAB>partner2...``.
    """
    sources = sig_df.iloc[:, 0].tolist()
    targets = sig_df.iloc[:, 5].tolist()

    # Walk all pairs in their original direction, then flipped, so that each
    # protein collects partners from both roles.  The inner dict serves as
    # an insertion-ordered set, removing duplicate partners.
    partners = {}
    for protein, partner in list(zip(sources, targets)) + list(zip(targets, sources)):
        if pd.isna(protein) or pd.isna(partner):
            continue  # a missing name cannot form a gene-set entry
        partners.setdefault(protein, {})[partner] = None

    # Sort proteins from max to min according to number of interactions.
    # Sorting by name first and relying on sort stability keeps the order of
    # equally sized sets deterministic.
    ranked = sorted(
        sorted(partners),
        key=lambda name: len(partners[name]),
        reverse=True,
    )

    gmt_d = {}
    for name in ranked:
        members = partners[name]
        if min_size <= len(members) < max_size:
            gmt_d[len(gmt_d)] = '\t'.join([name, 'No Description'] + list(members))
    return gmt_d

In [None]:
# Build the GMT lines from the SIG table
kea_dict = sig_to_gmt(kea_sig_df)

# Write one gene set per line into a new GMT file
with open('kea_ppi.gmt', 'w') as gmt_file:
    gmt_file.writelines(str(gmt_line) + '\n' for gmt_line in kea_dict.values())