# Mentha

Author: Moshe C. Silverstein  
Date: 07-2017

In [1]:
import sys
import numpy as np
import pandas as pd
import Utilities
from Utilities import *
from importlib import reload

# Load Data

Data Retrieved 1/26/2017 

http://mentha.uniroma2.it/doDownload.php?file=organisms/9606.zip (mentha) 

In [2]:
# Load the human mentha interaction table; the input file is semicolon-delimited.
# NOTE(review): absolute, machine-specific path — must be adjusted to re-run elsewhere.
menthaH_df = pd.read_csv('/Users/moshesilverstein/Documents/Harmonizome/PPI Library/Input/mentha_human', sep=';')

In [3]:
# Preview the first rows of the human table
menthaH_df.head()

Unnamed: 0,Protein A,Gene A,Protein B,Gene B,Score,PMID
0,Q9NX31,OSER1,P28065,PSMB9,0.126,26186194
1,Q9BVA1,TUBB2B,O43318,MAP3K7,0.126,14743216
2,Q9BVA1,TUBB2B,O43353,RIPK2,0.126,14743216
3,Q9BVA1,TUBB2B,O95834,EML2,0.126,26186194
4,Q9BVA1,TUBB2B,P19438,TNFRSF1A,0.126,14743216


In [4]:
# Load the mouse mentha interaction table; same semicolon-delimited layout as the human file.
# NOTE(review): absolute, machine-specific path — must be adjusted to re-run elsewhere.
menthaM_df = pd.read_csv('/Users/moshesilverstein/Documents/Harmonizome/PPI Library/Input/mentha_mouse', sep=';')

In [5]:
# Preview the first rows of the mouse table
menthaM_df.head()

Unnamed: 0,Protein A,Gene A,Protein B,Gene B,Score,PMID
0,Q5BJ29,FBXL7,P97477,AURKA,0.309,22306998
1,Q9R0P9,UCHL1,P17047,LAMP2,0.21,18550537
2,Q61411,HRAS,Q3UYI5,RGL3,0.376,10869344
3,Q8R3Z5,CACNB1,Q3UYC8,ADNP,0.183,15102471
4,Q80U70,SUZ12,Q3UXZ9,KDM5A,0.332,20064375


In [6]:
# Stack the human and mouse tables (human rows first). Each input keeps its own
# row labels here, so the combined index contains repeated labels until it is reset.
mentha_df = pd.concat([menthaH_df, menthaM_df])

In [7]:
# Replace the repeated row labels left by the concat with a fresh 0..n-1 index,
# discarding the old labels instead of keeping them as a column.
mentha_df = mentha_df.reset_index(drop=True)

In [8]:
# Preview the first rows of the combined human + mouse table
mentha_df.head()

Unnamed: 0,Protein A,Gene A,Protein B,Gene B,Score,PMID
0,Q9NX31,OSER1,P28065,PSMB9,0.126,26186194
1,Q9BVA1,TUBB2B,O43318,MAP3K7,0.126,14743216
2,Q9BVA1,TUBB2B,O43353,RIPK2,0.126,14743216
3,Q9BVA1,TUBB2B,O95834,EML2,0.126,26186194
4,Q9BVA1,TUBB2B,P19438,TNFRSF1A,0.126,14743216


# Load Gene Mapping File

In [9]:
# Load the gene-symbol mapping table: tab-separated, no header row, first column used as the index.
# NOTE(review): absolute, machine-specific path — must be adjusted to re-run elsewhere.
mappingFile = pd.read_csv('/Users/moshesilverstein/Documents/Harmonizome/PPI Library/Doc and Mapping/mappingFile_2017.txt', sep='\t', header=None, index_col=0)

# Select Relevant Data

In [10]:
# Keep only the two gene-symbol columns and the publication column.
# The explicit .copy() makes the result an independent frame, so the column
# assignment and in-place rename in the following cells operate on a real
# DataFrame rather than on a slice (which triggers SettingWithCopyWarning).
mentha_df = mentha_df[['Gene A', 'Gene B', 'PMID']].copy()

In [11]:
# Tag every interaction with its source database (same constant value for all rows)
mentha_df['Source databases'] = '(mentha)'

In [12]:
# Switch to the column names used by the rest of the pipeline
# ('Protein A (gene name)', 'Protein B (gene name)', 'PubMed ID').
column_names = {
    'Gene A': 'Protein A (gene name)',
    'Gene B': 'Protein B (gene name)',
    'PMID': 'PubMed ID',
}
mentha_df = mentha_df.rename(columns=column_names)

# Map Genes to Human and Updated Approved Symbols

In [13]:
# Map the symbols in both gene columns using the mapping table (helper from Utilities).
# NOTE(review): the return value is not captured, so the helper presumably modifies
# mentha_df in place; the progress counters in this notebook drop from 285881 rows here
# to 273170 rows in the next step, which suggests unmapped rows are removed — confirm
# against Utilities.mapgenesymbols.
mapgenesymbols(mentha_df, mappingFile, 'Protein A (gene name)', 'Protein B (gene name)')

Progeres: 100%  285881 Out of 285881   

# Get PubMed Ids

In [14]:
# Keep only the PubMed IDs as the publication identifier.
#
# Each raw 'PubMed ID' value is treated as a space-separated string: the final
# token is discarded and the remaining tokens are joined with '|' (exactly what
# the previous per-row loop computed).
#
# Implemented with vectorized string methods instead of a row-by-row loop. The
# loop relied on `DataFrame.ix`, which was deprecated in pandas 0.20 and removed
# in pandas 1.0, and did one scalar lookup plus one scalar write per row. The
# whole column is now transformed in a single pass, so no progress counter is
# needed.
#
# NOTE: like the original, this cell is not idempotent — re-running it strips
# one more token from every value.
mentha_df['PubMed ID'] = (
    mentha_df['PubMed ID']
    .str.split(' ')   # literal single-space split, same as str.split(' ')
    .str[:-1]         # drop the last token
    .str.join('|')
)


Progeres: 100%  273170 Out of 273170   

# Unfiltered

In [15]:
# uf = unfiltered
# Work on a copy so that mentha_df stays available as the starting point
# for the filtered branch further down.
mentha_uf_df = mentha_df.copy()

### Combine duplicate PPIs while concatenating references

In [16]:
# Merge duplicate interactions in the unfiltered table (helper from Utilities).
# NOTE(review): the return value is not captured, so this presumably modifies
# mentha_uf_df in place — confirm against Utilities.combineDupPPIs.
combineDupPPIs(mentha_uf_df)

### Create .sig File

In [17]:
# Output directory for the unfiltered .sig file.
# NOTE(review): the '/./' segment is redundant, and whether '~' gets expanded to the
# home directory depends on createSigFile (not visible here) — confirm.
pathU = '~/./Documents/Harmonizome/PPI Library/Output/IndividualResources/Unfiltered/'

# Write the unfiltered table as a .sig file named after the resource ('mentha').
# The last argument is False here and True in the filtered call below — presumably
# a "filtered" flag; verify against Utilities.createSigFile.
createSigFile(mentha_uf_df, pathU, 'mentha', False)

# Filtered

In [18]:
# f = filtered; start the filtered branch from a fresh copy of the mapped table
mentha_f_df = mentha_df.copy()

In [19]:
# Drop any data that was published with more than 10 PPIs per publication or doesn't have a PPI.
# NOTE(review): the recorded progress output for this cell ends at "102%  36184 Out of 35143",
# i.e. the counter exceeds its own total — the progress denominator inside
# Utilities.filterPPIbyPubmed looks off and is worth checking.
filterPPIbyPubmed(mentha_f_df, 10)

Progress: 102%  36184 Out of 35143   

### Combine duplicate PPIs while concatenating references

In [20]:
# Merge duplicate interactions in the filtered table (helper from Utilities).
# NOTE(review): the return value is not captured, so this presumably modifies
# mentha_f_df in place — confirm against Utilities.combineDupPPIs.
combineDupPPIs(mentha_f_df)

Progress: 100%  54409 Out of 54409   

### Create .sig File

In [21]:
# Output directory for the filtered .sig file.
# NOTE(review): the '/./' segment is redundant, and whether '~' gets expanded to the
# home directory depends on createSigFile (not visible here) — confirm.
pathF = '~/./Documents/Harmonizome/PPI Library/Output/IndividualResources/Filtered/'

# Write the filtered table as a .sig file named after the resource ('mentha').
# The last argument is True here and False in the unfiltered call above — presumably
# a "filtered" flag; verify against Utilities.createSigFile.
createSigFile(mentha_f_df, pathF, 'mentha', True)

## Create .gmt File

In [9]:
# Load ONE of the two SIG files (unfiltered or filtered) from local disk; nothing is
# downloaded here. The file has no header row, so columns are addressed by position.
# NOTE(review): this path (~/Desktop/Projects/KEA3/...) is not the directory the .sig
# files were written to above (~/Documents/Harmonizome/...), and the execution counter
# restarts at In [9] — the file appears to have been moved by hand and this part run in
# a separate session.

# Load UNFILTERED data
mentha_sig = pd.read_table("~/Desktop/Projects/KEA3/PPI/mentha/mentha_unfiltered_ppi_2017_07_13.sig", header = None)

# Load FILTERED data (uncomment this line and comment out the one above to switch inputs;
# the output name 'mentha_ppi_unfiltered.gmt' in the final cell must be changed as well)
#mentha_sig = pd.read_table("~/Desktop/Projects/KEA3/PPI/mentha/mentha_filtered_ppi_2017_07_13.sig", header = None)

In [10]:
# Convert the SIG table (one row per interaction) into GMT lines (one line per
# protein, listing every protein it interacts with).
#
# Columns 0 and 5 of the SIG table hold the two interacting proteins.
#
# Names produced for later cells: `extend` (pair table indexed by protein_1),
# `gmt` (one row per kept protein, with its partners and their count),
# `indices` (labels of the kept proteins), `gmt_2` and `gmt_d` (the GMT text).

# Gene-set size limits: a protein is kept when it has at least MIN_SET_SIZE and
# fewer than MAX_SET_SIZE interaction partners.
MIN_SET_SIZE = 5
MAX_SET_SIZE = 2000

# Build the pair table twice: once in the original orientation, once flipped,
# so that every interaction is listed under both of its proteins.
d1 = {'protein_1': mentha_sig[0], 'protein_2': mentha_sig[5]}
df = pd.DataFrame(data=d1).drop_duplicates()

d2 = {'protein_1': mentha_sig[5], 'protein_2': mentha_sig[0]}
df_flipped = pd.DataFrame(data=d2).drop_duplicates()

# Stack both orientations, drop pairs that are now duplicated, and index by
# protein_1 so that all partners of a protein can be aggregated together.
extend = (
    pd.concat([df, df_flipped])
    .drop_duplicates()
    .set_index('protein_1')
)

# One row per protein; 'protein_2' becomes a tuple of all of its partners
gmt = extend.groupby('protein_1').agg(lambda partners: tuple(partners))

# Number of interaction partners per protein.
# (Computed with Series.map instead of Series.iteritems, which was removed in pandas 2.0.)
gmt['interactions'] = gmt['protein_2'].map(len).astype(int)

# Sort proteins from most to fewest interaction partners
gmt = gmt.sort_values(by=['interactions'], ascending=False)

# Labels of the gene sets whose size lies within the limits (sorted order preserved)
within_limits = (gmt['interactions'] >= MIN_SET_SIZE) & (gmt['interactions'] < MAX_SET_SIZE)
indices = gmt.index[within_limits].tolist()

# Filter the table down to those gene sets
gmt = gmt.loc[indices]

# GMT layout: <protein> <description> <partner 1> <partner 2> ...
# reset_index returns a new frame, so `gmt` itself is left untouched.
gmt_2 = gmt.reset_index()
gmt_2.insert(1, 'Description', 'mentha')

# Join each protein's partners with tabs, then drop the columns the GMT does not need
gmt_2['merged'] = gmt_2['protein_2'].map('\t'.join)
gmt_2 = gmt_2.drop(['protein_2', 'interactions'], axis=1)

# Row label -> finished GMT line (protein, description and partners, tab-separated)
gmt_d = {index: '\t'.join(rowData) for index, rowData in gmt_2.iterrows()}

## Print GMT to File and Gather Statistics

Code Below Written By: Marina Latif

In [11]:
# Interaction-partner count for every gene set kept in the GMT
genes_term = gmt['interactions']

# Number of gene sets
genes_term.shape[0]

11856

In [12]:
# Total number of (protein, partner) entries across all gene sets
int(genes_term.sum())

512389

In [13]:
# Average number of interaction partners per gene set
avg_num_terms = genes_term.mean()
avg_num_terms

43.21769568151147

In [14]:
# Website statistic: the number of distinct proteins in the dataset, counting
# both the gene-set owners (protein_1) and their partners (protein_2).

# Restrict the pair table to the gene sets that were kept, then turn the
# protein_1 index back into an ordinary column.
stat_df = extend.loc[indices].reset_index()

# Pool both protein columns into one long Series and count the distinct values
all_terms = pd.concat([stat_df['protein_1'], stat_df['protein_2']], axis=0)
len(all_terms.unique())

15719

In [15]:
# Write the GMT lines to disk, one gene set per line, in the order stored in gmt_d
with open('mentha_ppi_unfiltered.gmt', 'w') as openfile:
    openfile.writelines(str(gmt_line) + '\n' for gmt_line in gmt_d.values())