# MINT

Author: Moshe C. Silverstein  
Date: 07-2017

In [1]:
import sys
import numpy as np
import pandas as pd
import Utilities
from Utilities import *
from importlib import reload

# Load Data

Data Retrieved 1/23/2017  

http://mint.bio.uniroma2.it/mitab/MINT_MiTab.txt (Mint)

In [2]:
# Raw MINT export: tab-separated, no header row, and no column promoted to the index.
mint_df = pd.read_csv(
    '/Users/moshesilverstein/Documents/Harmonizome/PPI Library/Input/mint.tsv',
    header=None,
    index_col=False,
    sep='\t',
)

# Load Gene Mapping File

In [3]:
# Gene-symbol mapping table: tab-separated, no header row, first column used as the index.
mappingFile = pd.read_csv(
    '/Users/moshesilverstein/Documents/Harmonizome/PPI Library/Doc and Mapping/mappingFile_2017.txt',
    index_col=0,
    header=None,
    sep='\t',
)

# Select Relevant Data

In [4]:
# Names for the 15 positional columns of the raw table, in file order
# (taken from the data source's metadata). The trailing number is the
# zero-based column position each name is assigned to.
col_name = [
    'Unique identifier for interactor A',        # 0
    'Unique identifier for interactor B',        # 1
    'Alternative identifier for interactor A',   # 2
    'Alternative identifier for interactor B',   # 3
    'Protein A (gene name)',                     # 4
    'Protein B (gene name)',                     # 5
    'Interaction detection methods',             # 6
    'First author',                              # 7
    'PubMed ID',                                 # 8
    'NCBI Taxonomy identifier for interactor A', # 9
    'NCBI Taxonomy identifier for interactor B', # 10
    'Interaction types',                         # 11
    'Source databases',                          # 12
    'Interaction identifier(s)',                 # 13
    'Confidence score',                          # 14
]

In [5]:
# Label the positional columns (0, 1, 2, ...) with their metadata names.
# A single position -> name mapping is applied in one in-place rename.
mint_df.rename(columns=dict(enumerate(col_name)), inplace=True)

In [6]:
# Keep only interactions where BOTH partners are annotated as human or mouse.
#
# The annotation is matched as a case-sensitive substring ("human" / "mouse")
# of the two gene-name columns. Boolean masks replace the former `.ix`
# positional lookups: `.ix` was deprecated in pandas 0.20 and removed in 1.0.
# `na=False` makes missing cells count as "no match" instead of raising
# TypeError, as the previous `"human" in item` test did on NaN.
species_pattern = 'human|mouse'

# Pass 1: filter on interactor A, then renumber the rows 0..n-1.
is_species_a = mint_df['Protein A (gene name)'].str.contains(species_pattern, na=False)
mint_df = mint_df[is_species_a].reset_index(drop=True)

# Pass 2: filter on interactor B. The index is intentionally left as-is
# (not renumbered), matching what downstream cells received before.
is_species_b = mint_df['Protein B (gene name)'].str.contains(species_pattern, na=False)
mint_df = mint_df[is_species_b]

In [7]:
# Select only the relevant columns: both interactors, the publication
# reference, and the source database.
# `.copy()` detaches the result from the filtered frame it was sliced from,
# so the in-place edits made by later cells act on an independent DataFrame
# and do not trigger pandas' SettingWithCopyWarning.
mint_df = mint_df[['Protein A (gene name)', 'Protein B (gene name)', 'PubMed ID', 'Source databases']].copy()

In [8]:
# Change the interactor columns to show just the gene name.
# The helper is not defined in this notebook (presumably star-imported from
# Utilities). Its return value is discarded, so it is expected to modify
# mint_df in place; the preview in the next cell shows bare symbols such as
# Rgs12 and FBXO5.
changePPIToShowGeneName(mint_df)

Progress: 100%  54480 Out of 54480   

In [9]:
# Preview the first five rows to confirm the gene-name columns look right.
mint_df.head(n=5)

Unnamed: 0,Protein A (gene name),Protein B (gene name),PubMed ID,Source databases
0,Rgs12,Dlgap3,mint:MINT-5219791|pubmed:17380122|imex:IM-11437,"psi-mi:""MI:0471""(MINT)"
1,FBXO5,SKP1,mint:MINT-5219791|pubmed:17380122|imex:IM-11437,"psi-mi:""MI:0471""(MINT)"
2,Rgs12,Map2k2,mint:MINT-5219791|pubmed:17380122|imex:IM-11437,"psi-mi:""MI:0471""(MINT)"
3,Epn1,Itsn1,mint:MINT-5219791|pubmed:17380122|imex:IM-11437,"psi-mi:""MI:0471""(MINT)"
4,Rgs12,Map2k2,mint:MINT-5219791|pubmed:17380122|imex:IM-11437,"psi-mi:""MI:0471""(MINT)"


# Map Genes to Human, Up-to-Date Approved Symbols

In [10]:
# Map both interactor columns through mappingFile.
# The helper is not defined in this notebook (presumably from Utilities) and
# its return value is discarded, so it is expected to update mint_df in place.
# NOTE(review): the recorded outputs show 54480 rows processed here and 52031
# rows in the next cell, so rows appear to be dropped -- presumably those with
# no mapping; confirm in Utilities.
mapgenesymbols(mint_df, mappingFile, 'Protein A (gene name)', 'Protein B (gene name)')

Progeres: 100%  54480 Out of 54480   

In [11]:
# Sanity check: (rows, columns) remaining after the symbol mapping.
mint_df.shape

(52031, 4)

# Get PubMed Ids

In [12]:
# Keep only the PubMed ID as the publication identifier.
# The raw column holds pipe-separated references, e.g.
# "mint:MINT-5219791|pubmed:17380122|imex:IM-11437".
# The helper is not defined in this notebook (presumably from Utilities); its
# return value is discarded, so it is expected to rewrite the column in place.
getPubMedIds(mint_df, 'PubMed ID')

# Unfiltered

In [13]:
# "uf" stands for unfiltered. Work on an independent deep copy so that
# mint_df itself stays untouched for the filtered branch further down.
mint_uf_df = mint_df.copy(deep=True)

### Combine duplicate PPIs while concatenating references

In [14]:
# Merge duplicate interactions in the unfiltered table.
# The helper is not defined in this notebook (presumably from Utilities); its
# return value is discarded, so it is expected to modify mint_uf_df in place.
combineDupPPIs(mint_uf_df)

Progress: 100%  25468 Out of 25468   

### Create .sig File

In [15]:
# Output directory for the unfiltered result. The path starts with "~", so it
# relies on createSigFile (or whatever it calls) to resolve the home directory
# -- TODO confirm in Utilities.
pathU = '~/./Documents/Harmonizome/PPI Library/Output/IndividualResources/Unfiltered/'

# Write the .sig file for the 'mint' resource. The final argument is False
# here and True in the filtered section; presumably it marks whether the data
# is filtered -- verify against createSigFile.
createSigFile(mint_uf_df, pathU, 'mint', False)

# Filtered

In [16]:
# "f" stands for filtered. Start from an independent deep copy of the mapped
# table so the filtering below does not alter mint_df.
mint_f_df = mint_df.copy(deep=True)

In [17]:
# Drop any data published with more than 10 PPIs per publication (the
# threshold is the second argument), or that doesn't have a PPI.
# The helper is not defined in this notebook (presumably from Utilities); its
# return value is discarded, so it is expected to modify mint_f_df in place.
# NOTE(review): the recorded output reads "Progress: 116%  3639 Out of 3129",
# so the helper's progress total does not match the number of items it
# iterates over -- check the counter in Utilities.filterPPIbyPubmed.
filterPPIbyPubmed(mint_f_df, 10)

Progress: 116%  3639 Out of 3129   

### Combine duplicate PPIs while concatenating references

In [18]:
# Merge duplicate interactions in the filtered table.
# The helper is not defined in this notebook (presumably from Utilities); its
# return value is discarded, so it is expected to modify mint_f_df in place.
combineDupPPIs(mint_f_df)

Progress: 100%  5103 Out of 5103   

### Create .sig File

In [19]:
# Output directory for the filtered result. The path starts with "~", so it
# relies on createSigFile (or whatever it calls) to resolve the home directory
# -- TODO confirm in Utilities.
pathF = '~/./Documents/Harmonizome/PPI Library/Output/IndividualResources/Filtered/'

# Write the .sig file for the 'mint' resource. The final argument is True
# here and False in the unfiltered section; presumably it marks whether the
# data is filtered -- verify against createSigFile.
createSigFile(mint_f_df, pathF, 'mint', True)