# MINT Data Formatting
This file takes data regarding kinase-protein interactions from the MINT database and converts the data into the .gmt format. The data was retrieved from the MINT database on Thu, Jun 15 2017. This data will be added to enhance the KEA2 database and will be suitably formatted for use by ENRICHR and X2K. The citation for the data used in this script is "MINT, the molecular interaction database: 2012 update. Licata L, Briganti L, Peluso D, Perfetto L, Iannuccelli M, Galeota E, Sacco F, Palma A, Nardozza AP, Santonico E, Castagnoli L, Cesareni G. Nucleic Acids Res. 2012 Jan;40(Database issue):D857-61. doi: 10.1093/nar/gkr930. Epub 2011 Nov 16." and can also be found on the corresponding gh.pages website.

## Import packages necessary for following program

In [28]:
%run /home/maayanlab/Desktop/Projects/KEA3/Scripts/init.ipy
%run /home/maayanlab/Desktop/Projects/KEA3/Scripts/Scripts.py

## Create a dataframe from a file containing MINT data

In [29]:
#column names as retrieved from the MINT documentation
colnames = ['identifier_A', 'identifier_B', 'alt_A', 'alt_B', 'alias_A', 'alias_B',
           'interaction_detection_method', 'first author', 'identifier_of_pub', 
           'ncbi_taxid_A', 'ncbi_taxid_B', 'interaction_type', 'source_database',
           'interaction_identifier', 'confidence_score', 'complex_expansion', 
            'biorole_A', 'biorole_B', 'exp_role_A', 'exp_role_B', 'interactor_typeA'
           , 'interactor_typeB', 'Xref_A', 'Xref_B', 'xref_interaction', 'annot_A',
           'annot_B', 'ncbi_taxid_host', 'interaction_params', 'creation_date',
           'update_date', 'checksum_A', 'checksum_B', 'checksum_interaction', 'negative',
           'features_B', 'stoich_A', 'stoich_B', 'identification_method_A',
           'identification_method_B', 'psi-mi A', 'psi-mi B']

#read in data using full list of columns
m = pd.read_table('~/Desktop/Projects/KEA3/MINT_MiTab.txt', names = colnames)

In [30]:
#View dataframe
m.head()

Unnamed: 0,identifier_A,identifier_B,alt_A,alt_B,alias_A,alias_B,interaction_detection_method,first author,identifier_of_pub,ncbi_taxid_A,...,checksum_B,checksum_interaction,negative,features_B,stoich_A,stoich_B,identification_method_A,identification_method_B,psi-mi A,psi-mi B
0,uniprotkb:P22139,uniprotkb:P16370,intact:EBI-15802|uniprotkb:D6W2R6,intact:EBI-15773|uniprotkb:D6VVQ8,psi-mi:rpab5_yeast(display_long)|uniprotkb:YOR...,psi-mi:rpb3_yeast(display_long)|uniprotkb:YIL0...,"psi-mi:""MI:0676""(tandem affinity purification)",Krogan et al. (2006),mint:MINT-5218454|pubmed:16554755|imex:IM-15332,"taxid:559292(yeast)|taxid:559292(""Saccharomyce...",...,rogid:cr2DgnGnKtGbd84HyGNM4xLS4FM559292,rogid:JMIGnvEpr4Scb2i5tqjqSV2wvEw559292,intact-crc:E217044E57716137|rigid:x2uppdTBgoEy...,False,calmodulin binding peptide plus protein a tag:...,-,1,1,"psi-mi:""MI:0078""(nucleotide sequence identific...","psi-mi:""MI:0658""(multidimensional protein iden..."
1,uniprotkb:P50106,uniprotkb:P22139,intact:EBI-15750|uniprotkb:D6VSD7,intact:EBI-15802|uniprotkb:D6W2R6,psi-mi:rpa14_yeast(display_long)|uniprotkb:RPA...,psi-mi:rpab5_yeast(display_long)|uniprotkb:YOR...,"psi-mi:""MI:0676""(tandem affinity purification)",Krogan et al. (2006),mint:MINT-5218454|pubmed:16554755|imex:IM-15332,"taxid:559292(yeast)|taxid:559292(""Saccharomyce...",...,rogid:fIDAOOpJrLwSXy7vygwP4Cu96Ww559292,rogid:cr2DgnGnKtGbd84HyGNM4xLS4FM559292,intact-crc:F877914BAF04FA84|rigid:tsaqThE60JlS...,False,-,calmodulin binding peptide plus protein a tag:...,1,1,"psi-mi:""MI:0658""(multidimensional protein iden...","psi-mi:""MI:0078""(nucleotide sequence identific..."
2,uniprotkb:P22139,uniprotkb:P07703,intact:EBI-15802|uniprotkb:D6W2R6,intact:EBI-15831|uniprotkb:D6W4A9,psi-mi:rpab5_yeast(display_long)|uniprotkb:YOR...,psi-mi:rpac1_yeast(display_long)|uniprotkb:RPC...,"psi-mi:""MI:0676""(tandem affinity purification)",Krogan et al. (2006),mint:MINT-5218454|pubmed:16554755|imex:IM-15332,"taxid:559292(yeast)|taxid:559292(""Saccharomyce...",...,rogid:cr2DgnGnKtGbd84HyGNM4xLS4FM559292,rogid:erepZ6yX0AYInV5koxEj9V9E2pI559292,intact-crc:BA67000B6C6FDB37|rigid:WI8zmM8YwnP0...,False,calmodulin binding peptide plus protein a tag:...,-,1,1,"psi-mi:""MI:0078""(nucleotide sequence identific...","psi-mi:""MI:0658""(multidimensional protein iden..."
3,uniprotkb:P04051,uniprotkb:P22139,intact:EBI-15810|uniprotkb:D6W2H5,intact:EBI-15802|uniprotkb:D6W2R6,psi-mi:rpc1_yeast(display_long)|uniprotkb:RPO3...,psi-mi:rpab5_yeast(display_long)|uniprotkb:YOR...,"psi-mi:""MI:0676""(tandem affinity purification)",Krogan et al. (2006),mint:MINT-5218454|pubmed:16554755|imex:IM-15332,"taxid:559292(yeast)|taxid:559292(""Saccharomyce...",...,rogid:DoIDVE/XVbuQ8WSo+elSj7Ho5Ww559292,rogid:cr2DgnGnKtGbd84HyGNM4xLS4FM559292,intact-crc:29B003419662EB86|rigid:nFLdGkUKY75h...,False,-,calmodulin binding peptide plus protein a tag:...,1,1,"psi-mi:""MI:0658""(multidimensional protein iden...","psi-mi:""MI:0078""(nucleotide sequence identific..."
4,uniprotkb:P22139,uniprotkb:P16370,intact:EBI-15802|uniprotkb:D6W2R6,intact:EBI-15773|uniprotkb:D6VVQ8,psi-mi:rpab5_yeast(display_long)|uniprotkb:YOR...,psi-mi:rpb3_yeast(display_long)|uniprotkb:YIL0...,"psi-mi:""MI:0676""(tandem affinity purification)",Krogan et al. (2006),mint:MINT-5218454|pubmed:16554755|imex:IM-15332,"taxid:559292(yeast)|taxid:559292(""Saccharomyce...",...,rogid:cr2DgnGnKtGbd84HyGNM4xLS4FM559292,rogid:JMIGnvEpr4Scb2i5tqjqSV2wvEw559292,intact-crc:8328974676378558|rigid:x2uppdTBgoEy...,False,calmodulin binding peptide plus protein a tag:...,-,1,1,"psi-mi:""MI:0078""(nucleotide sequence identific...","psi-mi:""MI:0658""(multidimensional protein iden..."


## Create dataframe with only data pertinent to .gmt and .sig files

In [None]:
#Filter only columns necessary for .gmt format
#We will be including all data, regardless of confidence score, so confidence
#score is not necessary for this dataframe
#for purpose of the .sig file, will also be using column 'identifier of pub'
mint = m[['identifier_A', 'identifier_B', 'ncbi_taxid_A', 'ncbi_taxid_B', 
          'interaction_type', 'biorole_A', 'biorole_B', 'identifier_of_pub']]

#Drop all duplicate values in the data
mint.drop_duplicates(['identifier_A', 'identifier_B', 'ncbi_taxid_A', 'ncbi_taxid_B', 
          'interaction_type', 'biorole_A', 'biorole_B'], inplace = True)

#Drop all NaNs in data
mint.dropna(inplace = True)

#View dataframe
mint.head(50)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Unnamed: 0,identifier_A,identifier_B,ncbi_taxid_A,ncbi_taxid_B,interaction_type,biorole_A,biorole_B,identifier_of_pub
0,uniprotkb:P22139,uniprotkb:P16370,"taxid:559292(yeast)|taxid:559292(""Saccharomyce...","taxid:559292(yeast)|taxid:559292(""Saccharomyce...","psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0499""(unspecified role)","psi-mi:""MI:0499""(unspecified role)",mint:MINT-5218454|pubmed:16554755|imex:IM-15332
1,uniprotkb:P50106,uniprotkb:P22139,"taxid:559292(yeast)|taxid:559292(""Saccharomyce...","taxid:559292(yeast)|taxid:559292(""Saccharomyce...","psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0499""(unspecified role)","psi-mi:""MI:0499""(unspecified role)",mint:MINT-5218454|pubmed:16554755|imex:IM-15332
2,uniprotkb:P22139,uniprotkb:P07703,"taxid:559292(yeast)|taxid:559292(""Saccharomyce...","taxid:559292(yeast)|taxid:559292(""Saccharomyce...","psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0499""(unspecified role)","psi-mi:""MI:0499""(unspecified role)",mint:MINT-5218454|pubmed:16554755|imex:IM-15332
3,uniprotkb:P04051,uniprotkb:P22139,"taxid:559292(yeast)|taxid:559292(""Saccharomyce...","taxid:559292(yeast)|taxid:559292(""Saccharomyce...","psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0499""(unspecified role)","psi-mi:""MI:0499""(unspecified role)",mint:MINT-5218454|pubmed:16554755|imex:IM-15332
5,uniprotkb:P22138,uniprotkb:P22139,"taxid:559292(yeast)|taxid:559292(""Saccharomyce...","taxid:559292(yeast)|taxid:559292(""Saccharomyce...","psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0499""(unspecified role)","psi-mi:""MI:0499""(unspecified role)",mint:MINT-5218454|pubmed:16554755|imex:IM-15332
6,uniprotkb:P04050,uniprotkb:P22139,"taxid:559292(yeast)|taxid:559292(""Saccharomyce...","taxid:559292(yeast)|taxid:559292(""Saccharomyce...","psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0499""(unspecified role)","psi-mi:""MI:0499""(unspecified role)",mint:MINT-5218454|pubmed:16554755|imex:IM-15332
7,uniprotkb:P22139,uniprotkb:P04050,"taxid:559292(yeast)|taxid:559292(""Saccharomyce...","taxid:559292(yeast)|taxid:559292(""Saccharomyce...","psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0499""(unspecified role)","psi-mi:""MI:0499""(unspecified role)",mint:MINT-5218454|pubmed:16554755|imex:IM-15332
8,uniprotkb:P34087,uniprotkb:P22139,"taxid:559292(yeast)|taxid:559292(""Saccharomyce...","taxid:559292(yeast)|taxid:559292(""Saccharomyce...","psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0499""(unspecified role)","psi-mi:""MI:0499""(unspecified role)",mint:MINT-5218454|pubmed:16554755|imex:IM-15332
9,uniprotkb:P22139,uniprotkb:P04051,"taxid:559292(yeast)|taxid:559292(""Saccharomyce...","taxid:559292(yeast)|taxid:559292(""Saccharomyce...","psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0499""(unspecified role)","psi-mi:""MI:0499""(unspecified role)",mint:MINT-5218454|pubmed:16554755|imex:IM-15332
11,uniprotkb:P22139,uniprotkb:P08518,"taxid:559292(yeast)|taxid:559292(""Saccharomyce...","taxid:559292(yeast)|taxid:559292(""Saccharomyce...","psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0499""(unspecified role)","psi-mi:""MI:0499""(unspecified role)",mint:MINT-5218454|pubmed:16554755|imex:IM-15332


## Filter by interaction type to only choose phosphorylation reactions

In [None]:
#Obtain indices of interactions that are of type 'phosphorylation reaction'
#PSI - MI for a phosphorylation reaction is MI: 0217
indices = [index for index, rowData in mint.iterrows()
           if rowData['interaction_type'] == 'psi-mi:"MI:0217"(phosphorylation reaction)']

#Filter out dataframe 'mint' to choose only phosphorylation reactions
mint_filter = mint.loc[indices]

#View dataframe
mint_filter.head(50)

## Filter by Organism (Mouse and Human only)

In [None]:
# Replace taxid IDs for human and mouse with respective scientific names
#taxid_human - 9606
#taxid_mouse - 10090
mint_filter = mint_filter.replace('taxid:9606(human)|taxid:9606(Homo sapiens)',
                    'Homo sapiens')
mint_filter = mint_filter.replace('taxid:10090(mouse)|taxid:10090(Mus musculus)',
                    'Mus musculus')

#Filter dataframe for data only pertaining to either human or mouse species 
#for protein A
indices = [index for index, rowData in mint_filter.iterrows() if 
           rowData['ncbi_taxid_A'] == 'Homo sapiens' 
           or rowData['ncbi_taxid_A'] == 'Mus musculus']

mint_filter2 = mint_filter.loc[indices]

#Filter dataframe for data only pertaining to either human or mouse species 
#for protein B
indices2 = [index for index, rowData in mint_filter.iterrows() if 
           rowData['ncbi_taxid_B'] == 'Homo sapiens' 
           or rowData['ncbi_taxid_B'] == 'Mus musculus']

mint_filter3 = mint_filter2.loc[indices2]

indices3 = [index for index, rowData in mint_filter.iterrows() if 
           rowData['ncbi_taxid_B'] == rowData['ncbi_taxid_A']]

df = mint_filter3.loc[indices3]

#Drop any NaNs in the dataset (resulting from rows in which only one of the
#species was either human or mouse)
df.dropna(inplace = True)

In [None]:
#View dataframe
df.head(50)

## Convert UniProt IDs into Gene Symbols

In [None]:
#separate accession number from 'uniprotkb:' string for protein A
for index, string in df.identifier_A.iteritems():
    df.identifier_A[index] = string.split(":", 2)[1]

#Use uniprot_to_symbol function from Scripts.py to convert
df['gene_symbol_A'] = Scripts.uniprot_to_symbol(df['identifier_A'].tolist())

In [None]:
#separate accession number from 'uniprotkb:' string for protein B
for index, string in df.identifier_B.iteritems():
    df.identifier_B[index] = string.split(":", 2)[1]

#Use uniprot_to_symbol function from Scripts.py to convert
df['gene_symbol_B'] = Scripts.uniprot_to_symbol(df['identifier_B'].tolist())

In [None]:
#View Dataframe
df.head()


## Reformat dataframe

In [None]:
#Replace all instances of 'None' with 'NaN' and drop all  'NaN'
df = df.replace('None', np.nan)
df.dropna(inplace = True)

# Switch from PSI-MI ID to 'kinase' and 'target' labels
df = df.replace('psi-mi:"MI:0501"(enzyme)', 'Kinase')
df = df.replace('psi-mi:"MI:0502"(enzyme target)', 'Target')

#drop columns no longer needed
df.drop('identifier_A', axis=1, inplace = True)
df.drop('identifier_B', axis=1, inplace = True)
df.drop('interaction_type', axis=1, inplace = True)

In [None]:
#View dataframe
df.head()

## Sort Proteins by Kinase/Target Role and Specify Species of Kinase

In [None]:
#Add two columns 'Kinase_Organism' and 'Target'
df.insert(0, 'Kinase_organism', 'None')
df.insert(0, 'Target', 'None')

#Determine whether protein A or protein B is the kinase, and designate the two
#proteins into the 'Kinase_organism' or 'Target' columns accordingly
for index, rowData in df.biorole_A.iteritems():
    if rowData == 'Kinase':
        df.gene_symbol_A[index] = '_'.join([df.gene_symbol_A[index], df.ncbi_taxid_A[index]])
        df.Kinase_organism[index] = df.gene_symbol_A[index]
        df.Target[index] = df.gene_symbol_B[index]
    else:
        df.gene_symbol_B[index] =  '_'.join([df.gene_symbol_B[index], df.ncbi_taxid_B[index]])
        df.Kinase_organism[index] = df.gene_symbol_B[index]
        df.Target[index] = df.gene_symbol_A[index]

#Drop columns which are no longer needed in the dataframe
df.drop('gene_symbol_A', axis=1, inplace = True)
df.drop('gene_symbol_B', axis=1, inplace = True)
df.drop('ncbi_taxid_A', axis=1, inplace = True)
df.drop('ncbi_taxid_B', axis=1, inplace = True)
df.drop('biorole_A', axis=1, inplace = True)

#Create copy of df (with reference) as df_sig for future use in creating .sig file
df_sig = df.drop('biorole_B', axis=1)
df.drop('biorole_B', axis=1, inplace = True)

#View dataframe
df_sig.head()

df.drop('identifier_of_pub', axis = 1, inplace = True)

In [None]:
df.drop_duplicates(inplace = True)

## Set Index to 'Kinase_Organism' and Aggregate Kinase Targets

In [None]:
#Set index to 'Kinase_organism'
df.set_index('Kinase_organism')

#Group kinases in dataframe 'kin'
#Aggregate data in 'kin' according to kinase groups
kin = df.groupby('Kinase_organism').agg(lambda x: tuple(x))

#Create a new column 'MINT' as description of data
kin.insert(0, 'Description', 'MINT')

#Visualize Data
kin.head()

# Exploratory Data Analysis

## Calculate Number of Protein targets for each kinase
Create a new column with the number of substrates related to each kinase, and sort the dataframe by this column

In [None]:
# Create column representing counts of protein targets per kinase
kin['kinase_substrate_num'] = [len(lst) for kinase, lst in kin['Target'].iteritems()]

# Sort kinases from max to min according to number of protein targets each has
kin.sort_values(by = ['kinase_substrate_num'], ascending= False, inplace=True)

# View dataframe
kin.head()

## Create Histogram to display distribution of number of targets per kinase

In [None]:
# Create histogram displaying the distribution of the number
#targets per kinase
kin.plot.hist(by = 'kinase_substrate_num', bins = 63)

#Show histogram
plt.show()

## Filter Dataframe by the Number of Substrates for each Kinase
If the kinase has fewer than five targets, this kinase will not be included in the final .gmt file. Instead, its information will be carried over into a .sig file.

In [None]:
df_sig.head()

In [None]:
#Create list of indexes(kinase names) for kinases with fewer than five substrates
sig_key = [index for index, rowData in kin.kinase_substrate_num.iteritems() if rowData < 5 ]

#Set indices for dataframe 'kin' of kinases with five or more substrates
indices = [index for index in kin.index if index not in sig_key]

#Create dataframe of only kinases with five or fewer substrates
sig = df_sig.set_index('Kinase_organism').loc[indices]

#Reset index of new dataframe
sig.reset_index(inplace = True)

#Rename columns of this dataframe to match .sig format
colnames = ['Source', 'Substrate', 'PubMed_ID']
sig.columns = colnames


In [None]:
#Remove kinases with fewer than five substrates in 'kin'
kin = kin.loc[indices]

In [None]:
#drop any duplicates
sig.drop_duplicates(inplace = True)

### Reformat "PubMed_ID" column to display only PubMed ID with no other IDs

In [None]:
for index, string in sig.PubMed_ID.iteritems():
    s = string
    s = s.split("pubmed:", 2)[1]
    if "|" in s:
        s = s.split("|", 2)[0]
    sig.PubMed_ID[index] = s


## Create File of 'df_sig' for Later Data Aggregation

In [None]:
#Create dictionary with index numbers as keys
d = dict([(key, '') for key in sig.index])

# loop through rows with iterrows()
for index, rowData in sig.iterrows():
    line = ('\t'.join(rowData))
    d[index] = line
    
#Transfer tab-separated info into a new txt file
with open('Uncondensed_MINT.txt', 'w') as openfile:
    for index in d:
        openfile.write(str(d[index]) + '\n')

## Create .Sig File

In [None]:
#Insert first four columns for .sig file format (NaNs)
sig.insert(1, 'NA-1', str(np.nan))
sig.insert(2, 'NA-2', str(np.nan))
sig.insert(3, 'NA-3', str(np.nan))
sig.insert(4, 'NA-4', str(np.nan))

#Insert column corresponding to sign (with '?' because it is unknown)
sig.insert(6, 'Sign', '?')

#Insert column specifying interaction type as phosphorylation
sig.insert(7, 'Interaction', 'Phosphorylation')

#View dataframe
sig.head()

In [None]:
#Create dictionary 'sigd' with index numbers as keys
sigd = dict([(key, '') for key in df_sig.index])

# loop through rows with iterrows()
for index, rowData in df_sig.iterrows():
    line = ('\t'.join(rowData))
    sigd[index] = line
    
#Transfer tab-separated info into a new txt file
#Make sure to include in ReadMe corresponding column names
with open('MINTsig.txt', 'w') as openfile:
    for index in sigd:
        openfile.write(str(sigd[index]) + '\n')

# Creation of Final .GMT File

## Create dictionary of Tab-Separated Rows of the dataframe

In [None]:
#Reset index of the dataframe
kin.reset_index(inplace = True)

#create column 'acc_merged' in which all 'acc' elements are joined by a \t symbol
kin['target_merged'] = ['\t'.join(x) for x in kin['Target']]

#drop the now-unneccesary column 'Substrates'
kin.drop('Target', axis=1, inplace = True)

#also drop the data-exploratory column 'kinase_substrate_num'
kin.drop('kinase_substrate_num', axis=1, inplace = True)

#Create dictionary 'MINT' with index numbers as keys
MINT_num = dict([(key, '') for key in kin.index])

# loop through rows with iterrows()
for index, rowData in kin.iterrows():
    line = ('\t'.join(rowData))
    MINT_num[index] = line

## Write info from Dictionary into a .GMT file

In [None]:
#Transfer tab-separated info into a new txt file
with open('MINT.gmt', 'w') as openfile:
    for index in MINT_num:
        openfile.write(str(MINT_num[index]) + '\n')