# MINT Data Formatting (PPI) to SIG and GMT

# Formatting Data of Human Samples

In [None]:
import numpy as np 
import pandas as pd
import urllib.request 
import requests, io
import urllib, re, string
import sys, datetime

In [None]:
pd.set_option('display.notebook_repr_html', True)


def _repr_latex_(self):
    """Return the object's ``to_latex()`` output wrapped in a centering group.

    Installed on ``pandas.DataFrame`` below so that LaTeX renderings of a
    frame are centred.
    """
    # Raw string: a bare "\c" is an invalid escape sequence and triggers a
    # warning on recent Python versions.
    return r"\centering{%s}" % self.to_latex()


pd.DataFrame._repr_latex_ = _repr_latex_  # monkey patch pandas DataFrame

In [None]:
def concatliketerms(df):
    """Merge rows describing the same (Protein A, Protein B) pair, in place.

    The frame is sorted by the two gene-name columns. Every run of rows with
    equal 'Protein A (gene name)' and 'Protein B (gene name)' is collapsed
    into its first row, whose 'PubMed ID' and 'Source databases' values become
    the '|'-joined values of the whole run. PubMed IDs are converted with
    ``str`` when joined; values of rows that have no duplicate are left as
    they are. The index is reset to 0..n-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Protein A (gene name)',
        'Protein B (gene name)', 'PubMed ID' and 'Source databases'.
        It is modified in place.

    Returns
    -------
    None
    """
    key_cols = ['Protein A (gene name)', 'Protein B (gene name)']

    df.sort_values(by=key_cols, inplace=True)
    df.reset_index(drop=True, inplace=True)

    protein_a = df[key_cols[0]].tolist()
    protein_b = df[key_cols[1]].tolist()
    pubmed_ids = df['PubMed ID'].tolist()
    sources = df['Source databases'].tolist()

    kept_rows = []       # position of the first row of every run
    merged_pubmed = []   # merged 'PubMed ID' value per kept row
    merged_sources = []  # merged 'Source databases' value per kept row
    for row, (a, b) in enumerate(zip(protein_a, protein_b)):
        # Rows are sorted, so equal pairs are adjacent: comparing with the
        # first row of the current run is enough. Plain == keeps missing
        # values (NaN) from ever being merged with each other.
        if (kept_rows
                and a == protein_a[kept_rows[-1]]
                and b == protein_b[kept_rows[-1]]):
            merged_pubmed[-1] = str(merged_pubmed[-1]) + '|' + str(pubmed_ids[row])
            merged_sources[-1] = merged_sources[-1] + '|' + sources[row]
        else:
            kept_rows.append(row)
            merged_pubmed.append(pubmed_ids[row])
            merged_sources.append(sources[row])

    # After the reset above, labels equal positions, so the duplicate rows can
    # be dropped by position. All changes stay in place because callers
    # ignore the return value.
    kept = set(kept_rows)
    df.drop([row for row in range(len(protein_a)) if row not in kept],
            axis=0, inplace=True)
    df['PubMed ID'] = merged_pubmed
    df['Source databases'] = merged_sources
    df.reset_index(drop=True, inplace=True)

In [None]:
# Names for the first 15 columns of the raw MINT table, in file order
# (as taken from the respective metadata).
col_name = [
    'Unique identifier for interactor A',
    'Unique identifier for interactor B',
    'Alternative identifier for interactor A',
    'Alternative identifier for interactor B',
    'Protein A (gene name)',
    'Protein B (gene name)',
    'Interaction detection methods',
    'First author',
    'PubMed ID',
    'NCBI Taxonomy identifier for interactor A',
    'NCBI Taxonomy identifier for interactor B',
    'Interaction types',
    'Source databases',
    'Interaction identifier(s)',
    'Confidence score',
]

In [None]:
## read in data from MINT and write to file (only needs to be performed once)
# content=urllib.request.urlopen('http://mint.bio.uniroma2.it/mitab/MINT_MiTab.txt')

# target = open('mint.tsv', 'wb')
# for line in content:
#     target.write(line)
# target.close()

In [None]:
# Load the locally saved MINT table (tab-separated). The file has no header row,
# so the columns are numbered 0..N-1 until they are renamed.
mint_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Input/mint.tsv', sep='\t', index_col=False, header=None)

In [None]:
# assign names to columns from meta data: column number k becomes col_name[k]
mint_df.rename(columns=dict(enumerate(col_name)), inplace=True)

In [None]:
# get only human (or mouse) data (first protein)
# Keep the rows whose 'Protein A (gene name)' field contains the text "human".
# A boolean mask is used instead of DataFrame.ix, which no longer exists in
# current pandas releases.
is_human = ["human" in item for item in mint_df['Protein A (gene name)'].values]
mint_df = mint_df[is_human]

In [None]:
# renumber the rows 0..n-1 after the previous filter
mint_df = mint_df.reset_index(drop=True)

# get only human (or mouse) data (second protein)
# Keep the rows whose 'Protein B (gene name)' field contains the text "human".
# A boolean mask is used instead of DataFrame.ix, which no longer exists in
# current pandas releases.
is_human = ["human" in item for item in mint_df['Protein B (gene name)'].values]
mint_df = mint_df[is_human]

In [None]:
# select only relevant data: the two interactors, the publication and the source
relevant_columns = [
    'Protein A (gene name)',
    'Protein B (gene name)',
    'PubMed ID',
    'Source databases',
]
mint_df = mint_df[relevant_columns]

In [None]:
# change column one to just show gene name
gene_names = []
for alias_field in mint_df['Protein A (gene name)']:
    # the field is a '|'-separated list; look for the entry tagged '(gene name)'
    tagged = [alias for alias in str(alias_field).split('|') if '(gene name)' in alias]
    if tagged:
        # split the first tagged entry on ':' and keep the second piece up to its first '('
        gene_names.append(tagged[0].split(':')[1].split('(')[0])
    else:
        # no gene name available for this row
        gene_names.append(np.nan)
mint_df['Protein A (gene name)'] = gene_names

In [None]:
# change column two to just show gene name
gene_names = []
for alias_field in mint_df['Protein B (gene name)']:
    # the field is a '|'-separated list; look for the entry tagged '(gene name)'
    tagged = [alias for alias in str(alias_field).split('|') if '(gene name)' in alias]
    if tagged:
        # split the first tagged entry on ':' and keep the second piece up to its first '('
        gene_names.append(tagged[0].split(':')[1].split('(')[0])
    else:
        # no gene name available for this row
        gene_names.append(np.nan)
mint_df['Protein B (gene name)'] = gene_names

In [None]:
# drop rows for which no gene name is provided for either protein
# (the extraction cells above insert NaN when no '(gene name)' entry exists)
mint_df.dropna(how='any', inplace=True, axis=0)

In [None]:
# Get Only PubMed ID for publication identifier
# Each field is split on '|'; the second entry is split on ':' and its second
# piece is kept. NOTE(review): this assumes the PubMed reference is always the
# second '|'-separated entry of the field -- confirm against the MINT export.
mint_df['PubMed ID'] = [
    str(publication).split('|')[1].split(':')[1]
    for publication in mint_df['PubMed ID']
]

In [None]:
# Independent copy for the unfiltered view, so that merging like terms on it
# does not touch mint_df, which is filtered further below.
mint_uf_df = mint_df.copy()

In [None]:
# Merge rows that share the same protein pair (modifies mint_uf_df in place).
concatliketerms(mint_uf_df)

In [None]:
# Preview the first rows of the merged, unfiltered network.
mint_uf_df.head()

In [None]:
# Date-stamped (YYYY_MM_DD) output path for the unfiltered human network.
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Unfiltered/ppi_network_mint_human_unfiltered_%s.tsv'% str(datetime.date.today()).replace('-', '_')
# Write the frame whose like terms were merged (mint_uf_df); writing mint_df
# here would leave that merged copy unused.
mint_uf_df.to_csv(filename, sep='\t', index=False)

In [None]:
# Date-stamped (YYYY_MM_DD) output path for the gzip-compressed unfiltered human network.
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Unfiltered/ppi_network_mint_human_unfiltered_%s.tsv.gz'% str(datetime.date.today()).replace('-', '_')
# Write the frame whose like terms were merged (mint_uf_df); writing mint_df
# here would leave that merged copy unused.
mint_uf_df.to_csv(filename, sep='\t', index=False, compression='gzip')

In [None]:
# drop any data that was published with more than 10 PPIs per publication
# value_counts() tallies the rows of every PubMed ID once, instead of
# re-filtering the whole frame for each row.
rows_per_publication = mint_df['PubMed ID'].map(mint_df['PubMed ID'].value_counts())
# Missing IDs are not counted by value_counts(); treat them as 0 rows so they
# are kept here, as before.
mint_df = mint_df[rows_per_publication.fillna(0) <= 10]

In [None]:
# Drop data for which there is no PubMed ID
# (rows whose ID text contains 'unassigned'); astype(str) keeps the test from
# failing on values that are not strings.
is_unassigned = mint_df['PubMed ID'].astype(str).str.contains('unassigned', regex=False)
mint_df = mint_df[~is_unassigned]

In [None]:
# Drop any row with a missing value in any of the remaining columns
mint_df.dropna(inplace=True)

In [None]:
# Merge rows that share the same protein pair (modifies mint_df in place).
concatliketerms(mint_df)

In [None]:
# Show (rows, columns) of the filtered, merged human network.
mint_df.shape

In [None]:
# Date-stamped (YYYY_MM_DD) output path for the filtered human network.
# The date expression had been overwritten by a rewritten URL, which made the
# line a syntax error; it is restored to match the other export cells.
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Filtered/ppi_network_mint_human_filtered_%s.tsv'% str(datetime.date.today()).replace('-', '_')
mint_df.to_csv(filename, sep='\t', index=False)

In [None]:
# Date-stamped (YYYY_MM_DD) output path for the gzip-compressed filtered human network.
# The date expression had been overwritten by a rewritten URL, which made the
# line a syntax error; it is restored to match the other export cells.
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Filtered/ppi_network_mint_human_filtered_%s.tsv.gz'% str(datetime.date.today()).replace('-', '_')
mint_df.to_csv(filename, sep='\t', index=False, compression='gzip')

# Formatting Data of Mouse Samples

In [None]:
# Reload the locally saved MINT table (tab-separated, no header row) to start
# the mouse pass from the raw data.
mint_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Input/mint.tsv', sep='\t', index_col=False, header=None)

In [None]:
# assign names to columns from meta data: column number k becomes col_name[k]
mint_df.rename(columns=dict(enumerate(col_name)), inplace=True)

In [None]:
# get only mouse data (first protein)
# Keep the rows whose 'Protein A (gene name)' field contains the text "mouse".
# A boolean mask is used instead of DataFrame.ix, which no longer exists in
# current pandas releases.
is_mouse = ["mouse" in item for item in mint_df['Protein A (gene name)'].values]
mint_df = mint_df[is_mouse]

In [None]:
# renumber the rows 0..n-1 after the previous filter
mint_df = mint_df.reset_index(drop=True)

# get only mouse data (second protein)
# Keep the rows whose 'Protein B (gene name)' field contains the text "mouse".
# A boolean mask is used instead of DataFrame.ix, which no longer exists in
# current pandas releases.
is_mouse = ["mouse" in item for item in mint_df['Protein B (gene name)'].values]
mint_df = mint_df[is_mouse]

In [None]:
# select only relevant data: the two interactors, the publication and the source
relevant_columns = [
    'Protein A (gene name)',
    'Protein B (gene name)',
    'PubMed ID',
    'Source databases',
]
mint_df = mint_df[relevant_columns]

In [None]:
# change column one to just show gene name
gene_names = []
for alias_field in mint_df['Protein A (gene name)']:
    # the field is a '|'-separated list; look for the entry tagged '(gene name)'
    tagged = [alias for alias in str(alias_field).split('|') if '(gene name)' in alias]
    if tagged:
        # split the first tagged entry on ':' and keep the second piece up to its first '('
        gene_names.append(tagged[0].split(':')[1].split('(')[0])
    else:
        # no gene name available for this row
        gene_names.append(np.nan)
mint_df['Protein A (gene name)'] = gene_names

In [None]:
# change column two to just show gene name
gene_names = []
for alias_field in mint_df['Protein B (gene name)']:
    # the field is a '|'-separated list; look for the entry tagged '(gene name)'
    tagged = [alias for alias in str(alias_field).split('|') if '(gene name)' in alias]
    if tagged:
        # split the first tagged entry on ':' and keep the second piece up to its first '('
        gene_names.append(tagged[0].split(':')[1].split('(')[0])
    else:
        # no gene name available for this row
        gene_names.append(np.nan)
mint_df['Protein B (gene name)'] = gene_names

In [None]:
# drop rows for which no gene name is provided for either protein
# (the extraction cells above insert NaN when no '(gene name)' entry exists)
mint_df.dropna(how='any', inplace=True, axis=0)

In [None]:
# Get only PubMed ID for publication identifier
# Each field is split on '|'; the second entry is split on ':' and its second
# piece is kept. NOTE(review): this assumes the PubMed reference is always the
# second '|'-separated entry of the field -- confirm against the MINT export.
mint_df['PubMed ID'] = [
    str(publication).split('|')[1].split(':')[1]
    for publication in mint_df['PubMed ID']
]

In [None]:
# Remove rows that are exact duplicates across all remaining columns.
mint_df.drop_duplicates(inplace=True)

In [None]:
# Independent copy for the unfiltered view, so that merging like terms on it
# does not touch mint_df, which is filtered further below.
mint_uf_df = mint_df.copy()

In [None]:
# Merge rows that share the same protein pair (modifies mint_uf_df in place).
concatliketerms(mint_uf_df)

In [None]:
# Date-stamped (YYYY_MM_DD) output path for the unfiltered mouse network.
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Unfiltered/ppi_network_mint_mouse_unfiltered_%s.tsv'% str(datetime.date.today()).replace('-', '_')
# Write the frame whose like terms were merged (mint_uf_df); writing mint_df
# here would leave that merged copy unused.
mint_uf_df.to_csv(filename, sep='\t', index=False)

In [None]:
# Date-stamped (YYYY_MM_DD) output path for the gzip-compressed unfiltered mouse network.
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Unfiltered/ppi_network_mint_mouse_unfiltered_%s.tsv.gz'% str(datetime.date.today()).replace('-', '_')
# Write the frame whose like terms were merged (mint_uf_df); writing mint_df
# here would leave that merged copy unused.
mint_uf_df.to_csv(filename, sep='\t', index=False, compression='gzip')

In [None]:
# drop any data that was published with more than 10 PPIs per publication
# value_counts() tallies the rows of every PubMed ID once, instead of
# re-filtering the whole frame for each row.
rows_per_publication = mint_df['PubMed ID'].map(mint_df['PubMed ID'].value_counts())
# Missing IDs are not counted by value_counts(); treat them as 0 rows so they
# are kept here, as before.
mint_df = mint_df[rows_per_publication.fillna(0) <= 10]

In [None]:
# Drop data for which there is no PubMed ID
# (rows whose ID text contains 'unassigned'); astype(str) keeps the test from
# failing on values that are not strings.
is_unassigned = mint_df['PubMed ID'].astype(str).str.contains('unassigned', regex=False)
mint_df = mint_df[~is_unassigned]

In [None]:
# Drop any row with a missing value in any of the remaining columns
mint_df.dropna(inplace=True)

In [None]:
# Merge rows that share the same protein pair (modifies mint_df in place).
concatliketerms(mint_df)

In [None]:
# Date-stamped (YYYY_MM_DD) output path for the filtered mouse network.
# The date expression had been overwritten by a rewritten URL, which made the
# line a syntax error; it is restored to match the other export cells.
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Filtered/ppi_network_mint_mouse_filtered_%s.tsv'% str(datetime.date.today()).replace('-', '_')
mint_df.to_csv(filename, sep='\t', index=False)

In [None]:
# Date-stamped (YYYY_MM_DD) output path for the gzip-compressed filtered mouse network.
# The date expression had been overwritten by a rewritten URL, which made the
# line a syntax error; it is restored to match the other export cells.
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Filtered/ppi_network_mint_mouse_filtered_%s.tsv.gz'% str(datetime.date.today()).replace('-', '_')
mint_df.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create SIG File

In [None]:
# Column layout of a SIG file, in output order: five source fields,
# five target fields, then effect, interaction type and references.
sig_col = [
    'Source Name',
    'Source Human Accession',
    'Source Mouse Accession',
    'Source Type',
    'Source Location',
    'Target Name',
    'Target Human Accession',
    'Target Mouse Accession',
    'Target Type',
    'Target Location',
    'Effect',
    'Type of Interaction',
    'PubMed IDs',
]

In [None]:
# Directory of the filtered per-resource networks; the filtered human and mouse
# files are read from here and the SIG file is written here.
path = '~/./Documents/Harmonizome/PPI Library/Output/IndividualResources/Filtered/'

In [None]:
# HOMOLOGENE
# Load the HomoloGene table (tab-separated, no header row, so its columns are
# numbered from 0).
mapping_path = '~/./Documents/Harmonizome/PPI Library/Doc and Mapping/HomoloGene.tsv'
mapping = pd.read_csv(mapping_path, sep='\t', header=None)

In [None]:
# Index the table by its first two columns so rows can be looked up by a
# (column 0, column 1) pair; the cells below use 9606 as the column-1 value of
# the human rows.
mapping.set_index([0, 1], inplace=True)

In [None]:
# Preview the first rows of the indexed HomoloGene table.
mapping.head()

In [None]:
# Load a previously exported filtered human network.
# NOTE(review): the date in the file name is hard-coded -- update it to match
# the export that should be used.
mint_human = pd.read_csv(path+'ppi_network_mint_human_filtered_2017_02_16.tsv', sep='\t')

In [None]:
# Load a previously exported filtered mouse network.
# NOTE(review): the date in the file name is hard-coded -- update it to match
# the export that should be used.
mint_mouse = pd.read_csv(path+'ppi_network_mint_mouse_filtered_2017_02_15.tsv', sep='\t')

In [None]:
# Translate the gene symbols in 'Protein A (gene name)' to human symbols using
# the HomoloGene table. `mapping` is indexed by (column 0, column 1); rows whose
# column-1 value is 9606 are the human entries and column 3 holds the symbol.
# Dictionaries replace the per-row scans of the table and the DataFrame.ix
# lookups (DataFrame.ix no longer exists in current pandas releases).
group_of_symbol = {}  # symbol -> column-0 value of the first row listing it
human_symbols = {}    # column-0 value -> symbols of its human (9606) rows
for (group_id, taxon), symbol in zip(mapping.index, mapping[3]):
    if isinstance(symbol, str):
        group_of_symbol.setdefault(symbol, group_id)
    if taxon == 9606:
        human_symbols.setdefault(group_id, []).append(symbol)

lst = []
for gene in mint_mouse['Protein A (gene name)']:
    # Empty when the gene is not in the table or its group has no human row.
    candidates = human_symbols.get(group_of_symbol.get(gene), [])
    if len(candidates) == 1 and isinstance(candidates[0], str):
        lst.append(candidates[0])
    else:
        # insert nan if the gene is unknown, has no human counterpart, or
        # maps to more than one human gene
        lst.append(np.nan)

mint_mouse['Protein A (gene name)'] = lst

In [None]:
# Translate the gene symbols in 'Protein B (gene name)' to human symbols using
# the HomoloGene table. `mapping` is indexed by (column 0, column 1); rows whose
# column-1 value is 9606 are the human entries and column 3 holds the symbol.
# Dictionaries replace the per-row scans of the table and the DataFrame.ix
# lookups (DataFrame.ix no longer exists in current pandas releases).
group_of_symbol = {}  # symbol -> column-0 value of the first row listing it
human_symbols = {}    # column-0 value -> symbols of its human (9606) rows
for (group_id, taxon), symbol in zip(mapping.index, mapping[3]):
    if isinstance(symbol, str):
        group_of_symbol.setdefault(symbol, group_id)
    if taxon == 9606:
        human_symbols.setdefault(group_id, []).append(symbol)

lst = []
for gene in mint_mouse['Protein B (gene name)']:
    # Empty when the gene is not in the table or its group has no human row.
    candidates = human_symbols.get(group_of_symbol.get(gene), [])
    if len(candidates) == 1 and isinstance(candidates[0], str):
        lst.append(candidates[0])
    else:
        # insert nan if the gene is unknown, has no human counterpart, or
        # maps to more than one human gene
        lst.append(np.nan)

mint_mouse['Protein B (gene name)'] = lst

In [None]:
# (rows, columns) of the mouse network before unmapped rows are dropped.
mint_mouse.shape

In [None]:
# Drop interactions with any missing value, which includes proteins that were
# set to NaN by the human-symbol mapping above.
mint_mouse.dropna(how='any', inplace=True)

In [None]:
# (rows, columns) of the mouse network after unmapped rows were dropped.
mint_mouse.shape

In [None]:
# Fold the human-mapped mouse interactions into the human network:
#   * a pair already present in the human network gets the mouse reference
#     prepended to its 'PubMed ID' ('mouse|human');
#   * a pair that is absent is added as a new row.
mint = mint_human.copy()
# References are joined as text below, so make the column uniformly str first.
mint['PubMed ID'] = mint['PubMed ID'].astype(str)

# (Protein A, Protein B) -> labels of the human rows holding that pair
human_rows = {}
human_pairs = zip(mint['Protein A (gene name)'], mint['Protein B (gene name)'])
for human_label, pair in zip(mint.index, human_pairs):
    human_rows.setdefault(pair, []).append(human_label)

mouse_only = []  # labels of mouse rows whose pair is not in the human network
mouse_pairs = zip(mint_mouse['Protein A (gene name)'], mint_mouse['Protein B (gene name)'])
for mouse_label, pair, mouse_ref in zip(mint_mouse.index, mouse_pairs, mint_mouse['PubMed ID']):
    if pair in human_rows:
        for human_label in human_rows[pair]:
            # The human reference is read from the matching human row (not from
            # the mouse row label), and from `mint` so that several mouse rows
            # mapping onto the same pair accumulate instead of overwriting.
            human_ref = mint.loc[human_label, 'PubMed ID']
            mint.loc[human_label, 'PubMed ID'] = ('|').join([str(mouse_ref), human_ref])
    else:
        mouse_only.append(mouse_label)

if mouse_only:
    # The appended rows must be kept in the result; ignore_index gives every
    # row of the combined frame a unique label.
    mint = pd.concat([mint, mint_mouse.loc[mouse_only]], ignore_index=True)
    mint['PubMed ID'] = mint['PubMed ID'].astype(str)

In [None]:
# Treat interactions as undirected: (A, B) and (B, A) share one sorted key.
mint['ppi'] = [
    tuple(sorted(pair))
    for pair in zip(mint['Protein A (gene name)'], mint['Protein B (gene name)'])
]

# Collect the references of every undirected pair in a single pass, in row
# order, instead of re-filtering the frame once per pair.
refs_by_ppi = {}
for ppi, ref in zip(mint['ppi'], mint['PubMed ID']):
    refs_by_ppi.setdefault(ppi, []).append(str(ref))

# Keep the first row of each pair and give it the '|'-joined references.
mint.drop_duplicates(subset='ppi', keep='first', inplace=True)
mint['PubMed ID'] = [('|').join(refs_by_ppi[ppi]) for ppi in mint['ppi']]

In [None]:
# Build the SIG table: only names and references come from the network,
# every other SIG column is filled with 'NA'.
mint_sig_df = pd.DataFrame(columns=sig_col)

sig_from_mint = [
    ('Source Name', 'Protein A (gene name)'),
    ('Target Name', 'Protein B (gene name)'),
    ('PubMed IDs', 'PubMed ID'),
]
for sig_field, mint_field in sig_from_mint:
    mint_sig_df[sig_field] = mint[mint_field]

mint_sig_df.replace(np.nan, 'NA', inplace=True)

# Date-stamped (YYYY_MM_DD) SIG file, written without header or index
date_stamp = str(datetime.date.today()).replace('-', '_')
mint_ppiSIG = path + ('mint_ppi_%s.sig' % date_stamp)
mint_sig_df.to_csv(mint_ppiSIG, index=None, header=None, sep='\t')

# Create GMT File

In [1]:
#Define function 'sig_to_gmt' that converts a SIG file to a GMT file
def sig_to_gmt(sig_df):
    """Convert a SIG table into GMT lines, one gene set per protein.

    The first column of ``sig_df`` is read as the source protein and the
    sixth as the target protein. The columns are taken by position, so both a
    frame with named SIG columns and a SIG file read back without a header
    work. Every interaction is used in both directions, so each protein gets
    the set of all proteins it interacts with. Pairs with a missing protein
    on either side are skipped.

    Only proteins with at least 5 and fewer than 2000 distinct partners are
    kept. They are ordered by decreasing number of partners; ties are ordered
    by protein name.

    Parameters
    ----------
    sig_df : pandas.DataFrame
        SIG table with at least six columns.

    Returns
    -------
    dict
        Maps 0, 1, 2, ... to one tab-separated line
        ``protein<TAB>No Description<TAB>partner<TAB>partner...``.
    """
    sources = sig_df.iloc[:, 0].tolist()
    targets = sig_df.iloc[:, 5].tolist()

    partners = {}  # protein -> its distinct partners, in order of appearance
    seen = set()   # (protein, partner) pairs already recorded
    # Original direction first, then the flipped direction.
    for protein, partner in zip(sources + targets, targets + sources):
        if pd.isna(protein) or pd.isna(partner) or (protein, partner) in seen:
            continue
        seen.add((protein, partner))
        partners.setdefault(protein, []).append(partner)

    # Largest sets first; the inner sort makes the order of ties deterministic.
    ranked = sorted(sorted(partners),
                    key=lambda protein: len(partners[protein]),
                    reverse=True)

    gmt_d = {}
    for protein in ranked:
        members = partners[protein]
        if 5 <= len(members) < 2000:
            fields = [protein, 'No Description'] + members
            gmt_d[len(gmt_d)] = '\t'.join(map(str, fields))
    return gmt_d

In [2]:
# Build the GMT lines of the MINT network
mint_dict = sig_to_gmt(mint_sig_df)

# Write one tab-separated gene-set line per dictionary entry
with open('mint_ppi.gmt', 'w') as gmt_file:
    gmt_file.writelines(str(gmt_line) + '\n' for gmt_line in mint_dict.values())

NameError: name 'mint_sig_df' is not defined