# DIP Data Formatting into SIG and GMT Files

# Formatting Data of Human Species

Import necessary modules

In [None]:
import numpy as np 
import pandas as pd
import urllib.request 
import requests, io
import urllib, re, string
import sys, datetime

Enable nicer formatting of tables when printing

In [None]:
pd.set_option('display.notebook_repr_html', True)


def _repr_latex_(self):
    """Return the object's LaTeX table wrapped in a LaTeX centering command.

    A raw string is used so the leading backslash is kept literally instead
    of relying on an invalid escape sequence.
    """
    return r"\centering{%s}" % self.to_latex()


pd.DataFrame._repr_latex_ = _repr_latex_  # monkey patch pandas DataFrame

In [None]:
def concatliketerms(df):
    """Merge rows that describe the same (Protein A, Protein B) pair, in place.

    The frame is sorted by the two gene-name columns and re-indexed 0..n-1.
    Every run of rows sharing both gene names is collapsed into its first
    row, whose 'PubMed ID' and 'Source databases' values become the
    '|'-joined values of the whole run. Rows with a missing gene name are
    never merged, because a missing value does not compare equal to itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Protein A (gene name)',
        'Protein B (gene name)', 'PubMed ID' and 'Source databases'.
        It is modified in place.

    Returns
    -------
    None
    """
    col_a = 'Protein A (gene name)'
    col_b = 'Protein B (gene name)'

    df.sort_values(by=[col_a, col_b], inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Work on plain lists: one linear pass instead of repeated label lookups
    # and row drops on the frame.
    names_a = df[col_a].tolist()
    names_b = df[col_b].tolist()
    pubmed = df['PubMed ID'].tolist()
    sources = df['Source databases'].tolist()

    head = None        # position of the first row of the current run
    duplicates = []    # positions folded into an earlier row
    for pos in range(len(names_a)):
        same_pair = (
            head is not None
            and names_a[pos] == names_a[head]
            and names_b[pos] == names_b[head]
        )
        if same_pair:
            pubmed[head] = str(pubmed[head]) + '|' + str(pubmed[pos])
            sources[head] = sources[head] + '|' + sources[pos]
            duplicates.append(pos)
        else:
            head = pos

    if duplicates:
        # Whole-column assignment keeps the caller's frame object and avoids
        # writing strings item by item into a possibly numeric column.
        df['PubMed ID'] = pubmed
        df['Source databases'] = sources
        # After the reset above, labels equal positions.
        df.drop(duplicates, axis=0, inplace=True)
        df.reset_index(drop=True, inplace=True)

In [None]:
dip_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Input/dip20160731.txt', sep='\t', index_col=False)

In [None]:
# Keep only interactions whose interactor A is annotated as human.
# A boolean mask replaces the removed `.ix` indexer; rows with a missing
# taxonomy annotation are dropped instead of raising.
is_human_a = dip_df['Taxid interactor A'].str.contains('Homo sapiens', regex=False, na=False)
dip_df = dip_df[is_human_a]

In [None]:
dip_df = dip_df.reset_index(drop=True)

# Keep only interactions whose interactor B is annotated as human.
# A boolean mask replaces the removed `.ix` indexer; rows with a missing
# taxonomy annotation are dropped instead of raising.
is_human_b = dip_df['Taxid interactor B'].str.contains('Homo sapiens', regex=False, na=False)
dip_df = dip_df[is_human_b]

In [None]:
dip_df = dip_df[['ID interactor A', 'ID interactor B', 'Publication Identifier(s)', 'Source database(s)']]

In [None]:
dip_df.rename(columns={'ID interactor A': 'Protein A (gene name)', 'ID interactor B':'Protein B (gene name)', 'Publication Identifier(s)':'PubMed ID', 'Source database(s)':'Source databases'}, inplace=True)

#### Mapping table to convert protein labels from UniProt accessions to NCBI gene names

In [None]:
mapping_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Doc and Mapping/HUMAN_9606_idmapping.dat', sep='\t', index_col=False, names=['UniProtKB-AC', 'ID_type', 'ID'])

In [None]:
mapping_df = mapping_df[mapping_df['ID_type']=='Gene_Name']

In [None]:
mapping_df.set_index('UniProtKB-AC', inplace=True)

In [None]:
mapping_df = mapping_df[~mapping_df.index.duplicated(keep='first')]

In [None]:
# Replace column one with the gene name of its UniProt accession.
# The whole accession is captured (UniProt accessions are 6 or 10 characters
# long) rather than a fixed six characters, and the lookup goes through the
# mapping table's index instead of scanning it once per row.
# Rows without a `uniprotkb:` entry, or whose accession is not in the
# mapping table, become NaN.
accessions_a = dip_df['Protein A (gene name)'].str.extract(r'uniprotkb:([A-Za-z0-9]+)', expand=False)
dip_df['Protein A (gene name)'] = accessions_a.map(mapping_df['ID'])

In [None]:
# Replace column two with the gene name of its UniProt accession.
# The whole accession is captured (UniProt accessions are 6 or 10 characters
# long) rather than a fixed six characters, and the lookup goes through the
# mapping table's index instead of scanning it once per row.
# Rows without a `uniprotkb:` entry, or whose accession is not in the
# mapping table, become NaN.
accessions_b = dip_df['Protein B (gene name)'].str.extract(r'uniprotkb:([A-Za-z0-9]+)', expand=False)
dip_df['Protein B (gene name)'] = accessions_b.map(mapping_df['ID'])

In [None]:
# Keep only the first '|'-separated publication entry, reduced to the field
# that follows its first ':' (i.e. without the database prefix).
lst = [str(entry).split('|')[0].split(':')[1] for entry in dip_df['PubMed ID']]
dip_df['PubMed ID'] = lst

In [None]:
dip_df.drop_duplicates(inplace=True)

In [None]:
dip_uf_df = dip_df.copy()

In [None]:
concatliketerms(dip_uf_df)

In [None]:
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Unfiltered/ppi_network_dip_human_unfiltered_%s.tsv'% str(datetime.date.today()).replace('-', '_')
dip_uf_df.to_csv(filename, sep='\t', index=False)

In [None]:
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Unfiltered/ppi_network_dip_human_unfiltered_%s.tsv.gz'% str(datetime.date.today()).replace('-', '_')
dip_uf_df.to_csv(filename, sep='\t', index=False, compression='gzip')

In [None]:
# Drop every interaction that comes from a publication reporting more than
# 10 PPIs. The per-publication counts are computed once instead of
# rescanning the whole frame for each publication.
ppi_per_publication = dip_df['PubMed ID'].map(dip_df['PubMed ID'].value_counts())
dip_df.drop(dip_df.index[(ppi_per_publication > 10).values], inplace=True)

In [None]:
# Remove interactions whose publication identifier contains 'unassigned'.
# Missing identifiers are treated as "not unassigned" and kept.
is_unassigned = dip_df['PubMed ID'].str.contains('unassigned', regex=False, na=False)
dip_df = dip_df[~is_unassigned].copy()

In [None]:
dip_df.dropna(inplace=True)

In [None]:
concatliketerms(dip_df)

In [None]:
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Filtered/ppi_network_dip_human_filtered_%s.tsv'% str(datetime.date.today()).replace('-', '_')
dip_df.to_csv(filename, sep='\t', index=False) 

In [None]:
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Filtered/ppi_network_dip_human_filtered_%s.tsv.gz'% str(datetime.date.today()).replace('-', '_')
dip_df.to_csv(filename, sep='\t', index=False, compression='gzip')

# Formatting Data of Mouse Species

In [None]:
dip_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Input/dip20160731.txt', sep='\t', index_col=False)

In [None]:
# Keep only interactions whose interactor A is annotated as mouse.
# A boolean mask replaces the removed `.ix` indexer; rows with a missing
# taxonomy annotation are dropped instead of raising.
is_mouse_a = dip_df['Taxid interactor A'].str.contains('Mus musculus', regex=False, na=False)
dip_df = dip_df[is_mouse_a]

In [None]:
dip_df = dip_df.reset_index(drop=True)

# Keep only interactions whose interactor B is annotated as mouse.
# A boolean mask replaces the removed `.ix` indexer; rows with a missing
# taxonomy annotation are dropped instead of raising.
is_mouse_b = dip_df['Taxid interactor B'].str.contains('Mus musculus', regex=False, na=False)
dip_df = dip_df[is_mouse_b]

In [None]:
dip_df = dip_df[['ID interactor A', 'ID interactor B', 'Publication Identifier(s)', 'Source database(s)']]

In [None]:
dip_df.rename(columns={'ID interactor A': 'Protein A (gene name)', 'ID interactor B':'Protein B (gene name)', 'Publication Identifier(s)':'PubMed ID', 'Source database(s)':'Source databases'}, inplace=True)

#### Mapping table to convert protein labels from UniProt accessions to NCBI gene names

In [None]:
mapping_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Doc and Mapping/MOUSE_10090_idmapping.dat', sep='\t', index_col=False, names=['UniProtKB-AC', 'ID_type', 'ID'])

In [None]:
mapping_df = mapping_df[mapping_df['ID_type']=='Gene_Name']

In [None]:
mapping_df.set_index('UniProtKB-AC', inplace=True)

In [None]:
mapping_df = mapping_df[~mapping_df.index.duplicated(keep='first')]

In [None]:
# Replace column one with the gene name of its UniProt accession.
# The whole accession is captured (UniProt accessions are 6 or 10 characters
# long) rather than a fixed six characters, and the lookup goes through the
# mapping table's index instead of scanning it once per row.
# Rows without a `uniprotkb:` entry, or whose accession is not in the
# mapping table, become NaN.
accessions_a = dip_df['Protein A (gene name)'].str.extract(r'uniprotkb:([A-Za-z0-9]+)', expand=False)
dip_df['Protein A (gene name)'] = accessions_a.map(mapping_df['ID'])

In [None]:
# Replace column two with the gene name of its UniProt accession.
# The whole accession is captured (UniProt accessions are 6 or 10 characters
# long) rather than a fixed six characters, and the lookup goes through the
# mapping table's index instead of scanning it once per row.
# Rows without a `uniprotkb:` entry, or whose accession is not in the
# mapping table, become NaN.
accessions_b = dip_df['Protein B (gene name)'].str.extract(r'uniprotkb:([A-Za-z0-9]+)', expand=False)
dip_df['Protein B (gene name)'] = accessions_b.map(mapping_df['ID'])

In [None]:
# Keep only the first '|'-separated publication entry, reduced to the field
# that follows its first ':' (i.e. without the database prefix).
lst = [str(entry).split('|')[0].split(':')[1] for entry in dip_df['PubMed ID']]
dip_df['PubMed ID'] = lst

In [None]:
dip_df.drop_duplicates(inplace=True)

In [None]:
dip_uf_df = dip_df.copy()

In [None]:
concatliketerms(dip_uf_df)

In [None]:
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Unfiltered/ppi_network_dip_mouse_unfiltered_%s.tsv'% str(datetime.date.today()).replace('-', '_')
dip_uf_df.to_csv(filename, sep='\t', index=False)

In [None]:
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Unfiltered/ppi_network_dip_mouse_unfiltered_%s.tsv.gz'% str(datetime.date.today()).replace('-', '_')
dip_uf_df.to_csv(filename, sep='\t', index=False, compression='gzip')

In [None]:
# Drop every interaction that comes from a publication reporting more than
# 10 PPIs. The per-publication counts are computed once instead of
# rescanning the whole frame for each publication.
ppi_per_publication = dip_df['PubMed ID'].map(dip_df['PubMed ID'].value_counts())
dip_df.drop(dip_df.index[(ppi_per_publication > 10).values], inplace=True)

In [None]:
# Remove interactions whose publication identifier contains 'unassigned'.
# Missing identifiers are treated as "not unassigned" and kept.
is_unassigned = dip_df['PubMed ID'].str.contains('unassigned', regex=False, na=False)
dip_df = dip_df[~is_unassigned].copy()

In [None]:
dip_df.dropna(inplace=True)

In [None]:
concatliketerms(dip_df)

In [None]:
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Filtered/ppi_network_dip_mouse_filtered_%s.tsv'% str(datetime.date.today()).replace('-', '_')
dip_df.to_csv(filename, sep='\t', index=False) 

In [None]:
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/IndividualResources/Filtered/ppi_network_dip_mouse_filtered_%s.tsv.gz'% str(datetime.date.today()).replace('-', '_')
dip_df.to_csv(filename, sep='\t', index=False, compression='gzip')

# Creating SIG File

In [None]:
sig_col = ['Source Name', 'Source Human Accession', 'Source Mouse Accession',
          'Source Type', 'Source Location', 'Target Name',
          'Target Human Accession', 'Target Mouse Accession', 'Target Type',
          'Target Location', 'Effect', 'Type of Interaction',
          'PubMed IDs']

In [None]:
path = '~/./Documents/Harmonizome/PPI Library/Output/IndividualResources/Filtered/'

In [None]:
# HOMOLOGENE
mapping_path = '~/./Documents/Harmonizome/PPI Library/Doc and Mapping/HomoloGene.tsv'
mapping = pd.read_csv(mapping_path, sep='\t', header=None)

In [None]:
mapping.set_index([0, 1], inplace=True)

In [None]:
mapping.head()

In [None]:
dip_human = pd.read_csv(path+'ppi_network_dip_human_filtered_2017_02_15.tsv', sep='\t')

In [None]:
dip_human.head()

In [None]:
dip_mouse = pd.read_csv(path+'ppi_network_dip_mouse_filtered_2017_02_15.tsv', sep='\t')

In [None]:
dip_mouse.head()

In [None]:
# Translate the mouse gene names in column A to human gene names using the
# HomoloGene table. `mapping` is indexed by its first two columns; the code
# treats the second index level as a taxonomy id (9606 = human) and column 3
# as the gene symbol. The first index level is presumably the homology group
# id -- confirm against the HomoloGene file layout.
#
# Two lookup tables are built in one pass over `mapping`, so each gene is
# resolved with dictionary lookups instead of scanning the whole table.
group_of_symbol = {}   # gene symbol -> first group in which it is listed
human_symbols = {}     # group -> every human symbol listed for that group
for group_id, taxid, symbol in zip(mapping.index.get_level_values(0),
                                   mapping.index.get_level_values(1),
                                   mapping[3]):
    if isinstance(symbol, str):
        group_of_symbol.setdefault(symbol, group_id)
    if taxid == 9606:
        human_symbols.setdefault(group_id, []).append(symbol)

lst = []
for gene in dip_mouse['Protein A (gene name)']:
    candidates = human_symbols.get(group_of_symbol.get(gene), [])
    if len(candidates) == 1 and isinstance(candidates[0], str):
        lst.append(candidates[0])
    else:
        # NaN when the gene is not in HomoloGene, its group has no human
        # gene, or the group maps to more than one human gene.
        lst.append(np.nan)

dip_mouse['Protein A (gene name)'] = lst

In [None]:
# Translate the mouse gene names in column B to human gene names using the
# HomoloGene table. `mapping` is indexed by its first two columns; the code
# treats the second index level as a taxonomy id (9606 = human) and column 3
# as the gene symbol. The first index level is presumably the homology group
# id -- confirm against the HomoloGene file layout.
#
# Two lookup tables are built in one pass over `mapping`, so each gene is
# resolved with dictionary lookups instead of scanning the whole table.
group_of_symbol = {}   # gene symbol -> first group in which it is listed
human_symbols = {}     # group -> every human symbol listed for that group
for group_id, taxid, symbol in zip(mapping.index.get_level_values(0),
                                   mapping.index.get_level_values(1),
                                   mapping[3]):
    if isinstance(symbol, str):
        group_of_symbol.setdefault(symbol, group_id)
    if taxid == 9606:
        human_symbols.setdefault(group_id, []).append(symbol)

lst = []
for gene in dip_mouse['Protein B (gene name)']:
    candidates = human_symbols.get(group_of_symbol.get(gene), [])
    if len(candidates) == 1 and isinstance(candidates[0], str):
        lst.append(candidates[0])
    else:
        # NaN when the gene is not in HomoloGene, its group has no human
        # gene, or the group maps to more than one human gene.
        lst.append(np.nan)

dip_mouse['Protein B (gene name)'] = lst

In [None]:
dip_mouse.shape

In [None]:
dip_mouse.dropna(how='any', inplace=True)

In [None]:
# NOTE(review): `dip` is not defined anywhere earlier in this notebook, so
# this cell used to fail with a NameError. It is presumably the human table
# plus the mouse table after mapping to human gene names -- confirm.
dip = pd.concat([dip_human, dip_mouse], ignore_index=True)

# Order-independent key for each interaction, so that (A, B) and (B, A)
# count as the same PPI.
dip['ppi'] = [
    tuple(sorted(pair))
    for pair in zip(dip['Protein A (gene name)'], dip['Protein B (gene name)'])
]

# Collect the PubMed IDs of every row belonging to the same PPI in one pass.
pubmed_by_ppi = {}
for ppi, pubmed in zip(dip['ppi'], dip['PubMed ID']):
    pubmed_by_ppi.setdefault(ppi, []).append(str(pubmed))

# Keep the first row of each PPI and give it the '|'-joined references.
dip.drop_duplicates(subset='ppi', keep='first', inplace=True)
dip['PubMed ID'] = ['|'.join(pubmed_by_ppi[ppi]) for ppi in dip['ppi']]

In [None]:
# Assemble the SIG table: only the source name, target name and PubMed IDs
# are known; every other SIG column is written as 'NA'.
dip_sig_df = pd.DataFrame(
    {
        'Source Name': dip['Protein A (gene name)'],
        'Target Name': dip['Protein B (gene name)'],
        'PubMed IDs': dip['PubMed ID'],
    },
    columns=sig_col,
)
dip_sig_df.replace(np.nan, 'NA', inplace=True)

# Write the table without header or index, stamped with today's date.
date_stamp = str(datetime.date.today()).replace('-', '_')
dip_ppiSIG = path + 'dip_ppi_%s.sig' % date_stamp
dip_sig_df.to_csv(dip_ppiSIG, index=None, header=None, sep='\t')

# Convert SIG File to GMT File

In [None]:
#Define function 'sig_to_gmt' that converts a SIG file to a GMT file
def sig_to_gmt(sig_df, min_interactions=5, max_interactions=2000):
    """Convert a SIG table into GMT lines, one gene set per protein.

    Each interaction is treated as undirected: the target is added to the
    source's gene set and the source to the target's. A protein's partners
    are listed once each, first those found with the protein as source and
    then those found with it as target.

    Parameters
    ----------
    sig_df : pandas.DataFrame
        SIG table. The source name is read from the first column and the
        target name from the sixth, by position, so both header-less frames
        (integer column labels) and frames with named columns are accepted.
    min_interactions : int, optional
        Smallest number of partners a protein needs to be kept (inclusive).
    max_interactions : int, optional
        A protein is kept only if it has fewer partners than this.

    Returns
    -------
    dict
        Maps 0, 1, 2, ... to a tab-joined GMT line made of the protein name,
        the text 'No Description' and the protein's partners. Lines are
        ordered by decreasing number of partners; ties are alphabetical.
    """
    # Positional access: label-based `sig_df[0]` fails on named columns.
    sources = sig_df.iloc[:, 0]
    targets = sig_df.iloc[:, 5]

    forward = list(zip(sources, targets))
    flipped = [(target, source) for source, target in forward]

    partners = {}   # protein -> partners in first-seen order
    seen = set()    # (protein, partner) pairs already recorded
    for protein, partner in forward + flipped:
        if pd.isnull(protein) or pd.isnull(partner):
            continue  # an interaction with a missing name cannot be listed
        if (protein, partner) in seen:
            continue
        seen.add((protein, partner))
        partners.setdefault(protein, []).append(partner)

    # Alphabetical first, then a stable sort by size, so equally sized gene
    # sets stay in alphabetical order.
    ranked = sorted(
        sorted(partners, key=str),
        key=lambda protein: len(partners[protein]),
        reverse=True,
    )

    gmt_d = {}
    for protein in ranked:
        size = len(partners[protein])
        if min_interactions <= size < max_interactions:
            fields = [str(protein), 'No Description']
            fields.extend(str(partner) for partner in partners[protein])
            gmt_d[len(gmt_d)] = '\t'.join(fields)
    return gmt_d

In [None]:
# Build the GMT lines from the SIG table
dip_dict = sig_to_gmt(dip_sig_df)

# Write one tab-separated gene-set line per entry to a new GMT file
with open('dip_ppi.gmt', 'w') as gmt_file:
    gmt_file.writelines(str(line) + '\n' for line in dip_dict.values())