# DIP

Author: Moshe C. Silverstein  
Date: 07-2017

In [1]:
import sys
import re
import numpy as np
import pandas as pd
import Utilities
from Utilities import *
from importlib import reload

# Load Data

Data Retrieved 1/25/2017

http://dip.doe-mbi.ucla.edu/dip/File.cgi?FN=2016/tab25/dip20160731.txt (DIP)

In [2]:
dip_df = pd.read_csv('/Users/moshesilverstein/Documents/Harmonizome/PPI Library/Input/dip20160731.txt', sep='\t', index_col=False)

# Load Gene Mapping File

In [3]:
mappingFile = pd.read_csv('/Users/moshesilverstein/Documents/Harmonizome/PPI Library/Doc and Mapping/mappingFile_2017.txt', sep='\t', header=None, index_col=0)

In [4]:
# To map uniprot to gene symbol
mappingUniP_df = pd.read_csv('/Users/moshesilverstein/Documents/Harmonizome/PPI Library/Doc and Mapping/mapUniProt.txt', sep='\t')

In [5]:
mappingUniP_df.drop_duplicates(subset='UniProt ID(supplied by UniProt)', inplace=True, keep=False)

In [6]:
mappingUniP_df.set_index('UniProt ID(supplied by UniProt)', inplace=True)

In [7]:
mappingUniP_df.head()

Unnamed: 0_level_0,Approved Symbol
UniProt ID(supplied by UniProt),Unnamed: 1_level_1
P04217,A1BG
Q9NQ94,A1CF
P01023,A2M
A8K2U0,A2ML1
U3KPV4,A3GALT2


# Select Relevent Data

In [8]:
# get only human and mouse data
n = dip_df['Taxid interactor A'].values
b = [i for i,item in enumerate(n) if "Homo sapiens" in item or "Mus musculus" in item]
dip_df = dip_df.ix[b]

dip_df =  dip_df.reset_index().drop('index', axis=1)

n = dip_df['Taxid interactor B'].values
b = [i for i,item in enumerate(n) if "Homo sapiens" in item or "Mus musculus" in item]
dip_df = dip_df.ix[b]

dip_df =  dip_df.reset_index().drop('index', axis=1)

In [9]:
dip_df = dip_df[['ID interactor A', 'ID interactor B', 'Publication Identifier(s)', 'Source database(s)']]

In [10]:
dip_df.rename(columns={'ID interactor A': 'Protein A (gene name)', 'ID interactor B':'Protein B (gene name)', 'Publication Identifier(s)':'PubMed ID', 'Source database(s)':'Source databases'}, inplace=True)

In [11]:
dip_df.ix[696, 'Protein B (gene name)']

'DIP-6020N|refseq:NP_076918|uniprotkb:P01562'

In [12]:
# change columns to just show gene name
lstA = []
lstB = []
for i,index in enumerate(dip_df.index):
    
    progressPercent = ((i+1)/len(dip_df.index))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(dip_df.index)))
    sys.stdout.flush()
    
    listA = dip_df.ix[index, 'Protein A (gene name)'].split('|')
    
    for element in listA:
        if 'uniprotkb' in element or 'uniprot' in element:
            if element.split(':')[1] in mappingUniP_df.index:
                lstA.append(mappingUniP_df.ix[element.split(':')[1], 'Approved Symbol'])
                break
            else:
                lstA.append(np.nan)
                break          
        elif element == listA[-1]:
            lstA.append(np.nan)
            break
            

        
    listB = dip_df.ix[index, 'Protein B (gene name)'].split('|')
    
    for element in listB:
        if 'uniprotkb' in element or 'uniprot' in element:
            if element.split(':')[1] in mappingUniP_df.index:
                lstB.append( mappingUniP_df.ix[element.split(':')[1], 'Approved Symbol'])
                break
            else:
                lstB.append(np.nan)
                break
        elif element == listB[-1]:
            lstB.append(np.nan)
            break

            
dip_df['Protein A (gene name)'] = lstA
dip_df['Protein B (gene name)'] = lstB
dip_df.dropna(inplace=True)

Progress: 100%  7800 Out of 7800   

In [13]:
dip_df.shape

(5018, 4)

In [14]:
dip_df = dip_df.reset_index().drop('index', axis=1)

# Map Gene to human and updated approved symbols

In [15]:
dip_df.ix[482:484]

Unnamed: 0,Protein A (gene name),Protein B (gene name),PubMed ID,Source databases
482,IFNA5,IFNAR2,pubmed:10395669|pubmed:DIP-1571S,MI:0465(dip)
483,IFNAR2,IFNA8,pubmed:10395669|pubmed:DIP-1571S,MI:0465(dip)
484,IFNB1,IFNAR2,pubmed:10395669|pubmed:DIP-1571S,MI:0465(dip)


In [16]:
mapgenesymbols(dip_df, mappingFile, 'Protein A (gene name)', 'Protein B (gene name)')

Progeres: 100%  5018 Out of 5018   

# Get PubMed Ids

In [17]:
# Get Only PubMed ID for publication identifier
getPubMedIds(dip_df, 'PubMed ID')

# Unfiltered

In [18]:
# uf = unfiltered
dip_uf_df = dip_df.copy()

### Combine duplicate ppis while concatanatig referances

In [19]:
combineDupPPIs(dip_uf_df)

Progress: 100%  4999 Out of 4999   

### Create .sig File

In [20]:
pathU = '~/./Documents/Harmonizome/PPI Library/Output/IndividualResources/Unfiltered/'

createSigFile(dip_uf_df, pathU, 'dip', False)

# Filtered

In [21]:
dip_f_df = dip_df.copy()

In [22]:
# drop any data that was published with more then 10 PPI's per publication or doesnt have a ppi
filterPPIbyPubmed(dip_f_df, 10)

Progress: 103%  1971 Out of 1912   

### Combine duplicate ppis while concatanatig referances

In [23]:
combineDupPPIs(dip_f_df)

Progress: 100%  3822 Out of 3822   

### Create .sig File

In [24]:
pathF = '~/./Documents/Harmonizome/PPI Library/Output/IndividualResources/Filtered/'

createSigFile(dip_f_df, pathF, 'dip', True)