Import necessary modules

In [1]:
import numpy as np 
import pandas as pd
import urllib.request 
import requests, io
import urllib, re, string
import datetime

Include for nicer format of tables when printing

In [7]:
pd.set_option('display.notebook_repr_html', True)

def _repr_latex_(self):
    return "\centering{%s}" % self.to_latex()

pd.DataFrame._repr_latex_ = _repr_latex_  # monkey patch pandas DataFrame

### Data Source and Date of Retrieval

Data Retrieved 1/23/2017  

http://mint.bio.uniroma2.it/mitab/MINT_MiTab.txt (Mint) 

Data Retrieved 1/25/2017  

https://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.4.144/BIOGRID-ALL-3.4.144.mitab.zip (BioGrid)  
http://www.innatedb.com/download/interactions/innatedb_ppi.mitab.gz (InnateDB)  
http://dip.doe-mbi.ucla.edu/dip/File.cgi?FN=2016/tab25/dip20160731.txt (DIP)

Data Retrieved 1/26/2017   
http://mentha.uniroma2.it/doDownload.php?file=organisms/9606.zip (mentha)  
http://www.ebi.ac.uk/intact/downloads (IntAct)  

Data Retrieved 2/1/2017     
http://amp.pharm.mssm.edu/X2K/datasets/SNAVI.sig (SAVI)  
http://amp.pharm.mssm.edu/X2K/datasets/KEA.zip (KEA)  
http://amp.pharm.mssm.edu/X2K/datasets/ppid.sig (ppid)  

In [8]:
# column identifiers for initial data (as taken from respective metadata)
col_name = ['Unique identifier for interactor A',
      'Unique identifier for interactor B',
      'Alternative identifier for interactor A',
      'Alternative identifier for interactor B',
      'Aliases for A',
      'Aliases for B',
      'Interaction detection methods',
      'First author',
      'Identifier of the publication',
      'NCBI Taxonomy identifier for interactor A',
      'NCBI Taxonomy identifier for interactor B',
      'Interaction types',
      'Source databases', 
      'Interaction identifier(s)',
      'Confidence score']

#### Read mint data from url and creat file

In [9]:
##read in data from Mint and write to file (only needs to be proformed once)
# content=urllib.request.urlopen('http://mint.bio.uniroma2.it/mitab/MINT_MiTab.txt') 

# target = open('mint.tsv', 'wb')
# for line in content:
#     target.write(line)
# target.close()

## Mint Data

In [13]:
mint_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Input/mint.tsv', sep='\t', index_col=False, header=None)

In [11]:
# assign names to columns from meta data
for i in np.arange(len(col_name)):
    mint_df.rename(columns={i:col_name[i]}, inplace=True)

In [12]:
# get only human (or mouse) data (first protein)
n = mint_df['Aliases for A'].values
b = [i for i,item in enumerate(n) if "human" in item]
mint_df = mint_df.ix[b]

In [13]:
mint_df =  mint_df.reset_index().drop('index', axis=1)

# get only human (or mouse) data (second protein)
n = mint_df['Aliases for B'].values
b = [i for i,item in enumerate(n) if "human" in item]
mint_df = mint_df.ix[b]

In [14]:
# drop any data that was published with more then 10 PPI's per publication
i = 1
lst = []
for pub in mint_df['Identifier of the publication'].values:
    print('\r', str(i)+' Out of '+ str(len(mint_df['Identifier of the publication'].values)), end=' ', flush=True)
    lst.append(mint_df[mint_df['Identifier of the publication'] == pub].shape[0] <= 10)
    i +=1

mint_df = mint_df[lst]

 26694 Out of 26694 

In [15]:
# select only relevent data
mint_df = mint_df[['Aliases for A', 'Aliases for B', 'Identifier of the publication', 'Source databases']]

In [16]:
# change column one to just show gene name
#i = 1
lst = []
for name in mint_df['Aliases for A']:
    #print('\r', i, end=' ', flush=True)
    name = str(name)
    if '(gene name)' in name:
        temp = name.split('|')
        for ID in temp:
            if '(gene name)' in ID:
                lst.append(ID.split(':')[1].split('(')[0])
                break

    else:
        lst.append(np.nan)
    
    #i += 1
mint_df['Aliases for A'] = lst

In [17]:
# change column two to just show gene name
#i = 1
lst = []
for name in mint_df['Aliases for B']:
    #print('\r', i, end=' ', flush=True)
    name = str(name)
    if '(gene name)' in name:
        temp = name.split('|')
        for ID in temp:
            if '(gene name)' in ID:
                lst.append(ID.split(':')[1].split('(')[0])
                break

    else:
        lst.append(np.nan)
    
    #i += 1
mint_df['Aliases for B'] = lst

In [18]:
# drop data for which no gene name is provited (or ensamble ID)
mint_df.dropna(how='any', inplace=True, axis=0)

In [19]:
mint_df.drop_duplicates(['Aliases for A', 'Aliases for B'], inplace=True)

In [20]:
# Get Only PubMed ID for publication identifier
lst = []

for pub in mint_df['Identifier of the publication']:
    pub = str(pub)
    pub = pub.split('|')[1].split(':')[1]
    lst.append(pub)
mint_df['Identifier of the publication'] = lst

In [21]:
# Drop data for which there is no PubMed ID
lst = []
for value in mint_df['Identifier of the publication']:
    if 'unassigned' in value:
        lst.append(True)
    else:
        lst.append(False)

mint_df = mint_df.drop(mint_df[lst].index)

In [22]:
# Drop any data missing information
mint_df.dropna(inplace=True)

In [63]:
mint_df.to_csv('/Users/moshesilverstein/Desktop/PPI Library/IntProc/mint_human.csv', sep='\t')

## BioGrid Data

In [23]:
biogrid_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Input/BIOGRID-ALL-3.4.144.mitab.txt', sep='\t', index_col=False)

In [24]:
# get only human (or mouse) data
n = biogrid_df['Taxid Interactor A'].values
b = [i for i,item in enumerate(n) if "taxid:9606" in item] #taxid:9606 for human, taxid:10090 for mouse
biogrid_df = biogrid_df.ix[b]

In [25]:
biogrid_df =  biogrid_df.reset_index().drop('index', axis=1)

# get only human (or mouse) data
n = biogrid_df['Taxid Interactor B'].values
b = [i for i,item in enumerate(n) if "taxid:9606" in item] #taxid:9606 for human, taxid:10090 for mouse
biogrid_df = biogrid_df.ix[b]

In [26]:
# drop any data that was published with more then 10 PPI's per publication
i = 1
for pub in biogrid_df['Publication Identifiers'].unique():
    if i % 100 == 0:
        print('\r', str(i)+' Out of '+ str(len(biogrid_df['Publication Identifiers'].unique())), end=' ', flush=True)
    if biogrid_df[biogrid_df['Publication Identifiers'] == pub].shape[0] > 10:
        biogrid_df.drop(biogrid_df[biogrid_df['Publication Identifiers'] == pub].index.tolist(), inplace=True)
    i +=1


 24500 Out of 22528 

In [27]:
biogrid_df = biogrid_df[['Alt IDs Interactor A', 'Alt IDs Interactor B', 'Publication Identifiers', 'Source Database']]

In [28]:
biogrid_df.rename(columns={'Alt IDs Interactor A': 'Aliases for A', 'Alt IDs Interactor B':'Aliases for B', 'Publication Identifiers':'Identifier of the publication', 'Source Database':'Source databases'}, inplace=True)

In [29]:
# change column one to just show gene name
i = 1
lst = []
for name in biogrid_df['Aliases for A']:
    if i % 100 == 0:
        print('\r', i, end=' ', flush=True)
    name = str(name) 
    if name != 'nan':
        lst.append(name.split('|')[1].split(':')[1])
    else:
        lst.append(np.nan)
    i += 1
biogrid_df['Aliases for A'] = lst

 70700 

In [30]:
# change column two to just show gene name
i = 1
lst = []
for name in biogrid_df['Aliases for B']:
    if i % 100 == 0:
        print('\r', i, end=' ', flush=True)
    name = str(name) 
    if name != 'nan':
        lst.append(name.split('|')[1].split(':')[1])
    else:
        lst.append(np.nan)
    i += 1
biogrid_df['Aliases for B'] = lst

 70700 

In [31]:
# drop data for which no gene name is provited (or ensamble ID)
biogrid_df.dropna(how='any', inplace=True, axis=0)

In [32]:
biogrid_df.drop_duplicates(['Aliases for A', 'Aliases for B'], inplace=True)

In [33]:
lst = []

for pub in biogrid_df['Identifier of the publication']:
    pub = str(pub)
    pub = pub.split(':')[1]
    lst.append(pub)
biogrid_df['Identifier of the publication'] = lst

In [34]:
biogrid_df.shape

(43586, 4)

In [35]:
lst = []
for value in biogrid_df['Identifier of the publication']:
    if 'unassigned' in value:
        lst.append(True)
    else:
        lst.append(False)

biogrid_df = biogrid_df.drop(biogrid_df[lst].index)

In [36]:
biogrid_df.dropna(inplace=True)

In [64]:
biogrid_df.to_csv('/Users/moshesilverstein/Desktop/PPI Library/IntProc/biogrid_human.csv', sep='\t')

## InnateDB Data

In [37]:
innatedb_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Input/innatedb_ppi.mitab', sep='\t', index_col=False)

In [38]:
# get only human (or mouse) data
n = innatedb_df['ncbi_taxid_A'].values
b = [i for i,item in enumerate(n) if "Human" in item]
innatedb_df = innatedb_df.ix[b]

In [39]:
innatedb_df =  innatedb_df.reset_index().drop('index', axis=1)

# get only human (or mouse) data
n = innatedb_df['ncbi_taxid_B'].values
b = [i for i,item in enumerate(n) if "Human" in item]
innatedb_df = innatedb_df.ix[b]

In [40]:
# drop any data that was published with more then 10 PPI's per publication
i = 1
for pub in innatedb_df['pmid'].unique():
    if i % 100 == 0:
        print('\r', str(i)+' Out of '+ str(len(innatedb_df['pmid'].unique())), end=' ', flush=True)
    if innatedb_df[innatedb_df['pmid'] == pub].shape[0] > 10:
        innatedb_df.drop(innatedb_df[innatedb_df['pmid'] == pub].index.tolist(), inplace=True)
    i +=1


 3000 Out of 2879 

In [41]:
innatedb_df = innatedb_df[['alias_A', 'alias_B', 'pmid', 'source_database']]

In [42]:
innatedb_df.rename(columns={'alias_A': 'Aliases for A', 'alias_B':'Aliases for B', 'pmid':'Identifier of the publication', 'source_database':'Source databases'}, inplace=True)

In [43]:
# change column one to just show gene name
#i = 1
lst = []
for name in innatedb_df['Aliases for A']:
    #print('\r', i, end=' ', flush=True)
    name = str(name) 
    if name != 'nan':
        lst.append(name.split('|')[-1].split(":")[1].split('(')[0])
    else:
        lst.append(np.nan)
    #i += 1
innatedb_df['Aliases for A'] = lst

In [44]:
# change column two to just show gene name
#i = 1
lst = []
for name in innatedb_df['Aliases for B']:
    #print('\r', i, end=' ', flush=True)
    name = str(name) 
    if name != 'nan':
        lst.append(name.split('|')[-1].split(":")[1].split('(')[0])
    else:
        lst.append(np.nan)
    #i += 1
innatedb_df['Aliases for B'] = lst

In [45]:
# drop data for which no gene name is provited (or ensamble ID)
innatedb_df.dropna(how='any', inplace=True, axis=0)

In [46]:
innatedb_df.drop_duplicates(['Aliases for A', 'Aliases for B'], inplace=True)

In [47]:
lst = []

for pub in innatedb_df['Identifier of the publication']:
    pub = str(pub)
    pub = pub.split(':')[1]
    lst.append(pub)
innatedb_df['Identifier of the publication'] = lst

In [48]:
innatedb_df.shape

(4754, 4)

In [49]:
lst = []
for value in innatedb_df['Identifier of the publication']:
    if 'unassigned' in value:
        lst.append(True)
    else:
        lst.append(False)

innatedb_df = innatedb_df.drop(innatedb_df[lst].index)

In [50]:
innatedb_df.dropna(inplace=True)

In [65]:
innatedb_df.to_csv('/Users/moshesilverstein/Desktop/PPI Library/IntProc/innatedb_human.csv', sep='\t')

## DIP DATA

In [66]:
dip_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Input/dip20160731.txt', sep='\t', index_col=False)

In [67]:
# get only human (or mouse) data
n = dip_df['Taxid interactor A'].values
b = [i for i,item in enumerate(n) if "Homo sapiens" in item]
dip_df = dip_df.ix[b]

In [68]:
dip_df =  dip_df.reset_index().drop('index', axis=1)

# get only human (or mouse) data
n = dip_df['Taxid interactor B'].values
b = [i for i,item in enumerate(n) if "Homo sapiens" in item]
dip_df = dip_df.ix[b]

In [69]:
# drop any data that was published with more then 10 PPI's per publication
i = 1
for pub in dip_df['Publication Identifier(s)'].unique():
    if i % 100 == 0:
        print('\r', str(i)+' Out of '+ str(len(dip_df['Publication Identifier(s)'].unique())), end=' ', flush=True)
    if dip_df[dip_df['Publication Identifier(s)'] == pub].shape[0] > 10:
        dip_df.drop(dip_df[dip_df['Publication Identifier(s)'] == pub].index.tolist(), inplace=True)
    i +=1

 3300 Out of 3281 

In [70]:
dip_df = dip_df[['ID interactor A', 'ID interactor B', 'Publication Identifier(s)', 'Source database(s)']]

In [71]:
dip_df.rename(columns={'ID interactor A': 'Aliases for A', 'ID interactor B':'Aliases for B', 'Publication Identifier(s)':'Identifier of the publication', 'Source database(s)':'Source databases'}, inplace=True)

#### mapping table to convert labels from uniprot to ncbi names

In [72]:
mapping_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Doc and Mapping/HUMAN_9606_idmapping.dat', sep='\t', index_col=False, names=['UniProtKB-AC', 'ID_type', 'ID'])

In [73]:
mapping_df = mapping_df[mapping_df['ID_type']=='Gene_Name']

In [74]:
mapping_df.set_index('UniProtKB-AC', inplace=True)

In [75]:
mapping_df = mapping_df[~mapping_df.index.duplicated(keep='first')]

In [76]:
# change column one to just show gene name
for index in dip_df.index:
    found = re.search('uniprotkb:......', dip_df.ix[index, 'Aliases for A'])
    if found:
        name = found.group(0).split(':')[1]
        if name in mapping_df.index.values:
            dip_df.ix[index, 'Aliases for A'] = mapping_df.ix[name, 'ID']
        else:
            dip_df.ix[index, 'Aliases for A'] = np.nan
    else:
        dip_df.ix[index, 'Aliases for A'] = np.nan

In [77]:
# change column two to just show gene name
for index in dip_df.index:
    found = re.search('uniprotkb:......', dip_df.ix[index, 'Aliases for B'])
    if found:
        name = found.group(0).split(':')[1]
        if name in mapping_df.index.values:
            dip_df.ix[index, 'Aliases for B'] = mapping_df.ix[name, 'ID']
        else:
            dip_df.ix[index, 'Aliases for B'] = np.nan
    else:
        dip_df.ix[index, 'Aliases for B'] = np.nan

In [78]:
dip_df.drop_duplicates(['Aliases for A', 'Aliases for B'], inplace=True)

In [79]:
lst = []

for pub in dip_df['Identifier of the publication']:
    pub = str(pub)
    pub = pub.split('|')[0].split(':')[1]
    lst.append(pub)
dip_df['Identifier of the publication'] = lst

In [80]:
lst = []
for value in dip_df['Identifier of the publication']:
    if 'unassigned' in value:
        lst.append(True)
    else:
        lst.append(False)

dip_df = dip_df.drop(dip_df[lst].index)

In [81]:
dip_df.dropna(inplace=True)

In [82]:
mapping_df.head()

Unnamed: 0_level_0,ID_type,ID
UniProtKB-AC,Unnamed: 1_level_1,Unnamed: 2_level_1
P31946,Gene_Name,YWHAB
P62258,Gene_Name,YWHAE
Q04917,Gene_Name,YWHAH
P61981,Gene_Name,YWHAG
P31947,Gene_Name,SFN


In [83]:
dip_df.to_csv('/Users/moshesilverstein/Desktop/PPI Library/IntProc/dip_human.csv', sep='\t')

## mentha data

In [84]:
mentha_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Input/mentha_human', sep=';')

In [85]:
mentha_df = mentha_df[['Gene A', 'Gene B', 'PMID']]

In [86]:
mentha_df['Source databases'] = '(mentha)'

In [87]:
mentha_df.rename(columns={'Gene A': 'Aliases for A', 'Gene B':'Aliases for B', 'PMID':'Identifier of the publication'}, inplace=True)

In [88]:
# drop any data that was published with more then 10 PPI's per publication
i = 1
for pub in mentha_df['Identifier of the publication'].unique():
    if i % 100 == 0:
        print('\r', str(i)+' Out of '+ str(len(mentha_df['Identifier of the publication'].unique())), end=' ', flush=True)
    if mentha_df[mentha_df['Identifier of the publication'] == pub].shape[0] > 10:
        mentha_df.drop(mentha_df[mentha_df['Identifier of the publication'] == pub].index.tolist(), inplace=True)
    i +=1

 32800 Out of 31989 

In [89]:
lst = []
for value in mentha_df['Identifier of the publication']:
    if 'unassigned' in value:
        lst.append(True)
    else:
        lst.append(False)

mentha_df = mentha_df.drop(mentha_df[lst].index)

In [90]:
i = 1
for index in mentha_df.index:
    if i % 100 == 0:
        print('\r', str(i)+' Out of '+ str(len(mentha_df.index)), end=' ', flush=True)
    mentha_df.ix[index, 'Identifier of the publication'] = '|'.join(mentha_df.ix[index, 'Identifier of the publication'].split(' ')[0:-1])
    i +=1

 49300 Out of 49300 

In [91]:
mentha_df.dropna(inplace=True)

In [92]:
mentha_df.shape

(49300, 4)

In [93]:
mentha_df.head()

Unnamed: 0,Aliases for A,Aliases for B,Identifier of the publication,Source databases
31,MYO10,TUBB2B,21642953,(mentha)
34,HDAC6,TUBB2B,12606581,(mentha)
36,ABCB5,PSMB9,15488952,(mentha)
38,DSCC1,CHTF8,12766176|26186194,(mentha)
39,DSCC1,CHTF8,12766176|26186194,(mentha)


In [94]:
mentha_df.to_csv('/Users/moshesilverstein/Desktop/PPI Library/IntProc/mentha_human.csv', sep='\t')

## IntAct Data 

In [95]:
intact_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Input/intact.txt', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
#intact_negative_df = pd.read_csv('intact_negative.txt', sep='\t')

In [None]:
#intact_df = pd.concat([intact_df, intact_negative_df])

In [96]:
# get only human (or mouse) data
n = intact_df['Taxid interactor A'].values
b = [i for i,item in enumerate(n) if "human" in item]
intact_df = intact_df.ix[b]

In [97]:
intact_df =  intact_df.reset_index().drop('index', axis=1)

# get only human (or mouse) data
n = intact_df['Taxid interactor B'].values
b = [i for i,item in enumerate(n) if "human" in item]
intact_df = intact_df.ix[b]

In [None]:
#intact_df.ix[0, 'Taxid interactor B']

In [98]:
# drop any data that was published with more then 10 PPI's per publication
i = 1
for pub in intact_df['Publication Identifier(s)'].unique():
    if i % 100 == 0:
        print('\r', str(i)+' Out of '+ str(len(intact_df['Publication Identifier(s)'].unique())), end=' ', flush=True)
    if intact_df[intact_df['Publication Identifier(s)'] == pub].shape[0] > 10:
        intact_df.drop(intact_df[intact_df['Publication Identifier(s)'] == pub].index.tolist(), inplace=True)
    i +=1


 6800 Out of 5462 

In [99]:
intact_df = intact_df[['Alias(es) interactor A', 'Alias(es) interactor B', 'Publication Identifier(s)', 'Source database(s)']]

In [100]:
intact_df.rename(columns={'Alias(es) interactor A': 'Aliases for A', 'Alias(es) interactor B':'Aliases for B', 'Publication Identifier(s)':'Identifier of the publication', 'Source database(s)':'Source databases'}, inplace=True)

In [101]:
# change column one to just show gene name
#i = 1
lst = []
for name in intact_df['Aliases for A']:
    #print('\r', i, end=' ', flush=True)
    name = str(name)
    if '(gene name)' in name:
        temp = name.split('|')
        for ID in temp:
            if '(gene name)' in ID:
                lst.append(ID.split(':')[1].split('(')[0])
                break

    else:
        lst.append(np.nan)
    
    #i += 1
intact_df['Aliases for A'] = lst

In [102]:
# change column two to just show gene name
#i = 1
lst = []
for name in intact_df['Aliases for B']:
    #print('\r', i, end=' ', flush=True)
    name = str(name)
    if '(gene name)' in name:
        temp = name.split('|')
        for ID in temp:
            if '(gene name)' in ID:
                lst.append(ID.split(':')[1].split('(')[0])
                break

    else:
        lst.append(np.nan)
    
    #i += 1
intact_df['Aliases for B'] = lst

In [103]:
# drop data for which no gene name is provited (or ensamble ID)
intact_df.dropna(how='any', inplace=True, axis=0)

In [104]:
intact_df.drop_duplicates(['Aliases for A', 'Aliases for B'], inplace=True)

In [105]:
lst = []

for pub in intact_df['Identifier of the publication']:
    pub = str(pub)
    pub = pub.split('|')
    for p in pub:
        if 'pubmed' in p:
            pub = p.split(':')[1]
    lst.append(pub)
intact_df['Identifier of the publication'] = lst

In [106]:
intact_df.shape

(10998, 4)

In [107]:
lst = []
for value in intact_df['Identifier of the publication']:
    if 'unassigned' in value:
        lst.append(True)
    else:
        lst.append(False)

intact_df = intact_df.drop(intact_df[lst].index)

In [108]:
intact_df.dropna(inplace=True)

In [109]:
intact_df.head()

Unnamed: 0,Aliases for A,Aliases for B,Identifier of the publication,Source databases
22,FHL3,FHL2,11135358,"psi-mi:""MI:0471""(MINT)"
24,EPS15,EPS15,9182572,"psi-mi:""MI:0471""(MINT)"
27,MCM7,FHL2,10649446,"psi-mi:""MI:0471""(MINT)"
28,AR,FHL2,10654935,"psi-mi:""MI:0471""(MINT)"
30,SOS1,ABI1,10499589,"psi-mi:""MI:0471""(MINT)"


In [None]:
#intact_negative_df.head()

In [None]:
# Alt. ID(s) interactor A
#intact_df.ix[22, 'Aliases for A']

In [110]:
intact_df.to_csv('/Users/moshesilverstein/Desktop/PPI Library/IntProc/intact_human.csv', sep='\t')

## SAVI Data

In [111]:
savi_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Input/SNAVI.sig', sep=' ', header=None)

In [112]:
savi_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,SYNGAP1,Q96PV0,,GEF,Cytosol,RAP1A,P62834,P62835,GTPase,Cytosol,_,GAP,15312654
1,GSK3A,P49840,,Kinase,Cytosol,JUN,P05412,P05627,TF,Nucleus,_,Phosphorylation,16023596
2,GSK3A,P49840,,Kinase,Cytosol,JUN,P05412,P05627,TF,Nucleus,_,Phosphorylation,1846781
3,RIPK1,Q13546,Q60855,Kinase,Cytosol,SQSTM1,Q13501,Q64337,Adapter,Cytosol,+,Binding,10356400
4,CAMK2A,Q9UQM7,P11798,Kinase,Cytosol,MAP2,P11137,P20357,Cytoskeleton,Cytosol,+,Phosphorylation,7890745


In [113]:
savi_df.shape

(2007, 13)

In [114]:
savi_df = savi_df[[0,5,12]]
savi_df.rename(columns={0:'Aliases for A',5:'Aliases for B',12:'Identifier of the publication'}, inplace=True)
savi_df['Source databases'] = '(SAVI)'

In [115]:
savi_df.head()

Unnamed: 0,Aliases for A,Aliases for B,Identifier of the publication,Source databases
0,SYNGAP1,RAP1A,15312654,(SAVI)
1,GSK3A,JUN,16023596,(SAVI)
2,GSK3A,JUN,1846781,(SAVI)
3,RIPK1,SQSTM1,10356400,(SAVI)
4,CAMK2A,MAP2,7890745,(SAVI)


In [116]:
# drop any data that was published with more then 10 PPI's per publication
i = 1
for pub in savi_df['Identifier of the publication'].unique():
    if i % 100 == 0:
        print('\r', str(i)+' Out of '+ str(len(savi_df['Identifier of the publication'].unique())), end=' ', flush=True)
    if savi_df[savi_df['Identifier of the publication'] == pub].shape[0] > 10:
        savi_df.drop(savi_df[savi_df['Identifier of the publication'] == pub].index.tolist(), inplace=True)
    i +=1


 1200 Out of 1293 

In [117]:
savi_df.shape

(1948, 4)

In [118]:
savi_df.reset_index(inplace=True)
savi_df.drop('index', axis=1, inplace=True)
oldSize = savi_df.shape
newSize = 0
while oldSize != newSize:
    savi_df.reset_index(inplace=True)
    savi_df.drop('index', axis=1, inplace=True)
    # concattonate like terms and remove duplicates
    i = 1
    end = len(savi_df.index)-1
    for index in savi_df.index:
        if i % 100 == 0:
            print('\r', str(i)+' Out of '+ str(len(savi_df.index)), end=' ', flush=True)
        if index in savi_df.index.values and index != end:
            if savi_df.ix[index, 'Aliases for A'] == savi_df.ix[index+1, 'Aliases for A']:
                if savi_df.ix[index, 'Aliases for B'] == savi_df.ix[index+1, 'Aliases for B']:
                    savi_df.ix[index, 'Identifier of the publication'] = str(savi_df.ix[index, 'Identifier of the publication']) +'|'+str(savi_df.ix[index+1, 'Identifier of the publication'])
                    savi_df.drop(index+1, axis=0, inplace=True)
        i += 1
    oldSize = newSize
    newSize = savi_df.shape

 1300 Out of 1378 

In [119]:
savi_df.shape

(1378, 4)

In [120]:
savi_df.head()

Unnamed: 0,Aliases for A,Aliases for B,Identifier of the publication,Source databases
0,SYNGAP1,RAP1A,15312654,(SAVI)
1,GSK3A,JUN,16023596|1846781,(SAVI)
2,RIPK1,SQSTM1,10356400,(SAVI)
3,CAMK2A,MAP2,7890745|9341200|2561875,(SAVI)
4,ARRB1,ADRB2,11171997,(SAVI)


In [121]:
savi_df.to_csv('/Users/moshesilverstein/Desktop/PPI Library/IntProc/savi_human.csv', sep='\t')

## KEA Data

In [40]:
kea_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Input/KEA/kinase-protein_interactions.csv', sep=',', header=None)

In [41]:
kea_df.head()

Unnamed: 0,0,1,2,3,4,5
0,TK,ABL,ABL1,ABL1,10964922;11781820;12522270;15174125;17192257;1...,HPRD;PHOSPHOELM;PHOSPHOPOINT;PHOSPHOSITE;SWISS...
1,TK,ABL,ABL1,ABL2,12569093;15174125;18689816,HPRD;PHOSPHOPOINT;PHOSPHOSITE
2,TK,ABL,ABL1,BCR,10405761;11780146;12124177;12522270;15174125;1...,HPRD;PHOSPHOELM;PHOSPHOPOINT;PHOSPHOSITE
3,TK,ABL,ABL1,BTK,11598012;12445832;12573241;15174125;18689816;8...,HPRD;PHOSPHOELM;PHOSPHOPOINT;PHOSPHOSITE
4,TK,ABL,ABL1,CDK5,10896159;11113134;14757045;15174125;18689816;1...,HPRD;PHOSPHOELM;PHOSPHOPOINT;PHOSPHOSITE;SAVI;...


In [42]:
kea_ph_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/Input/KEA/phosphorylation_reactions.csv', sep=',', header=None)

In [43]:
kea_ph_df.head()

Unnamed: 0,0,1,2,3,4,5
0,TK,ABL,ABL1,ABI1,11418237;12672821;15174125;17101133;18689816;8...,HPRD;MINT;PHOSPHOPOINT;PHOSPHOSITE
1,TK,ABL,ABL1,ABL1,10964922;11781820;12522270;15174125;17192257;1...,HPRD;PHOSPHOELM;PHOSPHOPOINT;PHOSPHOSITE;SWISS...
2,TK,ABL,ABL1,ABL2,12569093;15174125;18689816,HPRD;PHOSPHOPOINT;PHOSPHOSITE
3,TK,ABL,ABL1,ACAP2,17981841,NETWORKIN
4,TK,ABL,ABL1,ANXA1,15174125;2457390,PHOSPHOELM;PHOSPHOSITE


In [44]:
kea_df = pd.concat([kea_df, kea_ph_df])

In [45]:
kea_df.shape

(22478, 6)

In [46]:
kea_df.drop_duplicates(inplace=True)

In [47]:
kea_df.shape

(17401, 6)

In [48]:
kea_df.reset_index(inplace=True)
kea_df.drop('index', axis=1, inplace=True)

In [49]:
#i = 1
for index in kea_df.index:
    #print('\r', str(i)+' Out of '+ str(len(kea_df.index)), end=' ', flush=True)
    kea_df.ix[index, 4] = kea_df.ix[index,4].replace(';', '|')
    kea_df.ix[index, 5] = kea_df.ix[index,5].replace(';', '|')
    #i+=1

In [50]:
kea_df = kea_df[[2,3,4,5]]

In [51]:
kea_df.rename(columns={2: 'Aliases for A', 3:'Aliases for B', 4:'Identifier of the publication', 5:'Source databases'}, inplace=True)

In [19]:
#kea_df.ix[0, 5] = kea_df.ix[0,5].replace(';', '|')

In [53]:
# drop any data that was published with more then 10 PPI's per publication
i = 1
for pub in kea_df['Identifier of the publication'].unique():
    if i % 100 == 0:
        print('\r', str(i)+' Out of '+ str(len(kea_df['Identifier of the publication'].unique())), end=' ', flush=True)
    if kea_df[kea_df['Identifier of the publication'] == pub].shape[0] > 10:
        kea_df.drop(kea_df[kea_df['Identifier of the publication'] == pub].index.tolist(), inplace=True)
    i +=1


 6400 Out of 6365 

In [54]:
kea_df.shape

(8060, 4)

In [60]:
kea_df.to_csv('/Users/moshesilverstein/Desktop/PPI Library/IntProc/kea_human.csv', sep='\t')

### Data Tables

In [122]:
mint_df.head()

Unnamed: 0,Aliases for A,Aliases for B,Identifier of the publication,Source databases
0,FBXO5,SKP1,17380122,"psi-mi:""MI:0471""(MINT)"
12,SLC6A3,DRD2,17380124,"psi-mi:""MI:0471""(MINT)"
53,TCF4,EP300,17410209,"psi-mi:""MI:0471""(MINT)"
54,RUNX1,CDK6,17431401,"psi-mi:""MI:0471""(MINT)"
55,CBFB,RUNX1,17431401,"psi-mi:""MI:0471""(MINT)"


In [123]:
biogrid_df.head()

Unnamed: 0,Aliases for A,Aliases for B,Identifier of the publication,Source databases
0,MAP2K4,FLNC,9006895,"psi-mi:""MI:0463""(biogrid)"
1,MYPN,ACTN2,11309420,"psi-mi:""MI:0463""(biogrid)"
2,ACVR1,FNTA,8599089,"psi-mi:""MI:0463""(biogrid)"
3,GATA2,PML,10938104,"psi-mi:""MI:0463""(biogrid)"
4,RPA2,STAT3,10875894,"psi-mi:""MI:0463""(biogrid)"


In [124]:
innatedb_df.head()

Unnamed: 0,Aliases for A,Aliases for B,Identifier of the publication,Source databases
0,LY96,TLR4,10359581,MI:0974(innatedb)
1,MYD88,TLR4,17228323,MI:0974(innatedb)
2,MYD88,IRAK4,12860405,MI:0974(innatedb)
7,IRAK4,IRAK1,12860405,MI:0974(innatedb)
11,PELI2,IRAK4,12860405,MI:0974(innatedb)


In [125]:
dip_df.head()

Unnamed: 0,Aliases for A,Aliases for B,Identifier of the publication,Source databases
0,CD4,CD4,9168119,MI:0465(dip)
2,RB1,KDM5A,8414517,MI:0465(dip)
3,POU2F2,TBP,8202368,MI:0465(dip)
4,CRK,CBL,9178909,MI:0465(dip)
5,INSR,PIK3R1,7589433,MI:0465(dip)


In [126]:
mentha_df.head()

Unnamed: 0,Aliases for A,Aliases for B,Identifier of the publication,Source databases
31,MYO10,TUBB2B,21642953,(mentha)
34,HDAC6,TUBB2B,12606581,(mentha)
36,ABCB5,PSMB9,15488952,(mentha)
38,DSCC1,CHTF8,12766176|26186194,(mentha)
39,DSCC1,CHTF8,12766176|26186194,(mentha)


In [127]:
intact_df.head()

Unnamed: 0,Aliases for A,Aliases for B,Identifier of the publication,Source databases
22,FHL3,FHL2,11135358,"psi-mi:""MI:0471""(MINT)"
24,EPS15,EPS15,9182572,"psi-mi:""MI:0471""(MINT)"
27,MCM7,FHL2,10649446,"psi-mi:""MI:0471""(MINT)"
28,AR,FHL2,10654935,"psi-mi:""MI:0471""(MINT)"
30,SOS1,ABI1,10499589,"psi-mi:""MI:0471""(MINT)"


In [128]:
savi_df.head()

Unnamed: 0,Aliases for A,Aliases for B,Identifier of the publication,Source databases
0,SYNGAP1,RAP1A,15312654,(SAVI)
1,GSK3A,JUN,16023596|1846781,(SAVI)
2,RIPK1,SQSTM1,10356400,(SAVI)
3,CAMK2A,MAP2,7890745|9341200|2561875,(SAVI)
4,ARRB1,ADRB2,11171997,(SAVI)


In [56]:
kea_df.head()

Unnamed: 0,Aliases for A,Aliases for B,Identifier of the publication,Source databases
0,ABL1,ABL1,10964922|11781820|12522270|15174125|17192257|1...,HPRD|PHOSPHOELM|PHOSPHOPOINT|PHOSPHOSITE|SWISS...
1,ABL1,ABL2,12569093|15174125|18689816,HPRD|PHOSPHOPOINT|PHOSPHOSITE
2,ABL1,BCR,10405761|11780146|12124177|12522270|15174125|1...,HPRD|PHOSPHOELM|PHOSPHOPOINT|PHOSPHOSITE
3,ABL1,BTK,11598012|12445832|12573241|15174125|18689816|8...,HPRD|PHOSPHOELM|PHOSPHOPOINT|PHOSPHOSITE
4,ABL1,CDK5,10896159|11113134|14757045|15174125|18689816|1...,HPRD|PHOSPHOELM|PHOSPHOPOINT|PHOSPHOSITE|SAVI|...


## Comparison and overlap

In [59]:
data_sets= [mint_df, biogrid_df, innatedb_df, dip_df, mentha_df, intact_df, savi_df, kea_df]
data_sets_names= ['mint', 'biogrid', 'innatedb', 'dip', 'mentha', 'intact', 'savi', 'kea']

comp_df = pd.DataFrame(columns=data_sets_names, index=data_sets_names)
comp_df.columns.name = 'Overlap (Human)'
for i in np.arange(len(data_sets)):
    for j in np.arange(len(data_sets)):
        comp = pd.concat([data_sets[i][['Aliases for A', 'Aliases for B']], data_sets[j][['Aliases for A', 'Aliases for B']]])
        comp_df.ix[data_sets_names[i], data_sets_names[j]] = comp.duplicated().sum()

# mint_df[['Aliases for A', 'Aliases for B']].duplicated().sum()
# biogrid_df[['Aliases for A', 'Aliases for B']].duplicated().sum()
comp_df

Overlap (Human),mint,biogrid,innatedb,dip,mentha,intact,savi,kea
mint,5183,2072,299,302,2761,5058,70,413
biogrid,2072,43586,1870,1482,21695,4359,305,2488
innatedb,299,1870,4754,256,1684,598,57,423
dip,302,1482,256,4463,1741,665,40,228
mentha,2761,21695,1684,1741,49501,5150,415,2051
intact,5058,4359,598,665,5150,10998,104,728
savi,70,305,57,40,415,104,1378,364
kea,413,2488,423,228,2051,728,364,8061


## Merged Network Table

In [61]:
mint_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/IntProc/mint_human.csv', sep='\t')
biogrid_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/IntProc/biogrid_human.csv', sep='\t')
innatedb_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/IntProc/innatedb_human.csv', sep='\t')
dip_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/IntProc/dip_human.csv', sep='\t')
mentha_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/IntProc/mentha_human.csv', sep='\t')
intact_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/IntProc/intact_human.csv', sep='\t')
savi_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/IntProc/savi_human.csv', sep='\t')
kea_df = pd.read_csv('/Users/moshesilverstein/Desktop/PPI Library/IntProc/kea_human.csv', sep='\t')

In [62]:
network_df = pd.concat([mint_df, biogrid_df, innatedb_df, dip_df, mentha_df, intact_df, savi_df, kea_df])

In [63]:
network_df.rename(columns={'Aliases for A':'Protein A (gene name)', 'Aliases for B':'Protein B (gene name)', 'Identifier of the publication':'PubMed ID'}, inplace=True)

In [64]:
network_df.sort_values(['Protein A (gene name)', 'Protein B (gene name)'], inplace=True)

In [65]:
network_df.reset_index(inplace=True)
network_df.drop('index', axis=1, inplace=True)

In [66]:
network_df.shape

(127722, 5)

In [None]:
network_df.reset_index(inplace=True)
network_df.drop('index', axis=1, inplace=True)
oldSize = network_df.shape
newSize = 0
while oldSize != newSize:
    network_df.reset_index(inplace=True)
    network_df.drop('index', axis=1, inplace=True)
    # concattonate like terms and remove duplicates
    i = 1
    end = len(network_df.index)-1
    for index in network_df.index:
        if i % 100 == 0:
            print('\r', str(i)+' Out of '+ str(len(network_df.index)), end=' ', flush=True)
        if index in network_df.index.values and index != end:
            if network_df.ix[index, 'Protein A (gene name)'] == network_df.ix[index+1, 'Protein A (gene name)']:
                if network_df.ix[index, 'Protein B (gene name)'] == network_df.ix[index+1, 'Protein B (gene name)']:
                    network_df.ix[index, 'PubMed ID'] = str(network_df.ix[index, 'PubMed ID']) +'|'+str(network_df.ix[index+1, 'PubMed ID'])
                    network_df.ix[index, 'Source databases'] = network_df.ix[index, 'Source databases'] +'|'+network_df.ix[index+1, 'Source databases']
                    network_df.drop(index+1, axis=0, inplace=True)
    i += 1
    oldSize = newSize
    newSize = network_df.shape

In [11]:
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/network_human_%s.tsv'% str(datetime.date.today()).replace('-', '_')
network_df.to_csv(filename, sep='\t', index=False) 

In [12]:
filename = '/Users/moshesilverstein/Desktop/PPI Library/Output/network_human_%s.tsv.gz'% str(datetime.date.today()).replace('-', '_')
network_df.to_csv(filename, sep='\t', index=False, compression='gzip') 

In [None]:
network_df.head(30)

## Merged DataFrame 

In [None]:
# drop PPI to self for the DIP data 
for index in dip_df.index:
    if dip_df.ix[index, 'Aliases for A'] == dip_df.ix[index, 'Aliases for B']:
        dip_df.drop(index, inplace=True)

In [None]:
df = pd.concat([mint_df, biogrid_df, innatedb_df, dip_df, mentha_df, intact_df, savi_df])

In [None]:
df = df[['Aliases for A', 'Aliases for B']]

In [None]:
df.shape

In [None]:
df.drop_duplicates(['Aliases for A', 'Aliases for B'], inplace=True)

In [None]:
df.dropna(inplace=True)

In [None]:
df.shape

In [None]:
df.head()

## Gene Set Library

In [None]:
# build library

i = 1

genelist = set(df['Aliases for A'].values.tolist() + df['Aliases for B'].values.tolist())

lib = pd.DataFrame()

for gene in genelist:
    if i % 100 == 0:
        print('\r', str(i)+' Out of '+ str(len(genelist)), end=' ', flush=True)
    lst = set(df[df.values == gene].values.flatten().tolist())
    lst.remove(gene)
    lst = list(lst)
    lst.insert(0, gene)
    if len(lst) >= 50:
        temp = pd.DataFrame(data=lst)
        lib = pd.concat([lib, temp.T])
    i += 1

In [None]:
lib.to_csv('PPI_HUMAN.gmt', sep='\t', header=None, index=False)

In [None]:
lib.head(20)

In [None]:
lib.shape