# GeneSigDB

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: http://www.genesigdb.org

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
# Re-import the local helper module so edits made to untility_functions.py
# are picked up without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/GeneSigDB/untility_functions.py'>

# Load Data

In [8]:
# Load the GMT file with no header row. read_csv uses its default comma
# separator here, so (as the preview below shows) every record lands whole in
# column 0 with its tab separators still embedded in the string.
# NOTE(review): this relies on no record containing a comma — confirm for new
# versions of the input file.
df = pd.read_csv('Input/ALL_SIGSv4.gmt', header=None)

In [9]:
# Preview the first raw records.
df.head()

Unnamed: 0,0
0,19930681-Table2-3\tHuman Breast_Cecco09_17gene...
1,16760442-SuppTable5-1\tHuman Lymphoma_Hummel06...
2,19930681-Table2-4\tHuman Breast_Cecco09_2genes...
3,16760442-SuppTable5-2\tHuman Lymphoma_Hummel06...
4,16735089-Table1\tHuman Breast_Amin06_82genes_S...


In [10]:
# Each record is tab-separated: field 0 is the signature identifier, field 1 is
# a description (dropped), and every remaining field is a gene symbol.
# Collect one (PubmedSource, Gene) pair per gene and build the DataFrame once at
# the end instead of concatenating onto a growing frame on every iteration.
sources = []
genes = []
total = len(df.index)

for i, index in enumerate(df.index):

    progressPercent = ((i+1)/total)*100

    sys.stdout.write("Progeres: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), total))
    sys.stdout.flush()

    # .loc replaces the .ix indexer, which was removed in pandas 1.0.
    fields = df.loc[index, 0].split('\t')
    # Keep only the part of the identifier before the first underscore.
    source = fields[0].split('_')[0]
    # Skip the identifier and the description; a record with no genes
    # simply contributes no rows.
    record_genes = fields[2:]
    sources.extend([source]*len(record_genes))
    genes.extend(record_genes)

# Rows are numbered 0..N-1 here (the old loop restarted the numbering for every
# record); the index is discarded by set_index('Gene') further down.
df_interactions = pd.DataFrame({'PubmedSource': sources, 'Gene': genes},
                               columns=['PubmedSource', 'Gene'])

Progeres: 100%  3515 Out of 3515   

In [11]:
# Preview the (PubmedSource, Gene) pairs extracted from the raw records.
df_interactions.head()

Unnamed: 0,PubmedSource,Gene
0,19930681-Table2-3,ST6GAL1
1,19930681-Table2-3,RAB21
2,19930681-Table2-3,ESR1
3,19930681-Table2-3,MYO1C
4,19930681-Table2-3,LRBA


In [12]:
# Row count before symbol mapping and de-duplication.
df_interactions.shape

(425763, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [13]:
# Move the gene symbols into the index ahead of the symbol-mapping step.
df_interactions = df_interactions.set_index('Gene')

In [14]:
# Map the gene symbols held in the index to current approved symbols.
# NOTE(review): the return value is not captured, so this presumably modifies
# df_interactions in place — confirm in untility_functions.mapgenesymbols.
uf.mapgenesymbols(df_interactions)

Progeres: 100%  425763 Out of 425763   

# Drop Duplicates

In [15]:
# Turn the gene-symbol index back into an ordinary column.
df_interactions = df_interactions.reset_index()

In [16]:
# Keep a single row for every distinct combination of column values.
df_interactions = df_interactions.drop_duplicates()

In [17]:
# Row count after de-duplication.
df_interactions.shape

(404516, 2)

# Create Binary Matrix

In [18]:
# Build the matrix from the de-duplicated pairs. Per the preview and shape shown
# below, rows are gene symbols and columns are signature identifiers.
# NOTE(review): presumably a cell is 1 when the gene belongs to the signature and
# 0 otherwise — confirm in untility_functions.createBinaryMatix.
binary_matrix = uf.createBinaryMatix(df_interactions)

Progeres: 100%  18534 Out of 18534   

In [19]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,17880687-TableS1,19933690-TableS2,15902281-TableS1a,18662380-S3-ERBB2,18288381-Table4,18510698-Table2,18160667-TableS4,12036940-Table2,18855877-Table2,20068100-TableS5,...,19936789-Table1,16449190-Table1,19112599-Table2b,15073102-Table1d,15548366-Table3,20679228-TableS5,16195394-Table2,15831674-Table3,15246160-table4,17205517-Top100GoodPrognosisGenes
LIN7C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR5D18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZYG11B,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PTMAP1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
INHBB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# (number of rows, number of columns) of the binary matrix.
binary_matrix.shape

(18534, 3508)

# Save Binary Matrix

In [21]:
# Stamp the output file with the current year and month, formatted YYYY_MM.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GeneSigDB/Output/genesigdb_idg_binary_matrix_%s.tsv.zip' % date_stamp
# NOTE(review): the file name ends in .zip but the data is written with gzip
# compression, so zip tools will not open it. Decide whether the extension or the
# compression should change, and check downstream consumers first.
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [22]:
# Output directory handed to uf.createUpGeneSetLib below.
# NOTE(review): absolute path tied to one user's machine.
path = '/Users/moshesilverstein/Documents/Harmonizome/GeneSigDB/Output/'

In [23]:
# Name handed to uf.createUpGeneSetLib below.
name = 'genesigdb_idg_gene_set'

In [24]:
# NOTE(review): presumably writes a gene-set library built from binary_matrix
# into `path`, using `name` for the file name — confirm in untility_functions.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  3508 Out of 3508   

# Create Attribute Library

In [25]:
# Output directory handed to uf.createUpAttributeSetLib below.
# NOTE(review): absolute path tied to one user's machine.
path = '/Users/moshesilverstein/Documents/Harmonizome/GeneSigDB/Output/'

In [26]:
# Name handed to uf.createUpAttributeSetLib below.
name = 'genesigdb_idg_attribute_set'

In [27]:
# NOTE(review): presumably writes an attribute-set library built from
# binary_matrix into `path`, using `name` for the file name — confirm in
# untility_functions.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  18534 Out of 18534   

# Create Gene Similarity Matrix

In [28]:
# Request a 'jaccard' similarity matrix for binary_matrix as-is (genes as rows).
# The preview below is gene-by-gene with 1.0 on the diagonal.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [29]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,LIN7C,OR5D18,ZYG11B,PTMAP1,INHBB,TRBV10-2,ZNF488,ZSCAN4,ZMIZ2,ZC4H2,...,CIT,KAT6A,SSFA2,PTMAP3,KRT12,SNAP29,LMNTD1,EDA2R,PI16,CXCL16
LIN7C,1.0,0.0,0.151515,0.0,0.021277,0.041667,0.0,0.0,0.026316,0.0,...,0.029412,0.046875,0.0,0.0,0.0,0.076923,0.0,0.032258,0.0,0.051724
OR5D18,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZYG11B,0.151515,0.0,1.0,0.0,0.035294,0.0,0.0,0.0,0.0,0.0,...,0.016393,0.0,0.021739,0.0,0.0,0.030303,0.0,0.0,0.047619,0.019231
PTMAP1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.021277,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
INHBB,0.021277,0.0,0.035294,0.0,1.0,0.013514,0.023529,0.0,0.0,0.02439,...,0.025641,0.054054,0.060606,0.0,0.039474,0.010989,0.0,0.037975,0.0,0.037383


# Save Gene Similarity Matrix

In [30]:
# Stamp the output file with the current year and month, formatted YYYY_MM.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GeneSigDB/Output/genesigdb_idg_gene_similarity_matix_%s.tsv.zip' % date_stamp
# NOTE(review): the file name ends in .zip but the data is written with gzip
# compression, so zip tools will not open it. Decide whether the extension or the
# compression should change, and check downstream consumers first.
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [31]:
# Same call as for the gene similarity matrix, but on the transpose so that the
# signatures become the rows being compared. The preview below is
# signature-by-signature with 1.0 on the diagonal.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [32]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,17880687-TableS1,19933690-TableS2,15902281-TableS1a,18662380-S3-ERBB2,18288381-Table4,18510698-Table2,18160667-TableS4,12036940-Table2,18855877-Table2,20068100-TableS5,...,19936789-Table1,16449190-Table1,19112599-Table2b,15073102-Table1d,15548366-Table3,20679228-TableS5,16195394-Table2,15831674-Table3,15246160-table4,17205517-Top100GoodPrognosisGenes
17880687-TableS1,1.0,0.0,0.026127,0.001754,0.010318,0.002622,0.010161,0.001729,0.000877,0.013633,...,0.003493,0.004371,0.0,0.001765,0.003463,0.002641,0.001756,0.00604,0.003506,0.009167
19933690-TableS2,0.0,1.0,0.003922,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15902281-TableS1a,0.026127,0.003922,1.0,0.0,0.004016,0.004255,0.003876,0.0,0.0,0.008562,...,0.004237,0.004237,0.0,0.002179,0.00207,0.00216,0.0,0.00409,0.0,0.009416
18662380-S3-ERBB2,0.001754,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.0
18288381-Table4,0.010318,0.0,0.004016,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010638,0.0,0.007194


# Save Attribute Similarity Matrix

In [33]:
# Stamp the output file with the current year and month, formatted YYYY_MM.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GeneSigDB/Output/genesigdb_idg_attribute_similarity_matix_%s.tsv.zip' % date_stamp
# NOTE(review): the file name ends in .zip but the data is written with gzip
# compression, so zip tools will not open it. Decide whether the extension or the
# compression should change, and check downstream consumers first.
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [34]:
# Build the gene table from binary_matrix; the preview below shows one row per
# gene with columns GeneSym and GeneID.
# NOTE(review): where the GeneID values come from is not visible here — see
# untility_functions.createGeneList.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  18534 Out of 18534   

In [35]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,LIN7C,55327
1,OR5D18,219438
2,ZYG11B,79699
3,PTMAP1,5758
4,INHBB,3625


In [36]:
# (number of rows, number of columns) of the gene list.
gene_list.shape

(18534, 2)

# Save Gene List

In [37]:
# Stamp the output file with the current year and month, formatted YYYY_MM.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GeneSigDB/Output/genesigdb_idg_gene_list_%s.tsv.zip' % date_stamp
# NOTE(review): the file name ends in .zip but the data is written with gzip
# compression, so zip tools will not open it. Decide whether the extension or the
# compression should change, and check downstream consumers first.
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [38]:
# Build the attribute table from binary_matrix; the preview below shows a single
# 'Attributes' column holding the signature identifiers.
attribute_list = uf.createAttributeList(binary_matrix)

In [39]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0,Attributes
0,17880687-TableS1
1,19933690-TableS2
2,15902281-TableS1a
3,18662380-S3-ERBB2
4,18288381-Table4


In [40]:
# (number of rows, number of columns) of the attribute list.
attribute_list.shape

(3508, 1)

# Save Attribute List

In [41]:
# Stamp the output file with the current year and month, formatted YYYY_MM.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GeneSigDB/Output/genesigdb_idg_attribute_list_%s.tsv.zip' % date_stamp
# NOTE(review): the file name ends in .zip but the data is written with gzip
# compression, so zip tools will not open it. Decide whether the extension or the
# compression should change, and check downstream consumers first.
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [42]:
# Output directory handed to uf.createGeneAttributeEdgeList below.
# NOTE(review): absolute path tied to one user's machine.
path = '/Users/moshesilverstein/Documents/Harmonizome/GeneSigDB/Output/'

In [43]:
# Name handed to uf.createGeneAttributeEdgeList below.
name = 'genesigdb_idg_gene_attribute_edge_list'

In [44]:
# Build the gene-attribute edge list from binary_matrix and gene_list; the call
# prints the number of associations it found (see the output below).
# NOTE(review): presumably the result is written into `path` using `name` —
# confirm in untility_functions.
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  3508 Out of 3508   

 The number of statisticaly relevent gene-attribute associations is: 404516
