# JENSEN LAB (DISEASE)

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Source: http://diseases.jensenlab.org/Search

In [1]:
import datetime
import importlib
import os
import sys

import numpy as np
import pandas as pd

import my_functions as mf
%matplotlib inline

In [2]:
# Re-import my_functions so that edits made to the helper module after the
# first import take effect without restarting the kernel.
importlib.reload(mf)

<module 'my_functions' from '/Users/moshesilverstein/Documents/Harmonizome/Jensen/Disease/my_functions.py'>

# LOAD DATA

In [3]:
# Column labels supplied via names= when the three input TSV files are read.
# NOTE(review): the preview of the loaded table shows DOID:... identifiers
# under 'BTO', so that label looks like a leftover from another notebook -
# confirm before relying on it.
col = [
    'Ensemble Acc',
    'GeneSym',
    'BTO',
    'Disease',
    'Source',
    'SampleInfo',
    'Value',
]

In [4]:
# Experiments table: tab-separated, no header row, labelled with `col`.
dfE = pd.read_csv(
    'Input/human_disease_experiments_filtered.tsv',
    sep='\t',
    header=None,
    names=col,
)

In [5]:
# Knowledge table: tab-separated, no header row, labelled with `col`.
dfK = pd.read_csv(
    'Input/human_disease_knowledge_filtered.tsv',
    sep='\t',
    header=None,
    names=col,
)

In [6]:
# Text-mining table: tab-separated, no header row, labelled with `col`.
# NOTE(review): the same labels are reused for all three files; confirm that
# this file's trailing columns really correspond to 'Source', 'SampleInfo'
# and 'Value'.
dfT = pd.read_csv(
    'Input/human_disease_textmining_filtered.tsv',
    sep='\t',
    header=None,
    names=col,
)

In [7]:
# Stack the experiments, knowledge and text-mining tables row-wise; each
# frame keeps its own row labels.
df = pd.concat([dfE, dfK, dfT], axis=0, ignore_index=False)

In [8]:
# Preview the first rows of the combined table.
df.head()

Unnamed: 0,Ensemble Acc,GeneSym,BTO,Disease,Source,SampleInfo,Value
0,ENSP00000000412,M6PR,DOID:2377,Multiple sclerosis,DistiLD,p-value = 5e-14,2
1,ENSP00000000442,ESRRA,DOID:305,Carcinoma,COSMIC,11 samples,0
2,ENSP00000001008,FKBP4,DOID:305,Carcinoma,COSMIC,14 samples,0
3,ENSP00000001146,CYP26B1,DOID:305,Carcinoma,COSMIC,14 samples,0
4,ENSP00000002165,FUCA2,DOID:305,Carcinoma,COSMIC,10 samples,0


In [9]:
# (rows, columns) of the combined table.
df.shape

(72762, 7)

# Get Only Gene-Disease Data

In [10]:
# Keep only the gene symbol and disease name columns.
df = df.loc[:, ['GeneSym', 'Disease']]

In [11]:
# Remove repeated (GeneSym, Disease) pairs. The result is rebound instead of
# using inplace=True: df here is a column subset of the combined table, and
# mutating such a subset in place is what triggers pandas'
# SettingWithCopyWarning.
df = df.drop_duplicates()

In [12]:
# Use the gene symbol as the row index.
df = df.set_index('GeneSym')

In [13]:
# Preview the gene-indexed gene/disease pairs.
df.head()

Unnamed: 0_level_0,Disease
GeneSym,Unnamed: 1_level_1
M6PR,Multiple sclerosis
ESRRA,Carcinoma
FKBP4,Carcinoma
CYP26B1,Carcinoma
FUCA2,Carcinoma


In [14]:
# (rows, columns) after de-duplication and re-indexing.
df.shape

(69735, 1)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [15]:
# Per the section heading, maps the gene symbols in df to up-to-date approved
# symbols. The return value is discarded, and the shape cells on either side
# of this call show df going from 69735 to 67957 rows, so the helper
# presumably edits df in place and drops rows - verify in my_functions.
mf.mapgenesymbols(df)

Progeres: 100%  69735 Out of 69735   

In [16]:
# (rows, columns) after the gene-symbol mapping step.
df.shape

(67957, 1)

# Create Binary Matrix

In [17]:
# Move GeneSym out of the index and back into a regular column.
df = df.reset_index()

In [18]:
# Build the matrix from the (GeneSym, Disease) pairs. The preview of the
# result shows genes as rows and diseases as columns; presumably a 1 marks an
# association and 0 its absence - confirm in my_functions.
binary_matrix = mf.createBinaryMatix(df)

Progeres: 100%  16089 Out of 16089   

In [19]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,Gastroesophageal reflux disease,Protein C deficiency,Schistosomiasis,Lemierre's syndrome,Choroid cancer,Acrocephalosyndactylia,Juvenile dermatitis herpetiformis,Kidney disease,Legg-Calve-Perthes Disease,Carbuncle,...,Tinea pedis,Diffuse lipomatosis,Histidinemia,Musculoskeletal system disease,Atrioventricular block,Cerebral lipidosis,Dyspepsia,Leydig cell tumor,Peroxisomal disease,Frontal sinus cancer
NOL4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RFLNA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OPRPN,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PRPF31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHM,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# (rows, columns) of the binary matrix.
binary_matrix.shape

(16089, 3690)

# Save Binary Matrix

In [21]:
# Save the binary matrix as a gzip-compressed TSV stamped with the current
# year and month (YYYY_MM). The extension is .gz so that it matches the gzip
# compression actually used; a .zip name would misdescribe the file and
# break readers that infer the format from the extension.
filename = '~/./Documents/Harmonizome/Jensen/Disease/Output/jensen_disease_binary_matrix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [22]:
# Output directory for the gene set library, resolved against the current
# user's home directory rather than one hard-coded account. expanduser
# returns a plain string with the trailing slash preserved.
path = os.path.expanduser('~/Documents/Harmonizome/Jensen/Disease/Output/')

In [23]:
# Base name passed to createUpGeneSetLib together with `path`.
name = 'jensen_disease_gene_set'

In [24]:
# Create the gene set library from the binary matrix. `path` and `name` are
# handed to the helper, which presumably combines them into the output file
# location - confirm in my_functions.
mf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  3690 Out of 3690   

# Create Attribute Library

In [25]:
# Output directory for the attribute library, resolved against the current
# user's home directory rather than one hard-coded account. expanduser
# returns a plain string with the trailing slash preserved.
path = os.path.expanduser('~/Documents/Harmonizome/Jensen/Disease/Output/')

In [26]:
# Base name passed to createUpAttributeSetLib together with `path`.
name = 'jensen_disease_attribute_set'

In [27]:
# Create the attribute library from the binary matrix. `path` and `name` are
# handed to the helper, which presumably combines them into the output file
# location - confirm in my_functions.
mf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  16089 Out of 16089   

# Create Gene Similarity Matrix

In [28]:
# Gene-by-gene similarity computed from the rows of the binary matrix with
# the 'jaccard' option; the preview of the result shows gene symbols on both
# axes and 1.0 on the diagonal.
gene_similarity_matix = mf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [29]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,NOL4,RFLNA,OPRPN,PRPF31,CHM,E2F8,PARPBP,NRP1,RNU11,PTDSS2,...,DYNC1I1,C14orf177,RASA3,MCCC2,TBX6,FEM1B,MGAT4B,CCDC7,CACNB2,ABCA7
NOL4,1.0,0.0,0.0,0.125,0.142857,0.2,0.125,0.166667,0.0,0.25,...,0.333333,0.0,0.285714,0.125,0.0,0.25,0.25,0.25,0.181818,0.166667
RFLNA,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OPRPN,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PRPF31,0.125,0.0,0.0,1.0,0.125,0.166667,0.111111,0.142857,0.0,0.2,...,0.125,0.0,0.111111,0.111111,0.0,0.2,0.2,0.2,0.076923,0.071429
CHM,0.142857,0.0,0.0,0.125,1.0,0.2,0.125,0.166667,0.0,0.25,...,0.142857,0.0,0.125,0.125,0.0,0.25,0.25,0.25,0.083333,0.076923


## Save Gene Similarity Matrix 

In [30]:
# Save the gene similarity matrix as a gzip-compressed TSV stamped with the
# current year and month (YYYY_MM). The extension is .gz so that it matches
# the gzip compression actually used; a .zip name would misdescribe the file
# and break readers that infer the format from the extension.
filename = '~/./Documents/Harmonizome/Jensen/Disease/Output/jensen_disease_gene_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [31]:
# Same similarity helper applied to the transposed binary matrix, so the
# comparison runs over diseases instead of genes; the preview of the result
# shows disease names on both axes and 1.0 on the diagonal.
attribute_similarity_matix = mf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [32]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,Gastroesophageal reflux disease,Protein C deficiency,Schistosomiasis,Lemierre's syndrome,Choroid cancer,Acrocephalosyndactylia,Juvenile dermatitis herpetiformis,Kidney disease,Legg-Calve-Perthes Disease,Carbuncle,...,Tinea pedis,Diffuse lipomatosis,Histidinemia,Musculoskeletal system disease,Atrioventricular block,Cerebral lipidosis,Dyspepsia,Leydig cell tumor,Peroxisomal disease,Frontal sinus cancer
Gastroesophageal reflux disease,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005051,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.183673,0.0,0.0,0.0
Protein C deficiency,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.015584,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Schistosomiasis,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.029925,0.0,0.0,...,0.0,0.0,0.0,0.021739,0.0,0.0,0.013889,0.0,0.0,0.0
Lemierre's syndrome,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Choroid cancer,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save Attribute Similarity Matrix

In [33]:
# Save the attribute similarity matrix as a gzip-compressed TSV stamped with
# the current year and month (YYYY_MM). The extension is .gz so that it
# matches the gzip compression actually used; a .zip name would misdescribe
# the file and break readers that infer the format from the extension.
filename = '~/./Documents/Harmonizome/Jensen/Disease/Output/jensen_disease_attribute_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [34]:
# Build the gene list from the binary matrix; the preview of the result shows
# one row per gene with GeneSym and GeneID columns. Where the helper gets the
# IDs from is not visible here - see my_functions.
gene_list = mf.createGeneList(binary_matrix)

Progeres: 100%  16089 Out of 16089   

In [35]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,NOL4,8715
1,RFLNA,144347
2,OPRPN,58503
3,PRPF31,26121
4,CHM,1121


In [36]:
# (rows, columns) of the gene list.
gene_list.shape

(16089, 2)

### Save Gene List

In [38]:
# Save the gene list (without the row index) as a gzip-compressed TSV stamped
# with the current year and month (YYYY_MM). The extension is .gz so that it
# matches the gzip compression actually used; a .zip name would misdescribe
# the file and break readers that infer the format from the extension.
filename = '~/./Documents/Harmonizome/Jensen/Disease/Output/jensen_disease_gene_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [39]:
# Build the attribute list from the binary matrix; the preview of the result
# shows a single 'Attributes' column holding disease names.
attribute_list = mf.createAttributeList(binary_matrix)

In [40]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0,Attributes
0,Gastroesophageal reflux disease
1,Protein C deficiency
2,Schistosomiasis
3,Lemierre's syndrome
4,Choroid cancer


In [41]:
# (rows, columns) of the attribute list.
attribute_list.shape

(3690, 1)

### Save Attribute List

In [42]:
# Save the attribute list (without the row index) as a gzip-compressed TSV
# stamped with the current year and month (YYYY_MM). The extension is .gz so
# that it matches the gzip compression actually used; a .zip name would
# misdescribe the file and break readers that infer the format from the
# extension.
filename = '~/./Documents/Harmonizome/Jensen/Disease/Output/jensen_disease_attribute_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [44]:
# Build the edge list from the binary matrix and the gene list. The preview
# of the result shows Attribute, Gene, GeneID and Weight columns, and its
# recorded row count (59368410) equals 16089 genes x 3690 attributes, so
# zero-weight pairs are included as well.
gene_attribute_edge_list = mf.createGeneAttributeEdgeList(binary_matrix, gene_list)

Progeres: 100%  3690 Out of 3690   

In [45]:
# Preview the first rows of the gene-attribute edge list.
gene_attribute_edge_list.head()

Unnamed: 0,Attribute,Gene,GeneID,Weight
0,Gastroesophageal reflux disease,NOL4,8715,0.0
1,Gastroesophageal reflux disease,RFLNA,144347,0.0
2,Gastroesophageal reflux disease,OPRPN,58503,0.0
3,Gastroesophageal reflux disease,PRPF31,26121,0.0
4,Gastroesophageal reflux disease,CHM,1121,0.0


In [46]:
# (rows, columns) of the full gene-attribute edge list.
gene_attribute_edge_list.shape

(59368410, 4)

### Get Number of (Statistically Relevant) Gene-Attribute Associations

In [47]:
# (rows, columns) of the edge-list rows whose Weight is non-zero.
gene_attribute_edge_list.loc[gene_attribute_edge_list['Weight'].ne(0)].shape

(67890, 4)

### Save Gene-Attribute Edge List

In [48]:
# Save the gene-attribute edge list (without the row index) as a
# gzip-compressed TSV stamped with the current year and month (YYYY_MM). The
# extension is .gz so that it matches the gzip compression actually used; a
# .zip name would misdescribe the file and break readers that infer the
# format from the extension.
filename = '~/./Documents/Harmonizome/Jensen/Disease/Output/jensen_disease_gene_attribute_edge_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_attribute_edge_list.to_csv(filename, sep='\t', index=False, compression='gzip')