# JENSEN LAB (DISEASE)

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Source: http://diseases.jensenlab.org/Search

In [1]:
import sys, datetime
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
# Re-import the helper module so that edits made to it on disk are picked up
# without restarting the session.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/Jensen/Disease/untility_functions.py'>

# LOAD DATA

In [3]:
# Column labels passed via `names=` to each of the three input tables.
# NOTE(review): in the preview output the 'BTO' column holds DOID identifiers
# and 'Ensemble Acc' holds ENSP accessions, so those two labels look
# mislabelled / misspelled. Only 'GeneSym', 'Disease' and 'Value' are used
# later in this notebook.
col = ['Ensemble Acc', 'GeneSym', 'BTO', 'Disease', 'Source', 'SampleInfo', 'Value']

In [4]:
# Load the "experiments" table: tab-separated, no header row is consumed,
# and the column labels come from `col`.
dfE = pd.read_csv(
    'Input/human_disease_experiments_filtered.tsv',
    sep='\t',
    header=None,
    names=col,
)

In [5]:
# Load the "knowledge" table: tab-separated, no header row is consumed,
# and the column labels come from `col`.
dfK = pd.read_csv(
    'Input/human_disease_knowledge_filtered.tsv',
    sep='\t',
    header=None,
    names=col,
)

In [6]:
# Load the "textmining" table: tab-separated, no header row is consumed,
# and the column labels come from `col`.
# NOTE(review): confirm this file really shares the column layout of the
# other two tables, since the same labels are applied to all three.
dfT = pd.read_csv(
    'Input/human_disease_textmining_filtered.tsv',
    sep='\t',
    header=None,
    names=col,
)

In [7]:
# Stack the three tables row-wise into one frame. Each table keeps its own
# row labels, so labels repeat across the combined frame.
df = pd.concat(objs=[dfE, dfK, dfT], axis=0)

In [8]:
# Preview the first rows of the combined table.
df.head()

Unnamed: 0,Ensemble Acc,GeneSym,BTO,Disease,Source,SampleInfo,Value
0,ENSP00000000412,M6PR,DOID:2377,Multiple sclerosis,DistiLD,p-value = 5e-14,2
1,ENSP00000000442,ESRRA,DOID:305,Carcinoma,COSMIC,11 samples,0
2,ENSP00000001008,FKBP4,DOID:305,Carcinoma,COSMIC,14 samples,0
3,ENSP00000001146,CYP26B1,DOID:305,Carcinoma,COSMIC,14 samples,0
4,ENSP00000002165,FUCA2,DOID:305,Carcinoma,COSMIC,10 samples,0


In [9]:
# (rows, columns) of the combined table.
df.shape

(72762, 7)

# Get Only Gene-Disease Data

In [10]:
# Keep only the rows whose 'Value' entry is non-zero.
# NOTE(review): what a zero 'Value' signifies is not visible here -- confirm
# against the data source before relying on this filter.
has_nonzero_value = df['Value'] != 0
df = df[has_nonzero_value]

In [11]:
# Reduce the table to the gene symbol / disease name pair.
df = df.loc[:, ['GeneSym', 'Disease']]

In [12]:
# Collapse repeated rows so that each remaining row is unique.
df = df.drop_duplicates()

In [13]:
# Use the gene symbol as the row label, leaving 'Disease' as the only column.
df = df.set_index('GeneSym')

In [14]:
# Preview the gene-indexed table.
df.head()

Unnamed: 0_level_0,Disease
GeneSym,Unnamed: 1_level_1
M6PR,Multiple sclerosis
HS3ST1,Liver disease
CFTR,Carcinoma
USP28,Carcinoma
BAIAP2L1,Prostate cancer


In [15]:
# (rows, columns) after filtering and de-duplication.
df.shape

(53714, 1)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [16]:
# The return value is discarded, so this call is made for its effect on `df`.
# NOTE(review): presumably rewrites/drops index entries in place to match
# approved gene symbols -- confirm in untility_functions.
uf.mapgenesymbols(df)

Progeres: 99%  53601 Out of 53714   

In [17]:
# (rows, columns) after the gene-symbol mapping step.
df.shape

# Create Binary Matrix

In [18]:
# Move the row labels back into an ordinary column and renumber the rows.
df = df.reset_index()

In [19]:
# Build the matrix from the two-column table. In the previews, rows are gene
# symbols, columns are disease names and the entries shown are 0.
# NOTE(review): presumably 1 marks an observed gene-disease pair -- confirm
# in untility_functions.
binary_matrix = uf.createBinaryMatix(df)

Progeres: 100%  13149 Out of 13149   

In [20]:
# Preview the first rows of the matrix.
binary_matrix.head()

Unnamed: 0,Hartnup disease,Hordeolum,Rh isoimmunization,Diffuse idiopathic skeletal hyperostosis,Contagious pustular dermatitis,Reticular dysgenesis,Frontal sinus cancer,Congenital afibrinogenemia,Insulinoma,Hyperlysinemia,...,Glycoproteinosis,Bart-Pumphrey syndrome,Simpson-Golabi-Behmel syndrome,brachydactyly-syndactyly syndrome,Inverted follicular keratosis,Mucosulfatidosis,Chronic intestinal vascular insufficiency,inclusion-cell disease,Gingivitis,Myositis ossificans
FZD3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
INTS11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CTDP1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PHKG1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CCDC155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# (number of genes, number of diseases).
binary_matrix.shape

(13149, 3679)

# Save Binary Matrix

In [22]:
# Stamp the output with the current year and month, e.g. "2017_07".
month_stamp = datetime.date.today().strftime('%Y_%m')
# The table is written gzip-compressed, so the file carries a ".gz" suffix.
# A gzip stream is not a zip archive; a ".zip" suffix misleads any tool (and
# pandas' own compression inference on read) that goes by the file name.
filename = '~/./Documents/Harmonizome/Jensen/Disease/Output/jensen_disease_binary_matrix_%s.tsv.gz' % month_stamp
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [23]:
# Output directory handed to the library writer below.
# NOTE(review): this is a machine-specific absolute path, whereas the save
# cells in this notebook use a '~'-relative path to the same folder.
path = '/Users/moshesilverstein/Documents/Harmonizome/Jensen/Disease/Output/'

In [24]:
# Base name handed to the gene set library writer.
name = 'jensen_disease_gene_set'

In [25]:
# Called for its side effect; the return value is not used.
# NOTE(review): presumably writes one gene set per attribute under `path`
# using `name` -- confirm the file naming in untility_functions.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  3679 Out of 3679   

# Create Attribute Library

In [26]:
# Output directory handed to the library writer below (same value as used
# for the gene set library).
# NOTE(review): machine-specific absolute path.
path = '/Users/moshesilverstein/Documents/Harmonizome/Jensen/Disease/Output/'

In [27]:
# Base name handed to the attribute set library writer.
name = 'jensen_disease_attribute_set'

In [28]:
# Called for its side effect; the return value is not used.
# NOTE(review): presumably writes one attribute set per gene under `path`
# using `name` -- confirm the file naming in untility_functions.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  13149 Out of 13149   

# Create Gene Similarity Matrix

In [29]:
# Pairwise similarity between the rows (genes) of the binary matrix, with
# 'jaccard' passed as the metric name. The preview shows a gene-by-gene
# table with 1.0 on the diagonal.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [30]:
# Preview the first rows of the gene-by-gene similarity table.
gene_similarity_matix.head()

Unnamed: 0,FZD3,INTS11,CTDP1,PHKG1,CCDC155,SELENOO,NLRP11,TMEM74,INHBE,RSBN1,...,ADCK2,TUBA1B,OGDH,TIFAB,TRIM25,FGF4,USP36,SKP1,SH2D4B,LINC00982
FZD3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
INTS11,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CTDP1,0.0,0.0,1.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PHKG1,0.0,0.0,0.166667,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CCDC155,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save Gene Similarity Matrix 

In [31]:
# Stamp the output with the current year and month, e.g. "2017_07".
month_stamp = datetime.date.today().strftime('%Y_%m')
# The table is written gzip-compressed, so the file carries a ".gz" suffix.
# A gzip stream is not a zip archive; a ".zip" suffix misleads any tool (and
# pandas' own compression inference on read) that goes by the file name.
filename = '~/./Documents/Harmonizome/Jensen/Disease/Output/jensen_disease_gene_similarity_matix_%s.tsv.gz' % month_stamp
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [32]:
# Same similarity computation as for genes, but on the transposed matrix so
# that the attributes (diseases) are the rows being compared. The preview
# shows a disease-by-disease table with 1.0 on the diagonal.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [33]:
# Preview the first rows of the disease-by-disease similarity table.
attribute_similarity_matix.head()

Unnamed: 0,Hartnup disease,Hordeolum,Rh isoimmunization,Diffuse idiopathic skeletal hyperostosis,Contagious pustular dermatitis,Reticular dysgenesis,Frontal sinus cancer,Congenital afibrinogenemia,Insulinoma,Hyperlysinemia,...,Glycoproteinosis,Bart-Pumphrey syndrome,Simpson-Golabi-Behmel syndrome,brachydactyly-syndactyly syndrome,Inverted follicular keratosis,Mucosulfatidosis,Chronic intestinal vascular insufficiency,inclusion-cell disease,Gingivitis,Myositis ossificans
Hartnup disease,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Hordeolum,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Rh isoimmunization,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Diffuse idiopathic skeletal hyperostosis,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Contagious pustular dermatitis,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save Attribute Similarity Matrix

In [34]:
# Stamp the output with the current year and month, e.g. "2017_07".
month_stamp = datetime.date.today().strftime('%Y_%m')
# The table is written gzip-compressed, so the file carries a ".gz" suffix.
# A gzip stream is not a zip archive; a ".zip" suffix misleads any tool (and
# pandas' own compression inference on read) that goes by the file name.
filename = '~/./Documents/Harmonizome/Jensen/Disease/Output/jensen_disease_attribute_similarity_matix_%s.tsv.gz' % month_stamp
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [35]:
# Build the gene table from the binary matrix. The preview shows one row per
# gene with 'GeneSym' and 'GeneID' columns.
# NOTE(review): where the GeneID values are looked up is not visible here --
# confirm in untility_functions.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  13149 Out of 13149   

In [36]:
# Preview the first rows of the gene table.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,FZD3,7976
1,INTS11,54973
2,CTDP1,9150
3,PHKG1,5260
4,CCDC155,147872


In [37]:
# (number of genes, number of columns).
gene_list.shape

(13149, 2)

### Save Gene List

In [38]:
# Stamp the output with the current year and month, e.g. "2017_07".
month_stamp = datetime.date.today().strftime('%Y_%m')
# The table is written gzip-compressed, so the file carries a ".gz" suffix.
# A gzip stream is not a zip archive; a ".zip" suffix misleads any tool (and
# pandas' own compression inference on read) that goes by the file name.
filename = '~/./Documents/Harmonizome/Jensen/Disease/Output/jensen_disease_gene_list_%s.tsv.gz' % month_stamp
# The positional row index carries no information, so it is not written.
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [39]:
# Build the attribute table from the binary matrix. The preview shows a
# single 'Attributes' column listing the disease names.
attribute_list = uf.createAttributeList(binary_matrix)

In [40]:
# Preview the first rows of the attribute table.
attribute_list.head()

Unnamed: 0,Attributes
0,Hartnup disease
1,Hordeolum
2,Rh isoimmunization
3,Diffuse idiopathic skeletal hyperostosis
4,Contagious pustular dermatitis


In [41]:
# (number of attributes, number of columns).
attribute_list.shape

(3679, 1)

### Save Attribute List

In [42]:
# Stamp the output with the current year and month, e.g. "2017_07".
month_stamp = datetime.date.today().strftime('%Y_%m')
# The table is written gzip-compressed, so the file carries a ".gz" suffix.
# A gzip stream is not a zip archive; a ".zip" suffix misleads any tool (and
# pandas' own compression inference on read) that goes by the file name.
filename = '~/./Documents/Harmonizome/Jensen/Disease/Output/jensen_disease_attribute_list_%s.tsv.gz' % month_stamp
# The positional row index carries no information, so it is not written.
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [43]:
# Build the long-format edge table from the binary matrix and the gene table.
# The preview shows 'Attribute', 'Gene', 'GeneID' and 'Weight' columns, and
# includes rows whose Weight is 0.0.
gene_attribute_edge_list = uf.createGeneAttributeEdgeList(binary_matrix, gene_list)

Progeres: 100%  3679 Out of 3679   

In [44]:
# Preview the first rows of the edge table.
gene_attribute_edge_list.head()

Unnamed: 0,Attribute,Gene,GeneID,Weight
0,Hartnup disease,FZD3,7976,0.0
1,Hartnup disease,INTS11,54973,0.0
2,Hartnup disease,CTDP1,9150,0.0
3,Hartnup disease,PHKG1,5260,0.0
4,Hartnup disease,CCDC155,147872,0.0


In [45]:
# (rows, columns). The recorded row count, 48375171, equals 13149 genes x
# 3679 attributes, i.e. one row for every gene-attribute combination.
gene_attribute_edge_list.shape

(48375171, 4)

### Get Number of (Statistically Relevant) Gene-Attribute Associations

In [46]:
# Restrict the edge table to rows with a non-zero Weight and report the
# resulting (rows, columns); the row count is the number of associations.
has_weight = gene_attribute_edge_list['Weight'] != 0
gene_attribute_edge_list[has_weight].shape

(52079, 4)

### Save Gene-Attribute Edge List

In [47]:
# Stamp the output with the current year and month, e.g. "2017_07".
month_stamp = datetime.date.today().strftime('%Y_%m')
# The table is written gzip-compressed, so the file carries a ".gz" suffix.
# A gzip stream is not a zip archive; a ".zip" suffix misleads any tool (and
# pandas' own compression inference on read) that goes by the file name.
filename = '~/./Documents/Harmonizome/Jensen/Disease/Output/jensen_disease_gene_attribute_edge_list_%s.tsv.gz' % month_stamp
# The full table is written, zero-weight rows included; the positional row
# index carries no information, so it is not written.
gene_attribute_edge_list.to_csv(filename, sep='\t', index=False, compression='gzip')