# CTD (Comparative Toxicogenomics Database): Gene-Disease Interactions

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Downloaded: 3-10-2017 <br/>
Data Source: http://ctdbase.org/

In [1]:
import datetime
import importlib
import os
import sys

import goenrich
import numpy as np
import pandas as pd

import untility_functions as uf
%matplotlib inline

In [25]:
# Re-import the local helper module so that edits made to untility_functions.py are
# picked up without restarting the kernel. This is a development convenience; on a
# fresh kernel the import at the top of the notebook already loads the current code.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/CTD/untility_functions.py'>

# Load Data

In [3]:
# Load the CTD gene-disease association table (tab-separated). The first 27 lines of
# the download are skipped so that the "# GeneSymbol ..." line becomes the header.
# low_memory=False makes pandas infer each column's dtype from the whole file instead
# of chunk by chunk. NOTE(review): the truncated warning saved in this cell's output
# looks like pandas' mixed-type DtypeWarning, which this option is meant to avoid.
df = pd.read_csv('Input/CTD_genes_diseases.tsv', sep='\t', skiprows=27, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df.head()

Unnamed: 0,# GeneSymbol,GeneID,DiseaseName,DiseaseID,DirectEvidence,InferenceChemicalName,InferenceScore,OmimIDs,PubMedIDs
0,#,,,,,,,,
1,11-BETA-HSD3,100174880.0,"Abnormalities, Drug-Induced",MESH:D000014,,Endocrine Disruptors,5.16,,22659286.0
2,11-BETA-HSD3,100174880.0,"Anemia, Hemolytic",MESH:D000743,,"Water Pollutants, Chemical",4.51,,22425172.0
3,11-BETA-HSD3,100174880.0,Asthenozoospermia,MESH:D053627,,"Water Pollutants, Chemical",5.21,,25179371.0
4,11-BETA-HSD3,100174880.0,Birth Weight,MESH:D001724,,Endocrine Disruptors,5.89,,27152464.0


In [5]:
df.shape

(51373878, 9)

# Get Relevant Data

In [6]:
# Only the gene symbol, the disease name and the inference score are used from here on.
relevant_columns = ['# GeneSymbol', 'DiseaseName', 'InferenceScore']
df = df[relevant_columns]

In [7]:
# Row 0 is the stray "#" line that follows the header in the download (see the
# preview of the raw table above); it carries no data, so remove it.
# errors='ignore' plus rebinding (instead of inplace=True) keeps the cell re-runnable:
# once label 0 is gone, a plain drop(0) would raise KeyError on a second execution.
df = df.drop(0, axis=0, errors='ignore')

In [8]:
# Discard rows that are identical in every remaining column, keeping the first
# occurrence of each (the drop_duplicates default).
df = df.drop_duplicates()

In [9]:
df.head()

Unnamed: 0,# GeneSymbol,DiseaseName,InferenceScore
1,11-BETA-HSD3,"Abnormalities, Drug-Induced",5.16
2,11-BETA-HSD3,"Anemia, Hemolytic",4.51
3,11-BETA-HSD3,Asthenozoospermia,5.21
4,11-BETA-HSD3,Birth Weight,5.89
5,11-BETA-HSD3,Breast Neoplasms,8.68


In [10]:
df.shape

(20161840, 3)

# Create Matrix

In [11]:
# Build a dense gene x disease matrix holding the InferenceScore of every
# gene-disease pair found in df; pairs that never occur stay at 0.0.
genes = df['# GeneSymbol'].unique().tolist()
diseases = df['DiseaseName'].unique().tolist()
tissues = diseases  # legacy (misleading) name, kept in case other code still refers to it

# Allocate float64 zeros directly instead of creating an all-NaN object frame and
# replacing the NaN afterwards.
matrix = pd.DataFrame(0.0, index=genes, columns=diseases)

# groupby hands over each disease's rows in a single pass, so df is not re-scanned
# once per column and the removed `.ix` indexer is no longer needed. df itself is
# left unmodified (no set_index), which keeps this cell re-runnable.
n_diseases = len(diseases)
for i, (disease, rows) in enumerate(df.groupby('DiseaseName', sort=False), start=1):

    sys.stdout.write("Progress: %d%%   \r" % ((i / n_diseases) * 100))
    sys.stdout.flush()

    # The same gene can be listed several times for one disease with different scores.
    # Keep the last one listed so the assignment below is unambiguous.
    # NOTE(review): this presumably matches what the earlier label-based assignment
    # with repeated gene labels did; confirm "last wins" is the intended way to
    # combine multiple scores. Missing scores (NaN) are written as-is, as before.
    rows = rows.drop_duplicates(subset='# GeneSymbol', keep='last')

    # Array-based assignment works for any group size, so a disease linked to a
    # single gene is filled in too (the old scalar type check skipped those).
    matrix.loc[rows['# GeneSymbol'].values, disease] = rows['InferenceScore'].values

Progress: 100%   

In [12]:
matrix.head()

Unnamed: 0,"Abnormalities, Drug-Induced","Anemia, Hemolytic",Asthenozoospermia,Birth Weight,Breast Neoplasms,"Cell Transformation, Neoplastic",Chromosome Aberrations,Death,Disorders of Sex Development,Fetal Death,...,Holoprosencephaly 5,"Heterotaxy, visceral, X-linked",Acro-Osteolysis,Mandibuloacral dysplasia with type B lipodystrophy,"NEPHROLITHIASIS, URIC ACID, SUSCEPTIBILITY TO",Brittle cornea syndrome 1,RETINITIS PIGMENTOSA 58,"Spinocerebellar ataxia, autosomal recessive 5",Seborrhea-Like Dermatitis with Psoriasiform Elements,"Mental Retardation, X-Linked 45"
11-BETA-HSD3,5.16,4.51,5.21,5.89,8.68,4.28,4.76,4.96,14.04,5.73,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
128UP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130004C03,0.0,4.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14-3-3ZETA,0.0,0.0,0.0,0.0,0.0,3.86,0.0,4.86,0.0,4.57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
143B1,0.0,0.0,0.0,0.0,3.3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
matrix.shape

(42551, 6384)

# Drop Genes That Have a Non-Zero Inference Score in Fewer Than 5% of the Diseases

In [14]:
# Mark zero scores as missing so that the dropna call in the next cell can count,
# per gene, how many diseases carry a score.
matrix = matrix.replace(to_replace=0, value=np.nan)

In [15]:
# Keep a gene (row) only if at least 5% of the diseases have a non-missing score.
# dropna compares a whole-number count against thresh, so rounding the cutoff up to
# the next integer selects exactly the same rows as the fractional value would.
min_scored_diseases = int(np.ceil(0.05 * matrix.shape[1]))
matrix = matrix.dropna(axis=0, thresh=min_scored_diseases)

In [16]:
# Turn the missing-value markers back into zero scores.
matrix = matrix.fillna(0)

In [17]:
matrix.shape

(20202, 6384)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [18]:
# Map the row labels of `matrix` to current approved gene symbols with the project
# helper. The return value is discarded, so the helper presumably modifies `matrix`
# in place — TODO confirm in untility_functions.py.
# NOTE(review): later progress output reports fewer rows than the 20202 processed
# here, which suggests unmapped symbols are dropped; verify.
uf.mapgenesymbols(matrix)

Progeres: 100%  20202 Out of 20202   

# Save Unfiltered Sample Matrix To File

In [19]:
# Write the un-normalized gene x disease matrix as a gzip-compressed TSV, tagged with
# the current year and month (YYYY_MM).
# The file is named ".tsv.gz" because the payload is gzip; the previous ".tsv.zip"
# name mislabeled the format for zip tools and for extension-based compression
# inference when the file is read back.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_disease_matrix_unfilltered_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
matrix.to_csv(filename, sep='\t', compression='gzip')

# Normalize Matrix (Quantile Normalize the matrix for the columns)

In [20]:
# Work on an independent (deep) copy so the unnormalized `matrix` stays available.
normalized_matrix = matrix.copy(deep=True)

In [21]:
# Quantile-normalize the copy with the project helper and keep the returned frame.
# Per this section's heading the normalization is applied to the columns; the
# algorithm itself lives in untility_functions.py and is not visible here.
normalized_matrix = uf.quantileNormalize(normalized_matrix)

Step 2/2 progress: 100%  6384 Out of 6384   

In [22]:
normalized_matrix.head()

Unnamed: 0_level_0,"Abnormalities, Drug-Induced","Anemia, Hemolytic",Asthenozoospermia,Birth Weight,Breast Neoplasms,"Cell Transformation, Neoplastic",Chromosome Aberrations,Death,Disorders of Sex Development,Fetal Death,...,Holoprosencephaly 5,"Heterotaxy, visceral, X-linked",Acro-Osteolysis,Mandibuloacral dysplasia with type B lipodystrophy,"NEPHROLITHIASIS, URIC ACID, SUSCEPTIBILITY TO",Brittle cornea syndrome 1,RETINITIS PIGMENTOSA 58,"Spinocerebellar ataxia, autosomal recessive 5",Seborrhea-Like Dermatitis with Psoriasiform Elements,"Mental Retardation, X-Linked 45"
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.610807,0.858689,0.808271,2.327359,2.244685,1.622046,2.349109,1.324767,0.355432,0.818446,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
A1CF,0.966164,0.995755,1.618219,3.811461,1.292176,1.330196,0.569992,0.957503,0.0005,0.610642,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
A2M,4.530959,2.757498,1.291584,3.588139,3.011992,4.228456,3.15612,5.437132,1.564383,4.621808,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
A2M-AS1,0.038965,0.210948,0.0005,0.342278,0.0005,0.032823,0.0005,0.0005,0.0005,0.079958,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
A3GALT2,0.224629,0.442168,0.165356,0.189693,0.307874,0.138429,0.316065,1.792705,0.0005,0.408443,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005


# Normalize Matrix (z-score the rows)

In [26]:
# Z-score normalized_matrix along its rows via the project helper ('row' argument).
# The return value is not captured, yet the preview in the next cell shows values
# that differ from the quantile-normalized ones, so the helper evidently updates the
# frame in place.
uf.zscore(normalized_matrix, 'row')

Progress: 100%  17859 Out of 17859   

In [27]:
normalized_matrix.head()

Unnamed: 0_level_0,"Abnormalities, Drug-Induced","Anemia, Hemolytic",Asthenozoospermia,Birth Weight,Breast Neoplasms,"Cell Transformation, Neoplastic",Chromosome Aberrations,Death,Disorders of Sex Development,Fetal Death,...,Holoprosencephaly 5,"Heterotaxy, visceral, X-linked",Acro-Osteolysis,Mandibuloacral dysplasia with type B lipodystrophy,"NEPHROLITHIASIS, URIC ACID, SUSCEPTIBILITY TO",Brittle cornea syndrome 1,RETINITIS PIGMENTOSA 58,"Spinocerebellar ataxia, autosomal recessive 5",Seborrhea-Like Dermatitis with Psoriasiform Elements,"Mental Retardation, X-Linked 45"
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.356365,0.631936,0.575886,2.26466,2.172751,1.480561,2.288839,1.150075,0.072464,0.587198,...,-0.322115,-0.322115,-0.322115,-0.322115,-0.322115,-0.322115,-0.322115,-0.322115,-0.322115,-0.322115
A1CF,1.387571,1.439235,2.52601,6.355246,1.956763,2.023143,0.695885,1.37245,-0.298406,0.766857,...,-0.298406,-0.298406,-0.298406,-0.298406,-0.298406,-0.298406,-0.298406,-0.298406,-0.298406,-0.298406
A2M,2.021736,1.052069,0.250557,1.506235,1.191218,1.856338,1.270022,2.517201,0.399714,2.07141,...,-0.455364,-0.455364,-0.455364,-0.455364,-0.455364,-0.455364,-0.455364,-0.455364,-0.455364,-0.455364
A2M-AS1,-0.04714,0.381823,-0.14308,0.709388,-0.14308,-0.06246,-0.14308,-0.14308,-0.14308,0.055105,...,-0.14308,-0.14308,-0.14308,-0.14308,-0.14308,-0.14308,-0.14308,-0.14308,-0.14308,-0.14308
A3GALT2,0.163066,0.52335,0.064899,0.105206,0.300936,0.020304,0.314501,2.760083,-0.208132,0.467495,...,-0.208132,-0.208132,-0.208132,-0.208132,-0.208132,-0.208132,-0.208132,-0.208132,-0.208132,-0.208132


# Merge Duplicate Genes

In [29]:
# Collapse rows that share a gene symbol into one row with the project helper and
# keep the returned frame. The 'row' and 'mean' arguments presumably select row-wise
# merging by averaging — TODO confirm in untility_functions.py.
normalized_matrix = uf.merge(normalized_matrix, 'row', 'mean')

In [30]:
normalized_matrix.shape

(17487, 6384)

# Save Filtered Sample Matrix To File

In [31]:
# Write the normalized, de-duplicated matrix as a gzip-compressed TSV, tagged with
# the current year and month (YYYY_MM).
# The file is named ".tsv.gz" because the payload is gzip; the previous ".tsv.zip"
# name mislabeled the format for zip tools and for extension-based compression
# inference when the file is read back.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_disease_matrix_filltered_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
normalized_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Tertiary Matrix

In [32]:
# Derive the discretized ("tertiary") matrix from the normalized scores with the
# project helper. The preview that follows shows entries of -1.0, 0.0 and 1.0; the
# rule that assigns them is defined in untility_functions.py and not visible here.
tertiary_matrix = uf.createTertiaryMarix(normalized_matrix)

Progeres: 100%  6384 Out of 6384   

In [33]:
tertiary_matrix.head()

Unnamed: 0_level_0,"Abnormalities, Drug-Induced","Anemia, Hemolytic",Asthenozoospermia,Birth Weight,Breast Neoplasms,"Cell Transformation, Neoplastic",Chromosome Aberrations,Death,Disorders of Sex Development,Fetal Death,...,Holoprosencephaly 5,"Heterotaxy, visceral, X-linked",Acro-Osteolysis,Mandibuloacral dysplasia with type B lipodystrophy,"NEPHROLITHIASIS, URIC ACID, SUSCEPTIBILITY TO",Brittle cornea syndrome 1,RETINITIS PIGMENTOSA 58,"Spinocerebellar ataxia, autosomal recessive 5",Seborrhea-Like Dermatitis with Psoriasiform Elements,"Mental Retardation, X-Linked 45"
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,-1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
A2M-AS1,-1.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
A3GALT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Tertiary Matrix

In [34]:
# Write the tertiary matrix as a gzip-compressed TSV, tagged with the current year
# and month (YYYY_MM).
# The file is named ".tsv.gz" because the payload is gzip; the previous ".tsv.zip"
# name mislabeled the format for zip tools and for extension-based compression
# inference when the file is read back.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_disease_tertiary_matrix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
tertiary_matrix.to_csv(filename, sep='\t', compression='gzip')

### Path to output files

In [35]:
# Output directory passed to the library-writing helpers below. It is resolved from
# the current user's home directory rather than one hard-coded account, so the
# notebook also runs on other machines; for the original author it expands to the
# same location as before. The trailing separator is kept because the helpers
# presumably build file names as path + name — TODO confirm.
path = os.path.expanduser('~/Documents/Harmonizome/CTD/Output/')

# Create Up Gene Set Library

In [36]:
name = 'ctd_disease_gene_up_set'

In [37]:
# Build the "up" gene-set library from the tertiary matrix with the project helper,
# passing the output directory and base name set in the cells above. Nothing is
# returned to the notebook, so the helper presumably writes the library to disk;
# which matrix entries count as "up" is decided inside the helper — TODO confirm.
uf.createUpGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  6384 Out of 6384   

# Create Down Gene Set Library

In [38]:
name = 'ctd_disease_gene_down_set'

In [39]:
# Build the "down" gene-set library from the tertiary matrix with the project helper,
# passing the output directory and base name set in the cells above. Nothing is
# returned to the notebook, so the helper presumably writes the library to disk;
# which matrix entries count as "down" is decided inside the helper — TODO confirm.
uf.createDownGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  6384 Out of 6384   

# Create Up Attribute Library

In [40]:
name = 'ctd_disease_attribute_up_set'

In [41]:
# Build the "up" attribute-set library from the tertiary matrix with the project
# helper. The progress output counts 17487 items, i.e. one per gene row, so this
# library is presumably keyed by gene rather than by disease — TODO confirm.
uf.createUpAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  17487 Out of 17487   

# Create Down Attribute Library

In [42]:
name = 'ctd_disease_attribute_down_set'

In [43]:
# Build the "down" attribute-set library from the tertiary matrix with the project
# helper. The progress output counts 17487 items, i.e. one per gene row, so this
# library is presumably keyed by gene rather than by disease — TODO confirm.
uf.createDownAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  17487 Out of 17487   

# Create Gene Similarity Matrix

In [44]:
# Gene-by-gene similarity computed from the normalized matrix by the project helper,
# with 'cosine' passed as the metric. The rows of the input are genes, and the
# preview that follows is labelled by gene symbol on both axes.
gene_similarity_matix = uf.createSimilarityMatrix(normalized_matrix, 'cosine')

In [45]:
gene_similarity_matix.head()

index,A1BG,A1CF,A2M,A2M-AS1,A3GALT2,A4GALT,AAAS,AACS,AACSP1,AADAC,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.0,0.303973,0.405447,0.140594,0.132618,0.227267,0.137874,0.299021,0.141617,0.186324,...,0.359023,0.325918,0.171836,0.363655,0.237697,0.355295,0.267483,0.392003,0.12671,0.273335
A1CF,0.303973,1.0,0.295564,0.283639,0.154755,0.213928,0.309417,0.314987,0.238739,0.150622,...,0.437671,0.319257,0.193353,0.462223,0.369293,0.245345,0.394386,0.272106,0.175142,0.604391
A2M,0.405447,0.295564,1.0,0.076626,0.149243,0.256886,0.178016,0.361912,0.090567,0.378523,...,0.375282,0.448516,0.230601,0.288869,0.285571,0.144676,0.275111,0.375308,0.192809,0.316976
A2M-AS1,0.140594,0.283639,0.076626,1.0,0.006044,-0.001937,0.22401,0.1513,0.750167,0.021788,...,0.013759,0.085589,0.008365,0.504058,0.32775,0.446301,0.318665,0.121786,0.28656,0.334152
A3GALT2,0.132618,0.154755,0.149243,0.006044,1.0,0.225919,0.140613,0.141722,0.008183,0.175945,...,0.191548,0.216885,0.174059,0.180315,0.235605,0.052169,0.317074,0.224724,0.137546,0.310726


# Save Gene Similarity Matrix

In [46]:
# Write the gene similarity matrix as a gzip-compressed TSV, tagged with the current
# year and month (YYYY_MM).
# The file is named ".tsv.gz" because the payload is gzip; the previous ".tsv.zip"
# name mislabeled the format for zip tools and for extension-based compression
# inference when the file is read back.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_disease_gene_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [47]:
# Disease-by-disease similarity: the normalized matrix is transposed first so that
# diseases become the rows handed to the same project helper, again with 'cosine'
# passed as the metric.
attribute_similarity_matix = uf.createSimilarityMatrix(normalized_matrix.T, 'cosine')

In [48]:
attribute_similarity_matix.head()

Unnamed: 0,"Abnormalities, Drug-Induced","Anemia, Hemolytic",Asthenozoospermia,Birth Weight,Breast Neoplasms,"Cell Transformation, Neoplastic",Chromosome Aberrations,Death,Disorders of Sex Development,Fetal Death,...,Holoprosencephaly 5,"Heterotaxy, visceral, X-linked",Acro-Osteolysis,Mandibuloacral dysplasia with type B lipodystrophy,"NEPHROLITHIASIS, URIC ACID, SUSCEPTIBILITY TO",Brittle cornea syndrome 1,RETINITIS PIGMENTOSA 58,"Spinocerebellar ataxia, autosomal recessive 5",Seborrhea-Like Dermatitis with Psoriasiform Elements,"Mental Retardation, X-Linked 45"
"Abnormalities, Drug-Induced",1.0,0.745876,0.693462,0.661546,0.800474,0.746678,0.685537,0.682753,0.716713,0.783375,...,-0.850061,-0.850061,-0.850061,-0.850061,-0.850061,-0.850061,-0.850061,-0.850061,-0.850061,-0.850061
"Anemia, Hemolytic",0.745876,1.0,0.618697,0.586727,0.726738,0.715302,0.638423,0.579657,0.555692,0.68331,...,-0.839089,-0.839089,-0.839089,-0.839089,-0.839089,-0.839089,-0.839089,-0.839089,-0.839089,-0.839089
Asthenozoospermia,0.693462,0.618697,1.0,0.670824,0.712306,0.719125,0.685338,0.704195,0.611774,0.610687,...,-0.72145,-0.72145,-0.72145,-0.72145,-0.72145,-0.72145,-0.72145,-0.72145,-0.72145,-0.72145
Birth Weight,0.661546,0.586727,0.670824,1.0,0.714458,0.706858,0.697926,0.732583,0.55415,0.630613,...,-0.712413,-0.712413,-0.712413,-0.712413,-0.712413,-0.712413,-0.712413,-0.712413,-0.712413,-0.712413
Breast Neoplasms,0.800474,0.726738,0.712306,0.714458,1.0,0.832835,0.794939,0.705437,0.675439,0.726861,...,-0.855785,-0.855785,-0.855785,-0.855785,-0.855785,-0.855785,-0.855785,-0.855785,-0.855785,-0.855785


# Save Attribute Similarity Matrix

In [49]:
# Write the attribute (disease) similarity matrix as a gzip-compressed TSV, tagged
# with the current year and month (YYYY_MM).
# The file is named ".tsv.gz" because the payload is gzip; the previous ".tsv.zip"
# name mislabeled the format for zip tools and for extension-based compression
# inference when the file is read back.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_disease_attribute_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [50]:
# Build the gene table for the genes in the normalized matrix with the project
# helper. The preview that follows shows two columns, GeneSym and GeneID; where the
# IDs are looked up is internal to the helper and not visible here.
gene_list = uf.createGeneList(normalized_matrix)

Progeres: 100%  17487 Out of 17487   

In [51]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,A1BG,1
1,A1CF,29974
2,A2M,2
3,A2M-AS1,144571
4,A3GALT2,127550


In [52]:
gene_list.shape

(17487, 2)

# Save Gene List

In [53]:
# Write the gene list (without the row index) as a gzip-compressed TSV, tagged with
# the current year and month (YYYY_MM).
# The file is named ".tsv.gz" because the payload is gzip; the previous ".tsv.zip"
# name mislabeled the format for zip tools and for extension-based compression
# inference when the file is read back.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_disease_gene_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [54]:
# Build the attribute table from the normalized matrix with the project helper. The
# preview that follows shows a single "Attributes" column holding the disease names
# that label the matrix columns.
attribute_list = uf.createAttributeList(normalized_matrix)

In [55]:
attribute_list.head()

Unnamed: 0,Attributes
0,"Abnormalities, Drug-Induced"
1,"Anemia, Hemolytic"
2,Asthenozoospermia
3,Birth Weight
4,Breast Neoplasms


In [56]:
attribute_list.shape

(6384, 1)

# Save Attribute List

In [57]:
# Write the attribute list (without the row index) as a gzip-compressed TSV, tagged
# with the current year and month (YYYY_MM).
# The file is named ".tsv.gz" because the payload is gzip; the previous ".tsv.zip"
# name mislabeled the format for zip tools and for extension-based compression
# inference when the file is read back.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_disease_attribute_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [61]:
# Output directory passed to the edge-list helper below. It is resolved from the
# current user's home directory rather than one hard-coded account, so the notebook
# also runs on other machines; for the original author it expands to the same
# location as before. The trailing separator is kept because the helper presumably
# builds file names as path + name — TODO confirm.
path = os.path.expanduser('~/Documents/Harmonizome/CTD/Output/')

In [62]:
name = 'ctd_disease_gene_attribute_edge_list'

In [63]:
# Build the gene-attribute edge list from the tertiary matrix and the gene table
# with the project helper, passing the output directory and base name set above.
# The helper prints the number of associations it found; it presumably also writes
# the list under `path` — TODO confirm in untility_functions.py.
uf.createGeneAttributeEdgeList(tertiary_matrix, gene_list, path, name)

Progeres: 100%  6384 Out of 6384   

 The number of statisticaly relevent gene-attribute associations is: 22325153
