# CTD (Comparative Toxicogenomics Database): Gene-Chemical Interactions

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Downloaded: 3-10-2017 <br/>
Data Source: http://ctdbase.org/

In [1]:
import sys, datetime
import importlib
import os

import numpy as np
import pandas as pd

import goenrich
import my_functions as mf
%matplotlib inline

In [2]:
# Re-import my_functions so that edits made to the module on disk are picked up
# without restarting the kernel.
importlib.reload(mf)

<module 'my_functions' from '/Users/moshesilverstein/Documents/Harmonizome/CTD/my_functions.py'>

# Load Data

In [3]:
# Load the CTD chemical-gene interaction table (tab-separated).
# skiprows=27 skips the lines ahead of the column header (presumably the file's
# comment preamble - confirm if the download format changes). The header line
# keeps its leading "# ", hence the column name "# ChemicalName", and the bare
# "#" line that follows it is read in as data row 0.
df = pd.read_csv('Input/CTD_chem_gene_ixns.tsv', sep='\t', skiprows=27)

In [4]:
# Preview the raw table; row 0 is the leftover "#" line, not an interaction.
df.head()

Unnamed: 0,# ChemicalName,ChemicalID,CasRN,GeneSymbol,GeneID,GeneForms,Organism,OrganismID,Interaction,InteractionActions,PubMedIDs
0,#,,,,,,,,,,
1,10074-G5,C534883,,MAX,4149.0,protein,,,10074-G5 affects the folding of and results in...,affects^binding|affects^folding|decreases^acti...,26474287.0
2,10074-G5,C534883,,MAX,4149.0,protein,,,10074-G5 inhibits the reaction [MYC protein bi...,affects^binding|decreases^reaction,26474287.0
3,10074-G5,C534883,,MYC,4609.0,protein,Homo sapiens,9606.0,10074-G5 analog results in decreased expressio...,decreases^expression,26036281.0
4,10074-G5,C534883,,MYC,4609.0,protein,Homo sapiens,9606.0,10074-G5 results in decreased activity of MYC ...,decreases^activity,25716159.0


In [5]:
# (rows, columns) of the raw table, before any filtering.
df.shape

(1415701, 11)

# Get Relevant Data

In [6]:
# Keep only the gene symbol and chemical name columns, as an independent copy
# so later edits do not touch the full table.
keep_columns = ['GeneSymbol', '# ChemicalName']
df = df.loc[:, keep_columns].copy()

In [7]:
# Remove the row labelled 0: it is the bare "#" line that followed the header
# in the source file, not a real gene-chemical pair.
df = df.drop(index=0)

In [8]:
# Collapse repeated (gene, chemical) pairs down to a single row each.
df = df.drop_duplicates()

In [9]:
# Preview the de-duplicated gene / chemical pairs.
df.head()

Unnamed: 0,GeneSymbol,# ChemicalName
1,MAX,10074-G5
3,MYC,10074-G5
8,KCNQ1,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone"
9,KCNQ2,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone"
11,KCNQ3,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone"


# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [10]:
# Move the gene symbols into the index ahead of the symbol-mapping step.
df = df.set_index('GeneSymbol')

In [11]:
# The return value is not captured: the helper is relied on to update df in
# place (the row count is lower after this call).
# NOTE(review): confirm the in-place contract and which rows are removed
# against my_functions.mapgenesymbols.
mf.mapgenesymbols(df)

Progeres: 100%  846574 Out of 846574   

In [12]:
# Rows remaining after the symbol-mapping step.
df.shape

(781505, 1)

# Drop Duplicates

In [13]:
# Turn the gene symbol index back into an ordinary column so that duplicate
# detection considers both the gene and the chemical.
df = df.reset_index()

In [14]:
# Symbol mapping can leave several rows with the same (gene, chemical) pair;
# keep one row per pair.
df = df.drop_duplicates()

In [15]:
# Number of unique (gene, chemical) pairs left.
df.shape

(779022, 2)

# Create Binary Matrix

In [16]:
# Build the gene-by-chemical matrix from the (gene, chemical) pairs.
# Presumably a cell is 1 when the pair is present in df and 0 otherwise -
# confirm against my_functions.createBinaryMatix.
binary_matrix = mf.createBinaryMatix(df)

Progeres: 100%  21810 Out of 21810   

In [17]:
# Preview: rows are gene symbols, columns are chemical names.
binary_matrix.head()

Unnamed: 0,Maleic Anhydrides,Hydroxyprogesterones,HS 1200,ginsenoside Rd,geranylgeraniol,"6-methyl-1,3,8-trichlorodibenzofuran",hydroxyoctadecadienoic acid,SEA 0400,Y 27632,3-methyl-N-phenyl-N-(3-(piperidin-1-yl)propyl)benzofuran-2-carboxamide,...,trisialoganglioside GT1,"quercetin 3-acetyl-7,3',4'-trisulfate",2-amino-3-hydroxy-2-hydroxymethylpropyl 3-acetylbetulinoate,vaninolol,tetralol,"2,5-diaziridinyl-3-(hydroxymethyl)-6-methyl-1,4-benzoquinone",S-Adenosylhomocysteine,palonosetron,"2,3-dimethylsuccinic acid",Amlodipine
OR10V1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZAR1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ARL3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SNORD100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MYLK-AS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# (number of genes, number of chemicals).
binary_matrix.shape

(21810, 12020)

# Save Binary Matrix

In [19]:
# Tag the output file with the current year and month (YYYY_MM).
date_tag = datetime.date.today().strftime('%Y_%m')
# The file is written gzip-compressed, so it carries a .gz suffix: pandas and
# most tools infer the codec from the extension, and a .zip name on a gzip
# payload fails to open.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_chemical_process_binary_matrix_%s.tsv.gz' % date_tag
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [20]:
# Output directory for the gene set library, resolved from the current user's
# home directory instead of one machine's absolute path.
# Trailing slash retained - presumably the my_functions helpers use this value
# as a prefix; confirm before changing.
path = os.path.expanduser('~/Documents/Harmonizome/CTD/Output/')

In [21]:
# Base name handed to the gene set library writer.
name = 'ctd_chemical_gene_set'

In [22]:
# Called for its side effect; nothing is returned into the notebook.
# NOTE(review): presumably writes one gene set per chemical (matrix column)
# under `path` using `name` - confirm file naming in my_functions.
mf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  12020 Out of 12020   

# Create Attribute Library

In [23]:
# Output directory for the attribute library, resolved from the current user's
# home directory instead of one machine's absolute path.
# Trailing slash retained - presumably the my_functions helpers use this value
# as a prefix; confirm before changing.
path = os.path.expanduser('~/Documents/Harmonizome/CTD/Output/')

In [24]:
# Base name handed to the attribute set library writer.
name = 'ctd_chemical_attribute_set'

In [25]:
# Called for its side effect; nothing is returned into the notebook.
# NOTE(review): presumably writes one attribute set per gene (matrix row)
# under `path` using `name` - confirm file naming in my_functions.
mf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  21810 Out of 21810   

# Create Gene Similarity Matrix

In [26]:
# Gene-by-gene similarity computed from the rows of the binary matrix, using
# the 'jaccard' metric option of the helper.
gene_similarity_matix = mf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [27]:
# Preview: genes on both axes, 1.0 on the diagonal.
gene_similarity_matix.head()

Unnamed: 0,OR10V1,ZAR1,ARL3,SNORD100,MYLK-AS1,CD99L2,MTHFD2,SEPT7P2,RBM20,MIR302F,...,PRRG4,FAM213B,SLCO2A1,OR2L8,SP7,CD6,ZNF395,ARPC4,CELSR3,TXNIP
OR10V1,1.0,0.076923,0.043478,0.0,0.0,0.057692,0.028037,0.0,0.045455,0.0,...,0.041667,0.083333,0.041096,0.2,0.015625,0.034483,0.076923,0.021277,0.047619,0.02139
ZAR1,0.076923,1.0,0.058824,0.0,0.0,0.089286,0.045045,0.0,0.074074,0.0,...,0.098039,0.066667,0.037975,0.090909,0.028986,0.028571,0.116279,0.058824,0.020408,0.036842
ARL3,0.043478,0.058824,1.0,0.0,0.022727,0.202532,0.162791,0.06383,0.105263,0.0,...,0.232877,0.064516,0.171717,0.022222,0.071429,0.09375,0.205882,0.189189,0.135135,0.115942
SNORD100,0.0,0.0,0.0,1.0,0.0,0.019608,0.009434,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022727,0.025,0.005348
MYLK-AS1,0.0,0.0,0.022727,0.0,1.0,0.019608,0.009434,0.0,0.052632,0.0,...,0.021739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005348


# Save Gene Similarity Matrix

In [28]:
# Tag the output file with the current year and month (YYYY_MM).
date_tag = datetime.date.today().strftime('%Y_%m')
# The file is written gzip-compressed, so it carries a .gz suffix: pandas and
# most tools infer the codec from the extension, and a .zip name on a gzip
# payload fails to open.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_chemical_gene_similarity_matix_%s.tsv.gz' % date_tag
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [29]:
# Same helper as for genes, applied to the transposed matrix so that the
# chemicals (attributes) are the items being compared.
attribute_similarity_matix = mf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [30]:
# Preview: chemicals on both axes, 1.0 on the diagonal.
attribute_similarity_matix.head()

Unnamed: 0,Maleic Anhydrides,Hydroxyprogesterones,HS 1200,ginsenoside Rd,geranylgeraniol,"6-methyl-1,3,8-trichlorodibenzofuran",hydroxyoctadecadienoic acid,SEA 0400,Y 27632,3-methyl-N-phenyl-N-(3-(piperidin-1-yl)propyl)benzofuran-2-carboxamide,...,trisialoganglioside GT1,"quercetin 3-acetyl-7,3',4'-trisulfate",2-amino-3-hydroxy-2-hydroxymethylpropyl 3-acetylbetulinoate,vaninolol,tetralol,"2,5-diaziridinyl-3-(hydroxymethyl)-6-methyl-1,4-benzoquinone",S-Adenosylhomocysteine,palonosetron,"2,3-dimethylsuccinic acid",Amlodipine
Maleic Anhydrides,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0
Hydroxyprogesterones,0.0,1.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HS 1200,0.0,0.0,1.0,0.088235,0.0,0.034483,0.0,0.0,0.045872,0.0,...,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.111111
ginsenoside Rd,0.0,0.0,0.088235,1.0,0.0,0.035714,0.045455,0.0,0.066038,0.0,...,0.0,0.0,0.05,0.0,0.0,0.0,0.027027,0.0,0.0,0.067568
geranylgeraniol,0.0,0.052632,0.0,0.0,1.0,0.0,0.0,0.0,0.036697,0.0,...,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.012821


# Save Attribute Similarity Matrix

In [31]:
# Tag the output file with the current year and month (YYYY_MM).
date_tag = datetime.date.today().strftime('%Y_%m')
# The file is written gzip-compressed, so it carries a .gz suffix: pandas and
# most tools infer the codec from the extension, and a .zip name on a gzip
# payload fails to open.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_chemical_attribute_similarity_matix_%s.tsv.gz' % date_tag
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [32]:
# Table of the genes in the binary matrix; the result has the columns
# GeneSym and GeneID.
# NOTE(review): where the GeneID values are looked up is decided inside
# my_functions.createGeneList.
gene_list = mf.createGeneList(binary_matrix)

Progeres: 100%  21810 Out of 21810   

In [33]:
# Preview the gene symbol / GeneID table.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,OR10V1,390201.0
1,ZAR1,326340.0
2,ARL3,403.0
3,SNORD100,594838.0
4,MYLK-AS1,100507000.0


In [34]:
# One row per gene; should match the row count of binary_matrix.
gene_list.shape

(21810, 2)

# Save Gene List

In [36]:
# Tag the output file with the current year and month (YYYY_MM).
date_tag = datetime.date.today().strftime('%Y_%m')
# The file is written gzip-compressed, so it carries a .gz suffix: pandas and
# most tools infer the codec from the extension, and a .zip name on a gzip
# payload fails to open.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_chemical_gene_list_%s.tsv.gz' % date_tag
# index=False: the positional row index carries no information.
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [37]:
# Table of the chemicals (columns of the binary matrix); the result has a
# single column named Attributes.
attribute_list = mf.createAttributeList(binary_matrix)

In [38]:
# Preview the chemical names.
attribute_list.head()

Unnamed: 0,Attributes
0,Maleic Anhydrides
1,Hydroxyprogesterones
2,HS 1200
3,ginsenoside Rd
4,geranylgeraniol


In [39]:
# One row per chemical; should match the column count of binary_matrix.
attribute_list.shape

(12020, 1)

# Save Attribute List

In [40]:
# Tag the output file with the current year and month (YYYY_MM).
date_tag = datetime.date.today().strftime('%Y_%m')
# The file is written gzip-compressed, so it carries a .gz suffix: pandas and
# most tools infer the codec from the extension, and a .zip name on a gzip
# payload fails to open.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_chemical_attribute_list_%s.tsv.gz' % date_tag
# index=False: the positional row index carries no information.
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [41]:
# Output directory for the edge list, resolved from the current user's home
# directory instead of one machine's absolute path.
# Trailing slash retained - presumably the my_functions helpers use this value
# as a prefix; confirm before changing.
path = os.path.expanduser('~/Documents/Harmonizome/CTD/Output/')

In [42]:
# Base name handed to the gene-attribute edge list writer.
name = 'ctd_chemical_gene_attribute_edge_list'

In [43]:
# Called for its side effect; it also prints the number of gene-attribute
# associations it found.
# NOTE(review): presumably writes one edge per non-zero matrix cell under
# `path` using `name`, with gene_list supplying the gene IDs - confirm in
# my_functions.
mf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  12020 Out of 12020   

 The number of statisticaly relevent gene-attribute associations is: 779022
