# CTD - Comparative Toxicogenomics Database (Gene-Chemical Interactions)

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Downloaded: 3-10-2017 <br/>
Data Source: http://ctdbase.org/

In [1]:
import datetime
import importlib
import os
import sys

import goenrich
import numpy as np
import pandas as pd

import untility_functions as uf
%matplotlib inline

In [2]:
# Re-execute the already-imported helper module so that edits made to it on disk
# are picked up without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/CTD/untility_functions.py'>

# Load Data

In [3]:
# Read the tab-separated CTD chemical-gene interaction file, skipping its first 27 lines.
# NOTE(review): the preview of this frame shows the header parsed as '# ChemicalName' and
# a first data row that contains only '#', i.e. the file's comment markers leak into the
# table. Later cells select the '# ChemicalName' column by that exact name, so the column
# must not be renamed here without updating them.
df = pd.read_csv('Input/CTD_chem_gene_ixns.tsv', sep='\t', skiprows=27)

In [4]:
# Preview the first rows of the raw table to check how the columns were parsed.
df.head()

Unnamed: 0,# ChemicalName,ChemicalID,CasRN,GeneSymbol,GeneID,GeneForms,Organism,OrganismID,Interaction,InteractionActions,PubMedIDs
0,#,,,,,,,,,,
1,10074-G5,C534883,,MAX,4149.0,protein,,,10074-G5 affects the folding of and results in...,affects^binding|affects^folding|decreases^acti...,26474287.0
2,10074-G5,C534883,,MAX,4149.0,protein,,,10074-G5 inhibits the reaction [MYC protein bi...,affects^binding|decreases^reaction,26474287.0
3,10074-G5,C534883,,MYC,4609.0,protein,Homo sapiens,9606.0,10074-G5 analog results in decreased expressio...,decreases^expression,26036281.0
4,10074-G5,C534883,,MYC,4609.0,protein,Homo sapiens,9606.0,10074-G5 results in decreased activity of MYC ...,decreases^activity,25716159.0


In [5]:
# (rows, columns) of the raw table, before any filtering.
df.shape

(1415701, 11)

# Get Relevant Data

In [6]:
# Keep only interactions whose OrganismID is mouse (10090), human (9606) or rat (10116);
# every other row is discarded. The per-organism slices are stacked in exactly that
# order, so the resulting row order is mouse rows, then human rows, then rat rows.
organism_ids = (10090, 9606, 10116)
df = pd.concat([df[df['OrganismID'] == organism_id] for organism_id in organism_ids])

In [7]:
# Keep only rows whose GeneForms value is exactly 'protein' or exactly 'gene'
# (the alternative is mRNA, i.e. the chemical regulates expression).
# Protein rows come first in the result, followed by gene rows.
kept_forms = ('protein', 'gene')
df = pd.concat([df[df['GeneForms'] == gene_form] for gene_form in kept_forms])

In [8]:
# Keep only the two columns used from here on: the gene symbol and the chemical name.
# .copy() gives an independent frame for the in-place operations in the following cells.
df = df[['GeneSymbol', '# ChemicalName']].copy()

In [9]:
# Collapse repeated (gene symbol, chemical name) pairs, keeping the first occurrence of each.
df = df.drop_duplicates()

In [10]:
# Preview the de-duplicated gene/chemical pairs.
df.head()

Unnamed: 0,GeneSymbol,# ChemicalName
9,KCNQ2,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone"
64,ACHE,10-(fluoroethoxyphosphinyl)-N-(biotinamidopent...
66,ALB,10-(fluoroethoxyphosphinyl)-N-(biotinamidopent...
67,ATP5B,10-(fluoroethoxyphosphinyl)-N-(biotinamidopent...
69,BCHE,10-(fluoroethoxyphosphinyl)-N-(biotinamidopent...


# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [11]:
# Move GeneSymbol from a column into the index before calling the symbol-mapping helper.
df = df.set_index('GeneSymbol')

In [12]:
# Map the gene symbols in the index to approved symbols using the project helper.
# NOTE(review): the return value is not captured, and the row count reported by df.shape
# afterwards is lower than the number of rows shown in the progress output, so the helper
# presumably modifies `df` in place (dropping unmapped rows) — confirm in untility_functions.py.
uf.mapgenesymbols(df)

Progeres: 100%  165678 Out of 165678   

In [13]:
# (rows, columns) after symbol mapping; GeneSymbol is the index at this point, so only
# the chemical-name column is counted.
df.shape

(160501, 1)

# Drop Duplicates

In [14]:
# Turn the gene-symbol index back into an ordinary column so that duplicate detection
# in the next step compares both the gene and the chemical.
df = df.reset_index()

In [15]:
# Remove pairs that became identical once the gene symbols were mapped; the first
# occurrence of each pair is kept.
df = df.drop_duplicates()

In [16]:
# (rows, columns) after the second de-duplication; GeneSymbol is a regular column again.
df.shape

(160332, 2)

# Create Binary Matrix

In [17]:
# Build the gene-by-chemical matrix from the gene/chemical pairs with the project helper.
# The preview of the result shows gene symbols as row labels, chemical names as column
# labels and 0/1 entries (presumably 1 = the pair is present in `df` — confirm in
# untility_functions.py).
binary_matrix = uf.createBinaryMatix(df)

Progeres: 100%  15676 Out of 15676   

In [18]:
# Preview the matrix: gene symbols label the rows, chemical names label the columns.
binary_matrix.head()

Unnamed: 0,"2-cyano-3,12-dioxoolean-1,9-dien-28-oic acid","1,1,1,2-tetrafluoro-2-chloroethane",chalcone epoxide,antroquinonol,7-hydroxymethotrexate,dichlobanil,5-hydroxydopamine,iloperidone,"N,N''-1,4-butanediylbis(N'-(3-isothiocyanatophenyl))thiourea","1,2,3,4,6,7,8-heptachlorodibenzofuran",...,diethylene glycol,Trapidil,Bresol,Testosterone Propionate,"2-amino-1-methyl-6-phenylimidazo(4,5-b)pyridine",arsenite,methyl bensulfuron,vorozole,phosphatidylbutanol,1-methylpyrene
ESRRG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BHMT2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETNK1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GDF9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ICAM4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# (number of gene rows, number of chemical columns).
binary_matrix.shape

(15676, 10366)

# Save Binary Matrix

In [20]:
# Save the binary matrix as a gzip-compressed TSV stamped with the current year and
# month (YYYY_MM).
# The suffix is '.tsv.gz' so that it agrees with compression='gzip'; a '.zip' suffix would
# label a gzip stream as a zip archive and break extension-based decompression
# (for example pandas.read_csv with its default compression='infer').
# NOTE(review): anything that looks these files up by the former '.tsv.zip' name must be
# updated accordingly.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_chemical_binary_matrix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [21]:
# Output directory handed to uf.createUpGeneSetLib.
# It is resolved from the current user's home directory rather than a hard-coded
# '/Users/<account>/...' string, so the notebook is not tied to one machine account and
# points at the same folder that the save cells address as '~/./Documents/Harmonizome/CTD/Output/'.
# The trailing slash is kept because the value is passed on as a directory prefix.
path = os.path.expanduser('~/Documents/Harmonizome/CTD/Output/')

In [22]:
# Name passed to uf.createUpGeneSetLib together with `path`; presumably used as the base
# name of the file it writes — confirm in untility_functions.py.
name = 'ctd_chemical_gene_set'

In [23]:
# Generate the gene set library from the binary matrix with the project helper.
# The progress output counts as many items as the matrix has chemical columns, so the
# helper presumably emits one gene set per chemical under `path`/`name` — confirm in
# untility_functions.py.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  10366 Out of 10366   

# Create Attribute Library

In [24]:
# Output directory handed to uf.createUpAttributeSetLib.
# It is resolved from the current user's home directory rather than a hard-coded
# '/Users/<account>/...' string, so the notebook is not tied to one machine account and
# points at the same folder that the save cells address as '~/./Documents/Harmonizome/CTD/Output/'.
# The trailing slash is kept because the value is passed on as a directory prefix.
path = os.path.expanduser('~/Documents/Harmonizome/CTD/Output/')

In [25]:
# Name passed to uf.createUpAttributeSetLib together with `path`; presumably used as the
# base name of the file it writes — confirm in untility_functions.py.
name = 'ctd_chemical_attribute_set'

In [26]:
# Generate the attribute set library from the binary matrix with the project helper.
# The progress output counts as many items as the matrix has gene rows, so the helper
# presumably emits one attribute (chemical) set per gene under `path`/`name` — confirm in
# untility_functions.py.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  15676 Out of 15676   

# Create Gene Similarity Matrix

In [27]:
# Pairwise similarity between the rows (genes) of the binary matrix, requested with the
# 'jaccard' metric from the project helper. The preview of the result is a gene-by-gene
# table with 1.0 on the diagonal, i.e. values are similarities rather than distances.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [28]:
# Preview the gene-by-gene similarity table.
gene_similarity_matix.head()

Unnamed: 0,ESRRG,BHMT2,ETNK1,GDF9,ICAM4,PPP4R4,WNT4,PLAC8,GALNT15,GOLM1,...,FFAR2,KYAT3,OR4C6,TIGD2,CEACAM20,GAD2,SLFN11,PDIA6,SRPK3,NEU2
ESRRG,1.0,0.0625,0.0,0.0625,0.033333,0.071429,0.058824,0.071429,0.107143,0.035714,...,0.035714,0.064516,0.035714,0.035714,0.035714,0.020833,0.0,0.061224,0.035714,0.034483
BHMT2,0.0625,1.0,0.0,0.0,0.125,0.142857,0.0,0.142857,0.125,0.0,...,0.0,0.222222,0.166667,0.166667,0.166667,0.0,0.0,0.071429,0.0,0.0
ETNK1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GDF9,0.0625,0.0,0.0,1.0,0.0,0.142857,0.076923,0.142857,0.0,0.166667,...,0.166667,0.1,0.0,0.0,0.0,0.0,0.0,0.034483,0.166667,0.142857
ICAM4,0.033333,0.125,0.0,0.0,1.0,0.25,0.0,0.25,0.2,0.0,...,0.0,0.142857,0.333333,0.333333,0.333333,0.0,0.0,0.08,0.0,0.0


# Save Gene Similarity Matrix

In [29]:
# Save the gene similarity matrix as a gzip-compressed TSV stamped with the current year
# and month (YYYY_MM).
# The suffix is '.tsv.gz' so that it agrees with compression='gzip'; a '.zip' suffix would
# label a gzip stream as a zip archive and break extension-based decompression
# (for example pandas.read_csv with its default compression='infer').
# NOTE(review): anything that looks these files up by the former '.tsv.zip' name must be
# updated accordingly.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_chemical_gene_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [30]:
# Same helper and metric as for the gene similarity matrix, applied to the transposed
# binary matrix so that chemicals are the rows being compared. The preview of the result
# is a chemical-by-chemical table with 1.0 on the diagonal.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [31]:
# Preview the chemical-by-chemical similarity table.
attribute_similarity_matix.head()

Unnamed: 0,"2-cyano-3,12-dioxoolean-1,9-dien-28-oic acid","1,1,1,2-tetrafluoro-2-chloroethane",chalcone epoxide,antroquinonol,7-hydroxymethotrexate,dichlobanil,5-hydroxydopamine,iloperidone,"N,N''-1,4-butanediylbis(N'-(3-isothiocyanatophenyl))thiourea","1,2,3,4,6,7,8-heptachlorodibenzofuran",...,diethylene glycol,Trapidil,Bresol,Testosterone Propionate,"2-amino-1-methyl-6-phenylimidazo(4,5-b)pyridine",arsenite,methyl bensulfuron,vorozole,phosphatidylbutanol,1-methylpyrene
"2-cyano-3,12-dioxoolean-1,9-dien-28-oic acid",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.009901,0.010417,0.0,0.0,0.0,0.0
"1,1,1,2-tetrafluoro-2-chloroethane",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333
chalcone epoxide,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
antroquinonol,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.111111,0.0,...,0.0,0.0,0.0,0.0,0.06422,0.045226,0.0,0.0,0.0,0.0
7-hydroxymethotrexate,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.009804,0.0,0.0,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [32]:
# Save the attribute similarity matrix as a gzip-compressed TSV stamped with the current
# year and month (YYYY_MM).
# The suffix is '.tsv.gz' so that it agrees with compression='gzip'; a '.zip' suffix would
# label a gzip stream as a zip archive and break extension-based decompression
# (for example pandas.read_csv with its default compression='infer').
# NOTE(review): anything that looks these files up by the former '.tsv.zip' name must be
# updated accordingly.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_chemical_attribute_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [33]:
# Build the gene list for the matrix rows with the project helper. The preview of the
# result has one row per gene with the columns GeneSym and GeneID (presumably the helper
# looks up an ID for every gene symbol in the matrix index — confirm in untility_functions.py).
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  15676 Out of 15676   

In [34]:
# Preview the gene list (GeneSym, GeneID).
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,ESRRG,2104
1,BHMT2,23743
2,ETNK1,55500
3,GDF9,2661
4,ICAM4,3386


In [35]:
# (rows, columns) of the gene list; the row count can be compared with the number of
# rows of binary_matrix.
gene_list.shape

(15676, 2)

# Save Gene List

In [36]:
# Save the gene list (without the row index) as a gzip-compressed TSV stamped with the
# current year and month (YYYY_MM).
# The suffix is '.tsv.gz' so that it agrees with compression='gzip'; a '.zip' suffix would
# label a gzip stream as a zip archive and break extension-based decompression
# (for example pandas.read_csv with its default compression='infer').
# NOTE(review): anything that looks these files up by the former '.tsv.zip' name must be
# updated accordingly.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_chemical_gene_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [37]:
# Build the attribute list with the project helper. The preview of the result has a single
# 'Attributes' column holding chemical names in the same order as the leading columns of
# binary_matrix.
attribute_list = uf.createAttributeList(binary_matrix)

In [38]:
# Preview the attribute (chemical) list.
attribute_list.head()

Unnamed: 0,Attributes
0,"2-cyano-3,12-dioxoolean-1,9-dien-28-oic acid"
1,"1,1,1,2-tetrafluoro-2-chloroethane"
2,chalcone epoxide
3,antroquinonol
4,7-hydroxymethotrexate


In [39]:
# (rows, columns) of the attribute list; the row count can be compared with the number
# of columns of binary_matrix.
attribute_list.shape

(10366, 1)

# Save Attribute List

In [40]:
# Save the attribute list (without the row index) as a gzip-compressed TSV stamped with
# the current year and month (YYYY_MM).
# The suffix is '.tsv.gz' so that it agrees with compression='gzip'; a '.zip' suffix would
# label a gzip stream as a zip archive and break extension-based decompression
# (for example pandas.read_csv with its default compression='infer').
# NOTE(review): anything that looks these files up by the former '.tsv.zip' name must be
# updated accordingly.
filename = '~/./Documents/Harmonizome/CTD/Output/ctd_chemical_attribute_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [41]:
# Output directory handed to uf.createGeneAttributeEdgeList.
# It is resolved from the current user's home directory rather than a hard-coded
# '/Users/<account>/...' string, so the notebook is not tied to one machine account and
# points at the same folder that the save cells address as '~/./Documents/Harmonizome/CTD/Output/'.
# The trailing slash is kept because the value is passed on as a directory prefix.
path = os.path.expanduser('~/Documents/Harmonizome/CTD/Output/')

In [42]:
# Name passed to uf.createGeneAttributeEdgeList together with `path`; presumably used as
# the base name of the file it writes — confirm in untility_functions.py.
name = 'ctd_chemical_gene_attribute_edge_list'

In [43]:
# Generate the gene-attribute edge list from the binary matrix and the gene list with the
# project helper. The association count it prints equals the number of de-duplicated
# gene/chemical pairs in `df`, which suggests one edge per non-zero matrix entry
# (presumably written under `path`/`name` — confirm in untility_functions.py).
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  10366 Out of 10366   

 The number of statisticaly relevent gene-attribute associations is: 160332
