# GWAS Catalog (SNP-Phenotype Associations)

Author: Moshe Silverstein <br/>
Date: 9-17 <br/>
Data Source: http://www.ebi.ac.uk/gwas/docs/file-downloads

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/GWAS Catalog/untility_functions.py'>

# Load Data

In [3]:
df = pd.read_csv('Input/gwas_catalog_v1.0.1-associations_e90_r2017-08-31.tsv', sep='\t', low_memory=False)

In [4]:
df.head()

Unnamed: 0,DATE ADDED TO CATALOG,PUBMEDID,FIRST AUTHOR,DATE,JOURNAL,LINK,STUDY,DISEASE/TRAIT,INITIAL SAMPLE SIZE,REPLICATION SAMPLE SIZE,...,P-VALUE,PVALUE_MLOG,P-VALUE (TEXT),OR or BETA,95% CI (TEXT),PLATFORM [SNPS PASSING QC],CNV,MAPPED_TRAIT,MAPPED_TRAIT_URI,STUDY ACCESSION
0,2009-09-28,18403759,Ober C,2008-04-09,N Engl J Med,www.ncbi.nlm.nih.gov/pubmed/18403759,Effect of variation in CHI3L1 on serum YKL-40 ...,YKL-40 levels,632 Hutterite individuals,"443 European ancestry cases, 491 European ance...",...,1e-13,13.0,,0.3,[NR] ng/ml decrease,Affymetrix [290325],N,YKL40 measurement,http://www.ebi.ac.uk/efo/EFO_0004869,GCST000177
1,2008-06-16,18369459,Liu Y,2008-04-04,PLoS Genet,www.ncbi.nlm.nih.gov/pubmed/18369459,A genome-wide association study of psoriasis a...,Psoriasis,"218 European ancestry cases, 519 European ance...","1,153 European ancestry cases, 1,217 European ...",...,2e-06,5.69897,,1.41,[1.22-1.61],Illumina [305983],N,psoriasis,http://www.ebi.ac.uk/efo/EFO_0000676,GCST000173
2,2008-06-16,18385676,Amos CI,2008-04-03,Nat Genet,www.ncbi.nlm.nih.gov/pubmed/18385676,Genome-wide association scan of tag SNPs ident...,Lung cancer,"1,154 European ancestry cases, 1,137 European ...","2,724 European ancestry cases, 3,694 European ...",...,3e-18,17.522879,,1.3,[1.15-1.47],Illumina [317498],N,lung carcinoma,http://www.ebi.ac.uk/efo/EFO_0001071,GCST000172
3,2008-06-16,18385676,Amos CI,2008-04-03,Nat Genet,www.ncbi.nlm.nih.gov/pubmed/18385676,Genome-wide association scan of tag SNPs ident...,Lung cancer,"1,154 European ancestry cases, 1,137 European ...","2,724 European ancestry cases, 3,694 European ...",...,7e-06,5.154902,,1.22,[1.10-1.35],Illumina [317498],N,lung carcinoma,http://www.ebi.ac.uk/efo/EFO_0001071,GCST000172
4,2008-06-16,18385676,Amos CI,2008-04-03,Nat Genet,www.ncbi.nlm.nih.gov/pubmed/18385676,Genome-wide association scan of tag SNPs ident...,Lung cancer,"1,154 European ancestry cases, 1,137 European ...","2,724 European ancestry cases, 3,694 European ...",...,8e-06,5.09691,,1.16,[1.05-1.28],Illumina [317498],N,lung carcinoma,http://www.ebi.ac.uk/efo/EFO_0001071,GCST000172


# Get Relevent Data

In [17]:
df = df[['DISEASE/TRAIT', 'MAPPED_GENE']]

In [19]:
df.head(10)

Unnamed: 0,DISEASE/TRAIT,MAPPED_GENE
0,YKL-40 levels,CHI3L1
1,Psoriasis,COG6
2,Lung cancer,HYKK
3,Lung cancer,CRPP1 - CRP
4,Lung cancer,IL1RAP
5,Lung cancer,HYKK
6,Nicotine dependence,CHRNA3
7,Colorectal cancer,CASC8
8,Colorectal cancer,"COLCA1, COLCA2"
9,Colorectal cancer,SMAD7


In [33]:
df_interactions = pd.DataFrame()

for i, index in enumerate(df.index):
    
    progressPercent = ((i+1)/len(df.index))*100

    sys.stdout.write("Progeres: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(df.index)))
    sys.stdout.flush()
    
    if type(df.loc[index, 'MAPPED_GENE']) != float:
        if ',' in df.loc[index, 'MAPPED_GENE']:
            lst1 = df.loc[index, 'MAPPED_GENE'].split(',')
        elif '-' in df.loc[index, 'MAPPED_GENE']:
            lst1 = df.loc[index, 'MAPPED_GENE'].split('-')
        else:
            lst1 = [df.loc[index, 'MAPPED_GENE']]
        lst2 = [df.loc[index, 'DISEASE/TRAIT']]*(len(lst1))
        temp = pd.DataFrame()
        temp['Phenotype'] = lst2
        temp['Gene Name'] = lst1
        df_interactions = pd.concat([df_interactions, temp]) 

Progeres: 100%  50085 Out of 50085   

In [34]:
df_interactions.head(10)

Unnamed: 0,Phenotype,Gene Name
0,YKL-40 levels,CHI3L1
0,Psoriasis,COG6
0,Lung cancer,HYKK
0,Lung cancer,CRPP1
1,Lung cancer,CRP
0,Lung cancer,IL1RAP
0,Lung cancer,HYKK
0,Nicotine dependence,CHRNA3
0,Colorectal cancer,CASC8
0,Colorectal cancer,COLCA1


In [35]:
df_interactions.shape

(71968, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [36]:
df_interactions.set_index('Gene Name', inplace=True)

In [37]:
uf.mapgenesymbols(df_interactions)

Progeres: 100%  71968 Out of 71968   

# Drop Duplicates

In [38]:
df_interactions.reset_index(inplace=True)

In [39]:
df_interactions.drop_duplicates(inplace=True)

In [40]:
df_interactions.shape

(18773, 2)

# Create Binary Matrix

In [41]:
binary_matrix = uf.createBinaryMatix(df_interactions)

Progeres: 100%  6990 Out of 6990   

In [42]:
binary_matrix.head()

Unnamed: 0,Circulating phylloquinone levels,Interleukin-2 levels,Body mass index (ever vs never smoking interaction),Cocaine dependence,Thyroid-associated orbitopathy in graves' disease,"Facial morphology (factor 10, width of nasal floor)",Childhood and early adolescence aggressive behavior,Major depressive disorder,Blood pressure measurement (cold pressor test),Vertical cup-disc ratio,...,Hepatic lipid content in extreme obesity,Ideal cardiovascular health (clinical and behavioural),Number of children (6+ vs. 0 or 1),Coronary artery disease or ischemic stroke,Body mass index (joint analysis main effects and physical activity interaction),Coronary heart disease event reduction in response to statin therapy (interaction),Spherical equivalent (joint analysis main effects and education interaction),Response to antipsychotic treatment,Temperament (bipolar disorder),"DDT metabolite (p,p'-DDE levels)"
IFNGR2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TEK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LIMCH1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SH3PXD2A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MICA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
binary_matrix.shape

(6990, 1978)

# Save Binary Matrix

In [44]:
filename = '~/./Documents/Harmonizome/GWAS Catalog/Output/gwascatalog_pheno_binary_matrix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [45]:
path = '/Users/moshesilverstein/Documents/Harmonizome/GWAS Catalog/Output/'

In [46]:
name = 'gwascatalog_pheno_gene_set'

In [47]:
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  1978 Out of 1978   

# Create Attribute Library

In [48]:
path = '/Users/moshesilverstein/Documents/Harmonizome/GWAS Catalog/Output/'

In [49]:
name = 'gwascatalog_pheno_attribute_set'

In [50]:
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  6990 Out of 6990   

# Create Gene Similarity Matrix

In [51]:
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [52]:
gene_similarity_matix.head()

Unnamed: 0,IFNGR2,TEK,LIMCH1,SH3PXD2A,MICA,SDK2,POU1F1,CARD9,VAX2,IL1RAPL1,...,HCG23,PEBP4,CNNM2,FBXO47,SLC38A4,YTHDF3,SLBP,DIS3,NARF,MARK3
IFNGR2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.375,0.0,0.0,...,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0
TEK,0.0,1.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LIMCH1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SH3PXD2A,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MICA,0.0,0.083333,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0


# Save Gene Similarity Matrix

In [53]:
filename = '~/./Documents/Harmonizome/GWAS Catalog/Output/gwascatalog_pheno_gene_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity matrix

In [54]:
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [55]:
attribute_similarity_matix.head()

Unnamed: 0,Circulating phylloquinone levels,Interleukin-2 levels,Body mass index (ever vs never smoking interaction),Cocaine dependence,Thyroid-associated orbitopathy in graves' disease,"Facial morphology (factor 10, width of nasal floor)",Childhood and early adolescence aggressive behavior,Major depressive disorder,Blood pressure measurement (cold pressor test),Vertical cup-disc ratio,...,Hepatic lipid content in extreme obesity,Ideal cardiovascular health (clinical and behavioural),Number of children (6+ vs. 0 or 1),Coronary artery disease or ischemic stroke,Body mass index (joint analysis main effects and physical activity interaction),Coronary heart disease event reduction in response to statin therapy (interaction),Spherical equivalent (joint analysis main effects and education interaction),Response to antipsychotic treatment,Temperament (bipolar disorder),"DDT metabolite (p,p'-DDE levels)"
Circulating phylloquinone levels,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Interleukin-2 levels,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029412,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Body mass index (ever vs never smoking interaction),0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cocaine dependence,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Thyroid-associated orbitopathy in graves' disease,0.0,0.0,0.0,0.0,1.0,0.166667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [56]:
filename = '~/./Documents/Harmonizome/GWAS Catalog/Output/gwascatalog_pheno_attribute_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [57]:
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  6990 Out of 6990   

In [58]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,IFNGR2,3460.0
1,TEK,7010.0
2,LIMCH1,22998.0
3,SH3PXD2A,9644.0
4,MICA,100507000.0


In [59]:
gene_list.shape

(6990, 2)

# Save Gene List

In [60]:
filename = '~/./Documents/Harmonizome/GWAS Catalog/Output/gwascatalog_pheno_gene_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [61]:
attribute_list = uf.createAttributeList(binary_matrix)

In [62]:
attribute_list.head()

Unnamed: 0,Attributes
0,Circulating phylloquinone levels
1,Interleukin-2 levels
2,Body mass index (ever vs never smoking interac...
3,Cocaine dependence
4,Thyroid-associated orbitopathy in graves' disease


In [63]:
attribute_list.shape

(1978, 1)

# Save Attribute List

In [64]:
filename = '~/./Documents/Harmonizome/GWAS Catalog/Output/gwascatalog_pheno_attribute_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [65]:
path = '/Users/moshesilverstein/Documents/Harmonizome/GWAS Catalog/Output/'

In [66]:
name = 'gwascatalog_pheno_gene_attribute_edge_list'

In [67]:
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  1978 Out of 1978   

 The number of statisticaly relevent gene-attribute associations is: 18773
