 # GWAS Catalog (SNP-Phenotype Associations)

 Author: Moshe Silverstein <br/>
 Date: 9-17 <br/>
 Data Source: http://www.ebi.ac.uk/gwas/docs/file-downloads

 Reviewer: Charles Dai <br>
 Updated: 6-20

In [1]:
# appyter init
from appyter import magic
magic.init(lambda _=globals: _())

In [2]:
import sys
import os
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import utility_functions as uf
import lookup

In [3]:
# from clustergrammer_widget import *
# net = Network(clustergrammer_widget)

In [4]:
%load_ext autoreload
%autoreload 2

 ### Python Version

In [5]:
sys.version

'3.8.0 (default, Oct 28 2019, 16:14:01) \n[GCC 8.3.0]'

 # Initialization

 ### Load Mapping Dictionaries

In [6]:
symbol_lookup, geneid_lookup = lookup.get_lookups()

Gathering sources: 100%|██████████| 3/3 [00:08<00:00,  2.92s/it]


 ### Output Path

In [7]:
output_name = 'gwas_catalog'

path = 'Output/GWAS-CATALOG'
if not os.path.exists(path):
    os.makedirs(path)

In [8]:
%%appyter hide_code
{% do SectionField(
    name='data',
    title='Load Data',
    subtitle='Upload files from the GWAS CAtalog Associations Data Set',
) %}

 # Load Data

In [9]:
%%appyter code_exec

df = pd.read_csv({{FileField(
    constraint='.*\.tsv$',
    name='associations', 
    label='All Associations', 
    default='Input/GWAS-Catalog/gwas_catalog_v1.0.2-associations_e100_r2020-06-04.tsv',
    section='data')
}}, sep='\t', usecols=['DISEASE/TRAIT', 'MAPPED_GENE'])

```python

df = pd.read_csv('Input/GWAS-Catalog/gwas_catalog_v1.0.2-associations_e100_r2020-06-04.tsv', sep='\t', usecols=['DISEASE/TRAIT', 'MAPPED_GENE'])
```

In [10]:
df.head()

Unnamed: 0,DISEASE/TRAIT,MAPPED_GENE
0,Sudden cardiac arrest,FTH1P5 - AL158050.2
1,Sudden cardiac arrest,LINC02376 - AC073913.1
2,Sudden cardiac arrest,CHRNB4
3,Sudden cardiac arrest,"MARCHF10, AC005821.1"
4,Sudden cardiac arrest,CDH4


In [11]:
df.shape

(186829, 2)

 # Pre-process Data

 ## Split Gene Lists

In [12]:
df = df.dropna()
df['MAPPED_GENE'] = df['MAPPED_GENE'].str.split(pat= '; | - ')
df.head()

Unnamed: 0,DISEASE/TRAIT,MAPPED_GENE
0,Sudden cardiac arrest,"[FTH1P5, AL158050.2]"
1,Sudden cardiac arrest,"[LINC02376, AC073913.1]"
2,Sudden cardiac arrest,[CHRNB4]
3,Sudden cardiac arrest,"[MARCHF10, AC005821.1]"
4,Sudden cardiac arrest,[CDH4]


In [13]:
df = df.explode('MAPPED_GENE')
df = df.set_index('MAPPED_GENE')
df.head()

Unnamed: 0_level_0,DISEASE/TRAIT
MAPPED_GENE,Unnamed: 1_level_1
FTH1P5,Sudden cardiac arrest
AL158050.2,Sudden cardiac arrest
LINC02376,Sudden cardiac arrest
AC073913.1,Sudden cardiac arrest
CHRNB4,Sudden cardiac arrest


In [14]:
df.shape

(253030, 1)

 # Filter Data

 ## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [91]:
df = uf.mapgenesymbols(df, symbol_lookup)
df.shape

100%|██████████| 253030/253030 [00:00<00:00, 665767.13it/s]


(175381, 1)

 # Analyze Data

 ## Create Binary Matrix

In [95]:
binary_matrix = uf.createBinaryMatrix(df)
binary_matrix.head()

Unnamed: 0,&beta;2-Glycoprotein I (&beta;2-GPI) plasma levels,"1,5-anhydroglucitol levels",17-hydroxyprogesterone (17-OHP) levels,3-hydroxy-1-methylpropylmercapturic acid levels in smokers,3-hydroxypropylmercapturic acid levels in smokers,3-month functional outcome in ischaemic stroke (modified Rankin score),3-month functional outcome in lacunar ischaemic stroke (modified Rankin score),3-month functional outcome in non-lacunar ischaemic stroke (modified Rankin score),5-HTT brain serotonin transporter levels,6-month creatinine clearance change response to tenofovir treatment in HIV infection (treatment arm interaction),...,lower facial morphology traits (quantitative measurement),melanoma-derived growth regulatory protein levels,middle facial morphology traits (quantitative measurement),perceptual and visual search speed (trail making test A) (age interaction),same-sex sexual behaviour,vWF and FVIII levels,vWF levels,vWF levels in ischaemic stroke and hyperhomocysteinaemia,vascular endothelial growth factor D levels,visceral adipose tissue/total adipose tissue ratio
A1CF,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2ML1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2MP1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A4GALT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
binary_matrix.shape

(17954, 4101)

In [97]:
uf.saveData(binary_matrix, path, output_name + '_binary_matrix', 
            compression='npz', dtype=np.uint8)

 ## Create Gene List

In [98]:
gene_list = uf.createGeneList(binary_matrix, geneid_lookup)
gene_list.head()

100%|██████████| 17954/17954 [00:00<00:00, 188258.04it/s]


Unnamed: 0,GeneSym,GeneID
0,A1CF,29974
1,A2M,2
2,A2ML1,144568
3,A2MP1,3
4,A4GALT,53947


In [99]:
gene_list.shape

(17954, 2)

In [100]:
uf.saveData(gene_list, path, output_name + '_gene_list',
            ext='tsv', compression='gzip', index=False)

 ## Create Attribute List

In [101]:
attribute_list = uf.createAttributeList(binary_matrix)
attribute_list.head()

&beta;2-Glycoprotein I (&beta;2-GPI) plasma levels
"1,5-anhydroglucitol levels"
17-hydroxyprogesterone (17-OHP) levels
3-hydroxy-1-methylpropylmercapturic acid levels in smokers
3-hydroxypropylmercapturic acid levels in smokers


In [102]:
attribute_list.shape

(4101, 0)

In [103]:
uf.saveData(attribute_list, path, output_name + '_attribute_list',
            ext='tsv', compression='gzip')

 ## Create Gene and Attribute Set Libraries

In [104]:
uf.createUpGeneSetLib(binary_matrix, path, output_name + '_gene_up_set')

100%|██████████| 2135/2135 [00:00<00:00, 17208.93it/s]


In [105]:
uf.createUpAttributeSetLib(binary_matrix, path, 
                           output_name + '_attribute_up_set')

100%|██████████| 17954/17954 [00:00<00:00, 34317.98it/s]


 ## Create Attribute Similarity Matrix

In [106]:
attribute_similarity_matrix = uf.createSimilarityMatrix(
                                binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
uf.saveData(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

In [None]:
# net.load_df(attribute_similarity_matrix.iloc[:,:].copy())
# net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

 ## Create Gene Similarity Matrix

In [None]:
gene_similarity_matrix = uf.createSimilarityMatrix(binary_matrix, 'jaccard', sparse=True)
gene_similarity_matrix.head()

In [None]:
uf.saveData(gene_similarity_matrix, path, 
            output_name + '_gene_similarity_matrix',
            compression='npz', symmetric=True, dtype=np.float32)

 ## Create Gene-Attribute Edge List

In [None]:
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, 
                               path, output_name + '_gene_attribute_edge_list')

 # Create Downloadable Save File

In [None]:
uf.createArchive(path)

 ### Link to download output files: [click here](./output_archive.zip)