# ClinVar

Created by: Charles Dai <br>
Credit to: Moshe Silverstein

Data Source Home: https://www.ncbi.nlm.nih.gov/clinvar/ <br>
Data Source Download: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/

In [1]:
# appyter init
from appyter import magic
# Hand appyter a zero-argument callable that returns this notebook's globals().
# NOTE(review): presumably this is how the %%appyter cell magics used further
# down reach the notebook namespace -- confirm against the appyter docs.
magic.init(lambda _=globals: _())

In [2]:
import sys
import os
from datetime import date

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [3]:
# from clustergrammer_widget import *
# net = Network(clustergrammer_widget)

In [4]:
# Load IPython's autoreload extension; mode 2 picks up edits to imported
# modules (such as harmonizome.utility_functions) without a kernel restart.
%load_ext autoreload
%autoreload 2

### Notebook Information

In [5]:
# Record when, and under which interpreter, this notebook was executed.
print(
    'This notebook was run on:', date.today(),
    '\nPython version:', sys.version,
)

This notebook was run on: 2020-06-28 
Python version: 3.8.0 (default, Oct 28 2019, 16:14:01) 
[GCC 8.3.0]


# Initialization

### Load Mapping Dictionaries

In [6]:
# Build the two lookup objects used later in this notebook: `symbol_lookup` is
# passed to uf.map_symbols and `geneid_lookup` to uf.gene_list.
# NOTE(review): presumably symbol -> approved-symbol and symbol -> gene-ID
# mappings; confirm in harmonizome.lookup.
symbol_lookup, geneid_lookup = lookup.get_lookups()

Gathering sources: 100%|██████████| 3/3 [00:09<00:00,  3.19s/it]


### Output Path

In [7]:
# Prefix shared by every output file name written by this notebook.
output_name = 'clinvar'

# Directory that receives all generated files. exist_ok=True keeps the cell
# safe to re-run and avoids the check-then-create race of os.path.exists().
path = 'Output/ClinVar'
os.makedirs(path, exist_ok=True)

In [8]:
%%appyter hide_code
# Declares the 'data' section of the appyter form; the FileField in the
# Load Data cell attaches to it through section='data'.
{% do SectionField(
    name='data',
    title='Load Data',
    subtitle='Upload Files from the ClinVar Database',
) %}

# Load Data

In [57]:
%%appyter code_exec

# Read the tab-separated ClinVar variant summary, loading only the four
# columns used downstream. The FileField expression is rendered to a file path
# before execution (its default is 'Input/ClinVar/variant_summary.txt.gz') and
# its constraint restricts the field to names ending in .gz.
df = pd.read_csv({{FileField(
    constraint='.*\.gz$',
    name='variant_summary', 
    label='Variant Summary (txt.gz)', 
    default='Input/ClinVar/variant_summary.txt.gz',
    section='data')
}}, sep='\t', usecols=
    ['GeneSymbol', 'ClinSigSimple', 'PhenotypeList', 'ReviewStatus'])

```python
df = pd.read_csv('Input/ClinVar/variant_summary.txt.gz', sep='\t', usecols=
    ['GeneSymbol', 'ClinSigSimple', 'PhenotypeList', 'ReviewStatus'])
```

In [58]:
df.head()

Unnamed: 0,GeneSymbol,ClinSigSimple,PhenotypeList,ReviewStatus
0,USP9X,1,not provided,"criteria provided, single submitter"
1,USP9X,1,not provided,"criteria provided, single submitter"
2,NONO,1,not provided,"criteria provided, single submitter"
3,NONO,1,not provided,"criteria provided, single submitter"
4,DDX3X,0,not specified,"criteria provided, single submitter"


In [59]:
df.shape

(1542521, 4)

# Pre-process Data

## Get Relevant Data

In [60]:
# Get only relevant review status
# Keep only variants whose review status is one of the two accepted levels.
df = df[df['ReviewStatus'].isin([
    'reviewed by expert panel',
    'criteria provided, multiple submitters, no conflicts',
])]
# Discard rows whose ClinSigSimple flag is 0.
# NOTE(review): the ClinVar README appears to define ClinSigSimple = -1 as well
# (no clinical significance recorded); `!= 0` keeps such rows. Confirm whether
# `== 1` was the intent.
df = df.loc[df['ClinSigSimple'] != 0]
df.shape

(234385, 4)


(48871, 4)

In [61]:
# Only the gene symbol and its phenotype list are needed from here on.
df = df.loc[:, ['GeneSymbol', 'PhenotypeList']]
df.head()

Unnamed: 0,GeneSymbol,PhenotypeList
78,IQSEC2,"Mental retardation, X-linked 1;not provided"
79,IQSEC2,"Mental retardation, X-linked 1;not provided"
188,EDA,Hypohidrotic X-linked ectodermal dysplasia;not...
189,EDA,Hypohidrotic X-linked ectodermal dysplasia;not...
190,IL2RG,X-linked severe combined immunodeficiency;not ...


## Split Attribute List

In [62]:
# Split each ';'-separated phenotype string and emit one row per
# (gene, phenotype) pair, indexed by gene symbol.
# - dropna: a row without a phenotype string carries no attribute, and calling
#   str.split through a plain lambda raised AttributeError on such values.
# - assign + .str.split: builds a new frame instead of writing into a filtered
#   slice, which avoids SettingWithCopyWarning.
# - rename by name rather than overwriting df.columns positionally, so the
#   result does not depend on column order.
df = (
    df.dropna(subset=['PhenotypeList'])
    .assign(PhenotypeList=lambda d: d['PhenotypeList'].str.split(';'))
    .explode('PhenotypeList')
    .rename(columns={'GeneSymbol': 'Gene Symbol', 'PhenotypeList': 'Phenotype'})
    .set_index('Gene Symbol')
)
df.head()

Unnamed: 0_level_0,Phenotype
Gene Symbol,Unnamed: 1_level_1
IQSEC2,"Mental retardation, X-linked 1"
IQSEC2,not provided
IQSEC2,"Mental retardation, X-linked 1"
IQSEC2,not provided
EDA,Hypohidrotic X-linked ectodermal dysplasia


# Filter Data

## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [63]:
# Run the gene symbols through `symbol_lookup`.
# NOTE(review): per the section heading this normalizes to current approved
# symbols; remove_duplicates=True presumably drops repeated gene/phenotype
# pairs -- confirm in harmonizome.utility_functions.
df = uf.map_symbols(df, symbol_lookup, remove_duplicates=True)
df.shape

100%|██████████| 134445/134445 [00:00<00:00, 621161.22it/s]


(8962, 1)

# Analyze Data

## Create Binary Matrix

In [23]:
# Build the gene x phenotype matrix from the gene/phenotype pairs. In the
# recorded output below, rows are gene symbols, columns are phenotypes and the
# cells are booleans.
binary_matrix = uf.binary_matrix(df)
binary_matrix.head()

Phenotype,2-3 toe syndactyly,"Abnormal aggressive, impulsive or violent behavior",Abnormality of the aortic valve,Abnormality of the tongue,Acute lymphoid leukemia,Acute myeloid leukemia,Adenocarcinoma,"Adenocarcinoma of lung, response to tyrosine kinase inhibitor in, somatic",Adenocarcinoma of prostate,Adenocarcinoma of stomach,...,rosuvastatin response - Other,sensorineural hearing loss disorder,sevoflurane response - Toxicity/ADR,simvastatin acid response - Metabolism/PK,simvastatin response - Toxicity/ADR,succinylcholine response - Toxicity/ADR,tegafur response - Toxicity/ADR,"tegafur response - Toxicity/ADR, Metabolism/PK",volatile anesthetics response - Toxicity/ADR,warfarin response - Dosage
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
APOE,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
BRAF,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
BRCA1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
BRCA2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
CACNA1S,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,True,False,False,False,False


In [None]:
binary_matrix.shape

In [None]:
# Write the binary matrix under `path` as '<output_name>_binary_matrix',
# requesting npz compression and uint8 storage.
uf.save_data(binary_matrix, path, output_name + '_binary_matrix', 
            compression='npz', dtype=np.uint8)

## Create Gene List

In [None]:
# Derive the gene list from the binary matrix with the help of `geneid_lookup`.
# NOTE(review): presumably one row per gene together with its gene ID -- confirm.
gene_list = uf.gene_list(binary_matrix, geneid_lookup)
gene_list.head()

In [None]:
gene_list.shape

In [None]:
# Write the gene list under `path` as '<output_name>_gene_list', requesting a
# gzip-compressed TSV; index=False is passed (presumably to omit the index).
uf.save_data(gene_list, path, output_name + '_gene_list',
            ext='tsv', compression='gzip', index=False)

## Create Attribute List

In [None]:
# Derive the attribute list from the binary matrix.
# NOTE(review): presumably one row per phenotype column -- confirm.
attribute_list = uf.attribute_list(binary_matrix)
attribute_list.head()

In [None]:
attribute_list.shape

In [None]:
# Write the attribute list under `path` as '<output_name>_attribute_list',
# requesting a gzip-compressed TSV.
uf.save_data(attribute_list, path, output_name + '_attribute_list',
            ext='tsv', compression='gzip')

## Create Gene and Attribute Set Libraries

In [None]:
# Save the 'gene' / 'up' set library as '<output_name>_gene_up_set'.
# NOTE(review): exact set-library layout is defined by uf.save_setlib -- confirm.
uf.save_setlib(binary_matrix, 'gene', 'up', path, output_name + '_gene_up_set')

In [None]:
# Save the 'attribute' / 'up' set library as '<output_name>_attribute_up_set'.
# NOTE(review): exact set-library layout is defined by uf.save_setlib -- confirm.
uf.save_setlib(binary_matrix, 'attribute', 'up', path, 
                           output_name + '_attribute_up_set')

## Create Attribute Similarity Matrix

In [None]:
# Jaccard similarity between attributes: the matrix is transposed so that
# phenotypes, not genes, are the rows handed to the helper.
# NOTE(review): sparse=True presumably selects a sparse computation path -- confirm.
attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
# Write the attribute similarity matrix under `path`, requesting npz
# compression and float32 storage; symmetric=True is passed to the helper
# (presumably so only one triangle needs to be stored -- confirm).
uf.save_data(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

In [None]:
# net.load_df(attribute_similarity_matrix.iloc[:,:].copy())
# net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

## Create Gene Similarity Matrix

In [None]:
# Jaccard similarity between genes: the matrix is passed untransposed, so gene
# symbols are the rows handed to the helper.
# NOTE(review): sparse=True presumably selects a sparse computation path -- confirm.
gene_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
gene_similarity_matrix.head()

In [None]:
# Write the gene similarity matrix under `path`, requesting npz compression
# and float32 storage; symmetric=True is passed to the helper (presumably so
# only one triangle needs to be stored -- confirm).
uf.save_data(gene_similarity_matrix, path, 
            output_name + '_gene_similarity_matrix',
            compression='npz', symmetric=True, dtype=np.float32)

## Create Gene-Attribute Edge List

In [None]:
# Convert the binary matrix into a gene-attribute edge list and save it as a
# gzip-compressed TSV.
# `uf` is a module, so the previous `uf(binary_matrix)` raised
# "TypeError: 'module' object is not callable"; the helper is called by name
# here, following the uf.gene_list / uf.attribute_list pattern used above.
# NOTE(review): confirm the helper is named `edge_list` in
# harmonizome.utility_functions.
edge_list = uf.edge_list(binary_matrix)
# The suffix now carries the leading underscore used by every other output
# file ('clinvar_edge_list' rather than 'clinvaredge_list').
uf.save_data(edge_list, path, output_name + '_edge_list',
             ext='tsv', compression='gzip')

# Create Downloadable Save File

In [None]:
# Bundle the output directory for download.
# NOTE(review): the download link below points at ./output_archive.zip --
# confirm that this is the file uf.archive writes.
uf.archive(path)

### Link to download output files: [click here](./output_archive.zip)