# ClinVar

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Downloaded: 08-2017 <br/>
Data Source: http://www.ncbi.nlm.nih.gov/clinvar/

In [1]:
import sys, datetime
import numpy as np
import pandas as pd
import importlib
import my_functions as mf
import goenrich
%matplotlib inline

In [2]:
# Reload my_functions so that edits made to the module file take effect
# without restarting the kernel.
importlib.reload(mf)

<module 'my_functions' from '/Users/moshesilverstein/Documents/Harmonizome/ClinVar/my_functions.py'>

# Load Data

In [3]:
# Read the tab-separated ClinVar variant summary.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so no column ends up holding mixed types (the
# usual cause of a DtypeWarning when loading a large file).
df = pd.read_csv('Input/variant_summary.txt', sep='\t', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Preview the first rows of the raw table to check how the columns parsed.
df.head()

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,Stop,ReferenceAllele,AlternateAllele,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories
0,15041,indel,NM_014855.2(AP5Z1):c.80_83delGGATinsTGCTGTAAAC...,9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704705,...,4820847,GGAT,TGCTGTAAACTGTAACTGTAAA,7p22.1,no assertion criteria provided,1,,N,OMIM Allelic Variant:613653.0001,1
1,15041,indel,NM_014855.2(AP5Z1):c.80_83delGGATinsTGCTGTAAAC...,9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704705,...,4781216,GGAT,TGCTGTAAACTGTAACTGTAAA,7p22.1,no assertion criteria provided,1,,N,OMIM Allelic Variant:613653.0001,1
2,15042,deletion,NM_014855.2(AP5Z1):c.1413_1426delGGACCTGCCCTGC...,9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704709,...,4827379,GGACCTGCCCTGCT,-,7p22.1,no assertion criteria provided,1,,N,OMIM Allelic Variant:613653.0002,1
3,15042,deletion,NM_014855.2(AP5Z1):c.1413_1426delGGACCTGCCCTGC...,9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704709,...,4787748,GGACCTGCCCTGCT,-,7p22.1,no assertion criteria provided,1,,N,OMIM Allelic Variant:613653.0002,1
4,15043,single nucleotide variant,NM_014630.2(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0,"Jun 29, 2015",150829393,...,85342440,G,A,15q25,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613624.0001,UniProtKB (pr...",1


In [5]:
# Dimensions (rows, columns) of the raw table.
df.shape

(648289, 30)

# Get Relevant Data

In [6]:
# Dimensions of the subset of rows whose review status is
# 'reviewed by expert panel'.
df.loc[df['ReviewStatus'].eq('reviewed by expert panel')].shape

(17912, 30)

In [7]:
# Dimensions of the subset of rows whose review status is
# 'criteria provided, multiple submitters, no conflicts'.
df.loc[df['ReviewStatus'].eq('criteria provided, multiple submitters, no conflicts')].shape

(67319, 30)

In [8]:
# Keep only rows with one of the two selected review statuses. The subsets are
# concatenated in this order (expert panel first), so the row order of the
# result follows the status order below rather than the original file order.
df = pd.concat([
    df.loc[df['ReviewStatus'].eq(status)]
    for status in (
        'reviewed by expert panel',
        'criteria provided, multiple submitters, no conflicts',
    )
])

In [9]:
# Dimensions after keeping only the two selected review statuses.
df.shape

(85231, 30)

In [10]:
# Only the gene symbol and its phenotype list are needed from here on.
df = df.loc[:, ['GeneSymbol', 'PhenotypeList']]

In [11]:
# Preview the gene / phenotype-list pairs.
df.head()

Unnamed: 0,GeneSymbol,PhenotypeList
10740,DPYD,Dihydropyrimidine dehydrogenase deficiency;Flu...
10741,DPYD,Dihydropyrimidine dehydrogenase deficiency;Flu...
10742,DPYD,Dihydropyrimidine dehydrogenase deficiency;Flu...
11265,NAT2,Slow acetylator due to N-acetyltransferase enz...
11266,NAT2,Slow acetylator due to N-acetyltransferase enz...


In [12]:
# Collapse repeated (GeneSymbol, PhenotypeList) rows, keeping the first of each.
df = df.drop_duplicates()

In [13]:
# Number of distinct (GeneSymbol, PhenotypeList) rows remaining.
df.shape

(7363, 2)

In [14]:
# Expand every (GeneSymbol, 'p1;p2;p3') row of df into one
# (GeneSymbol, Phenotype) row per phenotype in the ';'-separated list.
#
# The pairs are collected in plain lists and turned into a DataFrame once at
# the end: growing a DataFrame with pd.concat inside the loop copies all the
# accumulated rows on every iteration, and DataFrame.ix (used previously) no
# longer exists in current pandas.
total_rows = len(df.index)
gene_column = []
phenotype_column = []

for i, (gene, phenotype_list) in enumerate(zip(df['GeneSymbol'], df['PhenotypeList'])):

    progressPercent = ((i+1)/total_rows)*100

    sys.stdout.write("Progeres: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), total_rows))
    sys.stdout.flush()

    # NOTE(review): the first entry of each phenotype list is dropped, exactly
    # as the original code did (lst2.pop(0)); rows that list a single phenotype
    # therefore contribute no pairs. Confirm this is intended.
    phenotypes = phenotype_list.split(';')[1:]

    gene_column.extend([gene]*len(phenotypes))
    phenotype_column.extend(phenotypes)

# Naming the columns explicitly keeps both of them present even when no pairs
# were produced, so the set_index('GeneSymbol') that follows still works.
# The result carries a plain 0..N-1 index; the next step replaces the index
# with GeneSymbol anyway.
df_interactions = pd.DataFrame(
    {'GeneSymbol': gene_column, 'Phenotype': phenotype_column},
    columns=['GeneSymbol', 'Phenotype'],
)

Progeres: 100%  7363 Out of 7363   

In [15]:
# Preview the expanded gene-phenotype pairs.
df_interactions.head()

Unnamed: 0,GeneSymbol,Phenotype
0,DPYD,Fluorouracil response
1,DPYD,Hirschsprung disease 1
2,DPYD,Pyrimidine analogues response - Toxicity/ADR
3,DPYD,capecitabine response - Toxicity/ADR
4,DPYD,fluorouracil response - Toxicity/ADR


In [16]:
# Number of gene-phenotype pairs produced by the expansion.
df_interactions.shape

(13567, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [17]:
# Move GeneSymbol into the index ahead of the symbol-mapping step.
df_interactions = df_interactions.set_index('GeneSymbol')

In [18]:
# Run the project's gene-symbol mapping helper on the GeneSymbol-indexed frame.
# NOTE(review): the return value is discarded, so the helper presumably
# updates df_interactions in place — confirm in my_functions.
mf.mapgenesymbols(df_interactions)

Progeres: 100%  13567 Out of 13567   

In [19]:
# Turn the index back into an ordinary column.
df_interactions = df_interactions.reset_index()

In [20]:
# Remove repeated gene-phenotype pairs, keeping the first occurrence of each.
df_interactions = df_interactions.drop_duplicates()

In [21]:
# Number of unique gene-phenotype pairs left after de-duplication.
df_interactions.shape

(5208, 2)

# Create Binary Matrix

In [22]:
# Build the matrix from the gene-phenotype pairs with the project helper.
# NOTE(review): judging by the preview that follows, rows are genes, columns
# are phenotypes and entries are 0/1 — confirm in my_functions.
binary_matrix = mf.createBinaryMatix(df_interactions)

Progeres: 100%  1786 Out of 1786   

In [23]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,Tyrosinase-negative oculocutaneous albinism,"Keratosis, seborrheic",Renal-hepatic-pancreatic dysplasia,Multiple mitochondrial dysfunctions syndrome,Glomerulopathy,Familial febrile seizures 8,Congenital atresia of mitral valve,Rippling muscle disease,"Transient Neonatal Diabetes, Dominant",Noonan syndrome 4,...,Juvenile hemochromatosis,Cardiac arrhythmia,Dyskeratosis congenita autosomal dominant,"Thrombocytopenia, X-linked","Paget disease of bone, familial",atazanavir and ritonavir response - Toxicity/ADR,Short QT Syndrome 4,"Carnitine palmitoyltransferase II deficiency, myopathic, stress-induced","Arrhythmogenic right ventricular cardiomyopathy, type 10",Waardenburg syndrome
SRCAP,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
COG8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TGM1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LDB3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SZT2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Dimensions of the binary matrix.
binary_matrix.shape

(1786, 1764)

# Save Binary Matrix

In [25]:
# Stamp the file name with the current year and month (YYYY_MM) and write the
# binary matrix as gzip-compressed, tab-separated text.
# NOTE(review): the name ends in '.zip' although the payload is gzip — confirm
# which one downstream consumers expect.
filename = '~/./Documents/Harmonizome/ClinVar/Output/clinvar_binary_matrix_%s.tsv.zip' % (
    datetime.date.today().strftime('%Y_%m'),
)
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [26]:
# Output directory handed to the gene set library helper.
# NOTE(review): hard-coded, machine-specific absolute path.
path = '/Users/moshesilverstein/Documents/Harmonizome/ClinVar/Output/'

In [27]:
# Base name handed to the gene set library helper.
name = 'clinvar_gene_set'

In [28]:
# Create the gene set library from the binary matrix with the project helper.
# NOTE(review): presumably writes its output under `path` using `name` —
# confirm in my_functions.
mf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  1764 Out of 1764   

# Create Attribute Library

In [29]:
# Output directory handed to the attribute set library helper.
# NOTE(review): hard-coded, machine-specific absolute path.
path = '/Users/moshesilverstein/Documents/Harmonizome/ClinVar/Output/'

In [30]:
# Base name handed to the attribute set library helper.
name = 'clinvar_attribute_set'

In [31]:
# Create the attribute set library from the binary matrix with the project
# helper.
# NOTE(review): presumably writes its output under `path` using `name` —
# confirm in my_functions.
mf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  1786 Out of 1786   

# Create Gene Similarity Matrix

In [32]:
# Ask the project helper for a 'jaccard' similarity matrix of the binary
# matrix as-is; the preview that follows is labelled by gene on both axes.
gene_similarity_matix = mf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [33]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,SRCAP,COG8,TGM1,LDB3,SZT2,MFN2,ADRB2,FANCA,COL11A2,NSD1,...,DLX3,CHRND,CHRNA1,RPL5,PYGM,KLF1,NDUFAF6,SLC40A1,TTBK2,PAK3
SRCAP,1.0,0.5,0.666667,0.333333,0.5,0.090909,0.0,0.666667,0.285714,0.105263,...,0.5,0.333333,0.333333,0.5,1.0,0.5,0.5,0.333333,1.0,0.5
COG8,0.5,1.0,0.333333,0.166667,1.0,0.045455,0.0,0.333333,0.142857,0.052632,...,1.0,0.5,0.5,1.0,0.5,1.0,1.0,0.5,0.5,0.0
TGM1,0.666667,0.333333,1.0,0.285714,0.333333,0.086957,0.0,0.5,0.25,0.1,...,0.333333,0.25,0.25,0.333333,0.666667,0.333333,0.333333,0.25,0.666667,0.333333
LDB3,0.333333,0.166667,0.285714,1.0,0.166667,0.076923,0.0,0.285714,0.181818,0.086957,...,0.166667,0.142857,0.142857,0.166667,0.333333,0.166667,0.166667,0.142857,0.333333,0.166667
SZT2,0.5,1.0,0.333333,0.166667,1.0,0.045455,0.0,0.333333,0.142857,0.052632,...,1.0,0.5,0.5,1.0,0.5,1.0,1.0,0.5,0.5,0.0


# Save Gene Similarity Matrix

In [34]:
# Stamp the file name with the current year and month (YYYY_MM) and write the
# gene similarity matrix as gzip-compressed, tab-separated text.
# NOTE(review): the name ends in '.zip' although the payload is gzip — confirm
# which one downstream consumers expect.
filename = '~/./Documents/Harmonizome/ClinVar/Output/clinvar_gene_similarity_matix_%s.tsv.zip' % (
    datetime.date.today().strftime('%Y_%m'),
)
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [35]:
# Same helper and metric, applied to the transposed binary matrix; the preview
# that follows is labelled by attribute on both axes.
attribute_similarity_matix = mf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [36]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,Tyrosinase-negative oculocutaneous albinism,"Keratosis, seborrheic",Renal-hepatic-pancreatic dysplasia,Multiple mitochondrial dysfunctions syndrome,Glomerulopathy,Familial febrile seizures 8,Congenital atresia of mitral valve,Rippling muscle disease,"Transient Neonatal Diabetes, Dominant",Noonan syndrome 4,...,Juvenile hemochromatosis,Cardiac arrhythmia,Dyskeratosis congenita autosomal dominant,"Thrombocytopenia, X-linked","Paget disease of bone, familial",atazanavir and ritonavir response - Toxicity/ADR,Short QT Syndrome 4,"Carnitine palmitoyltransferase II deficiency, myopathic, stress-induced","Arrhythmogenic right ventricular cardiomyopathy, type 10",Waardenburg syndrome
Tyrosinase-negative oculocutaneous albinism,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Keratosis, seborrheic",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Renal-hepatic-pancreatic dysplasia,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Multiple mitochondrial dysfunctions syndrome,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Glomerulopathy,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [37]:
# Stamp the file name with the current year and month (YYYY_MM) and write the
# attribute similarity matrix as gzip-compressed, tab-separated text.
# NOTE(review): the name ends in '.zip' although the payload is gzip — confirm
# which one downstream consumers expect.
filename = '~/./Documents/Harmonizome/ClinVar/Output/clinvar_attribute_similarity_matix_%s.tsv.zip' % (
    datetime.date.today().strftime('%Y_%m'),
)
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [38]:
# Build the gene table from the binary matrix with the project helper; the
# preview that follows shows GeneSym and GeneID columns.
gene_list = mf.createGeneList(binary_matrix)

Progeres: 100%  1786 Out of 1786   

In [39]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,SRCAP,10847
1,COG8,84342
2,TGM1,7051
3,LDB3,11155
4,SZT2,23334


In [40]:
# Dimensions of the gene list.
gene_list.shape

(1786, 2)

# Save Gene List

In [41]:
# Stamp the file name with the current year and month (YYYY_MM) and write the
# gene list, without its index, as gzip-compressed, tab-separated text.
# NOTE(review): the name ends in '.zip' although the payload is gzip — confirm
# which one downstream consumers expect.
filename = '~/./Documents/Harmonizome/ClinVar/Output/clinvar_gene_list_%s.tsv.zip' % (
    datetime.date.today().strftime('%Y_%m'),
)
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [42]:
# Build the attribute table from the binary matrix with the project helper;
# the preview that follows shows a single Attributes column.
attribute_list = mf.createAttributeList(binary_matrix)

In [43]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0,Attributes
0,Tyrosinase-negative oculocutaneous albinism
1,"Keratosis, seborrheic"
2,Renal-hepatic-pancreatic dysplasia
3,Multiple mitochondrial dysfunctions syndrome
4,Glomerulopathy


In [44]:
# Dimensions of the attribute list.
attribute_list.shape

(1764, 1)

# Save Attribute List

In [45]:
# Stamp the file name with the current year and month (YYYY_MM) and write the
# attribute list, without its index, as gzip-compressed, tab-separated text.
# NOTE(review): the name ends in '.zip' although the payload is gzip — confirm
# which one downstream consumers expect.
filename = '~/./Documents/Harmonizome/ClinVar/Output/clinvar_attribute_list_%s.tsv.zip' % (
    datetime.date.today().strftime('%Y_%m'),
)
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [46]:
# Output directory handed to the edge list helper.
# NOTE(review): hard-coded, machine-specific absolute path.
path = '/Users/moshesilverstein/Documents/Harmonizome/ClinVar/Output/'

In [47]:
# Base name handed to the edge list helper.
name = 'clinvar_gene_attribute_edge_list'

In [48]:
# Create the gene-attribute edge list from the binary matrix and the gene
# list with the project helper.
# NOTE(review): presumably writes its output under `path` using `name` —
# confirm in my_functions.
mf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  1764 Out of 1764   

 The number of statisticaly relevent gene-attribute associations is: 5208
