# ClinVar

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Downloaded: 08-2017 <br/>
Data Source: http://www.ncbi.nlm.nih.gov/clinvar/

In [40]:
import sys, datetime
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
import goenrich
%matplotlib inline

In [41]:
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/ClinVar/untility_functions.py'>

# Load Data

In [42]:
df = pd.read_csv('Input/variant_summary.txt', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [43]:
df.head()

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,Stop,ReferenceAllele,AlternateAllele,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories
0,15041,indel,NM_014855.2(AP5Z1):c.80_83delGGATinsTGCTGTAAAC...,9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704705,...,4820847,GGAT,TGCTGTAAACTGTAACTGTAAA,7p22.1,no assertion criteria provided,1,,N,OMIM Allelic Variant:613653.0001,1
1,15041,indel,NM_014855.2(AP5Z1):c.80_83delGGATinsTGCTGTAAAC...,9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704705,...,4781216,GGAT,TGCTGTAAACTGTAACTGTAAA,7p22.1,no assertion criteria provided,1,,N,OMIM Allelic Variant:613653.0001,1
2,15042,deletion,NM_014855.2(AP5Z1):c.1413_1426delGGACCTGCCCTGC...,9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704709,...,4827379,GGACCTGCCCTGCT,-,7p22.1,no assertion criteria provided,1,,N,OMIM Allelic Variant:613653.0002,1
3,15042,deletion,NM_014855.2(AP5Z1):c.1413_1426delGGACCTGCCCTGC...,9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704709,...,4787748,GGACCTGCCCTGCT,-,7p22.1,no assertion criteria provided,1,,N,OMIM Allelic Variant:613653.0002,1
4,15043,single nucleotide variant,NM_014630.2(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0,"Jun 29, 2015",150829393,...,85342440,G,A,15q25,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613624.0001,UniProtKB (pr...",1


In [44]:
df.shape

(648289, 30)

# Get Relevent Data

### Get Data with Review Status of 'reviewed by expert panel' or 'criteria provided, multiple submitters, no conflicts,' otherwise drop

In [45]:
df[df['ReviewStatus'] == 'reviewed by expert panel'].shape

(17912, 30)

In [46]:
df[df['ReviewStatus'] == 'criteria provided, multiple submitters, no conflicts'].shape

(67319, 30)

In [47]:
df = pd.concat([df[df['ReviewStatus'] == 'reviewed by expert panel'], df[df['ReviewStatus'] == 'criteria provided, multiple submitters, no conflicts']])

In [48]:
df.shape

(85231, 30)

### Drop all data with ClinSigSimple of 0

In [49]:
df = df[['GeneSymbol', 'PhenotypeList', 'ClinicalSignificance']]

In [50]:
df.replace(0, np.nan, inplace=True)

In [51]:
df.dropna(how="any", inplace=True)

### Get gene symbol and phenotype 

In [52]:
df = df[['GeneSymbol', 'PhenotypeList']]

In [53]:
df.head()

Unnamed: 0,GeneSymbol,PhenotypeList
10740,DPYD,Dihydropyrimidine dehydrogenase deficiency;Flu...
10741,DPYD,Dihydropyrimidine dehydrogenase deficiency;Flu...
10742,DPYD,Dihydropyrimidine dehydrogenase deficiency;Flu...
11265,NAT2,Slow acetylator due to N-acetyltransferase enz...
11266,NAT2,Slow acetylator due to N-acetyltransferase enz...


In [54]:
df.drop_duplicates(inplace=True)

In [55]:
df.shape

(7363, 2)

In [56]:
df_interactions = pd.DataFrame()

for i, index in enumerate(df.index):
    
    progressPercent = ((i+1)/len(df.index))*100

    sys.stdout.write("Progeres: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(df.index)))
    sys.stdout.flush()
    
    
    lst2 = df.ix[index, 'PhenotypeList'].split(';')
    lst1 = [df.ix[index, 'GeneSymbol']]*(len(lst2))
    temp = pd.DataFrame()
    temp['GeneSymbol'] = lst1
    temp['Phenotype'] = lst2
    df_interactions = pd.concat([df_interactions, temp]) 

Progeres: 0%  59 Out of 7363   

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Progeres: 100%  7363 Out of 7363   

In [57]:
df_interactions.head()

Unnamed: 0,GeneSymbol,Phenotype
0,DPYD,Dihydropyrimidine dehydrogenase deficiency
1,DPYD,Fluorouracil response
2,DPYD,Hirschsprung disease 1
3,DPYD,Pyrimidine analogues response - Toxicity/ADR
4,DPYD,capecitabine response - Toxicity/ADR


In [58]:
df_interactions.shape

(20930, 2)

# Drop all "not provided" data

In [59]:
df_interactions.replace('not provided', np.nan, inplace=True)

In [60]:
df_interactions.dropna(how='any', inplace=True)

In [61]:
df_interactions.shape

(18317, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [62]:
df_interactions.set_index('GeneSymbol', inplace=True)

In [63]:
uf.mapgenesymbols(df_interactions)

Progeres: 100%  18317 Out of 18317   

In [64]:
df_interactions.reset_index(inplace=True)

In [65]:
df_interactions.drop_duplicates(inplace=True)

In [66]:
df_interactions.shape

(6663, 2)

# Create Binary Matrix

In [67]:
binary_matrix = uf.createBinaryMatix(df_interactions)

Progeres: 100%  1952 Out of 1952   

In [68]:
binary_matrix.head()

Unnamed: 0,"Mental retardation, autosomal recessive 13",Atrioventricular septal defect 4,Congenital muscular dystrophy-dystroglycanopathy with brain and eye anomalies type A5,"Dementia, Deafness, and Sensory Neuropathy",Adult hypophosphatasia,"Glycogen storage disease, type VII",Isolated GnRH Deficiency,Familial cold urticaria,"PI, M1A",Congenital disorder of glycosylation type 1C,...,Aplastic anemia,Autoinflammation with infantile enterocolitis,"Vasculopathy, retinal, with cerebral leukodystrophy",Congenital heart disease,"Limb-girdle muscular dystrophy-dystroglycanopathy, type C1","Myopathy, RYR1-associated",Microcephalic osteodysplastic primordial dwarfism type 2,Dubin-Johnson syndrome,amisulpride response - Toxicity/ADR,"Pulmonary disease, chronic obstructive, susceptibility to"
ATP2C1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PRKCA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MERTK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RFXANK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NTRK1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
binary_matrix.shape

(1952, 2934)

# Save Binary Matrix

In [70]:
filename = '~/./Documents/Harmonizome/ClinVar/Output/clinvar_binary_matrix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [71]:
path = '/Users/moshesilverstein/Documents/Harmonizome/ClinVar/Output/'

In [72]:
name = 'clinvar_gene_set'

In [73]:
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  2934 Out of 2934   

# Create Attribute Library

In [74]:
path = '/Users/moshesilverstein/Documents/Harmonizome/ClinVar/Output/'

In [75]:
name = 'clinvar_attribute_set'

In [76]:
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  1952 Out of 1952   

# Create Gene Similarity Matrix

In [77]:
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [78]:
gene_similarity_matix.head()

Unnamed: 0,ATP2C1,PRKCA,MERTK,RFXANK,NTRK1,CTDP1,CRYBB3,GAMT,LAMC3,MMP13,...,PRKCG,ADAM9,CSPP1,WWOX,RHO,ZMYND11,PCCB,PUS1,ALPK3,ARG1
ATP2C1,1.0,0.0,0.333333,0.333333,0.333333,0.5,0.333333,0.25,0.5,0.0,...,0.333333,0.333333,0.333333,0.2,0.25,0.0,0.333333,0.333333,0.333333,0.333333
PRKCA,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MERTK,0.333333,0.0,1.0,0.333333,0.333333,0.5,0.333333,0.25,0.5,0.0,...,0.333333,0.333333,0.333333,0.2,0.25,0.0,0.333333,0.333333,1.0,0.333333
RFXANK,0.333333,0.0,0.333333,1.0,0.333333,0.5,0.333333,0.25,0.5,0.0,...,0.333333,0.333333,0.333333,0.2,0.25,0.0,0.333333,0.333333,0.333333,0.333333
NTRK1,0.333333,0.0,0.333333,0.333333,1.0,0.5,0.333333,0.25,0.5,0.0,...,0.333333,0.333333,0.333333,0.2,0.25,0.0,0.333333,0.333333,0.333333,0.333333


# Save Gene Similarity Matrix

In [79]:
filename = '~/./Documents/Harmonizome/ClinVar/Output/clinvar_gene_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity matrix

In [80]:
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [81]:
attribute_similarity_matix.head()

Unnamed: 0,"Mental retardation, autosomal recessive 13",Atrioventricular septal defect 4,Congenital muscular dystrophy-dystroglycanopathy with brain and eye anomalies type A5,"Dementia, Deafness, and Sensory Neuropathy",Adult hypophosphatasia,"Glycogen storage disease, type VII",Isolated GnRH Deficiency,Familial cold urticaria,"PI, M1A",Congenital disorder of glycosylation type 1C,...,Aplastic anemia,Autoinflammation with infantile enterocolitis,"Vasculopathy, retinal, with cerebral leukodystrophy",Congenital heart disease,"Limb-girdle muscular dystrophy-dystroglycanopathy, type C1","Myopathy, RYR1-associated",Microcephalic osteodysplastic primordial dwarfism type 2,Dubin-Johnson syndrome,amisulpride response - Toxicity/ADR,"Pulmonary disease, chronic obstructive, susceptibility to"
"Mental retardation, autosomal recessive 13",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Atrioventricular septal defect 4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Congenital muscular dystrophy-dystroglycanopathy with brain and eye anomalies type A5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Dementia, Deafness, and Sensory Neuropathy",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adult hypophosphatasia,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [82]:
filename = '~/./Documents/Harmonizome/ClinVar/Output/clinvar_attribute_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [83]:
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  1952 Out of 1952   

In [84]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,ATP2C1,27032
1,PRKCA,5578
2,MERTK,10461
3,RFXANK,8625
4,NTRK1,4914


In [85]:
gene_list.shape

(1952, 2)

# Save Gene List

In [86]:
filename = '~/./Documents/Harmonizome/ClinVar/Output/clinvar_gene_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [87]:
attribute_list = uf.createAttributeList(binary_matrix)

In [88]:
attribute_list.head()

Unnamed: 0,Attributes
0,"Mental retardation, autosomal recessive 13"
1,Atrioventricular septal defect 4
2,Congenital muscular dystrophy-dystroglycanopat...
3,"Dementia, Deafness, and Sensory Neuropathy"
4,Adult hypophosphatasia


In [89]:
attribute_list.shape

(2934, 1)

# Save Attribute List

In [90]:
filename = '~/./Documents/Harmonizome/ClinVar/Output/clinvar_attribute_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [91]:
path = '/Users/moshesilverstein/Documents/Harmonizome/ClinVar/Output/'

In [92]:
name = 'clinvar_gene_attribute_edge_list'

In [93]:
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  2934 Out of 2934   

 The number of statisticaly relevent gene-attribute associations is: 6663
