ClinVar!

## Initialize

In [1]:
import ndjson
import pandas as pd
import numpy as np

In [2]:
#import logging
#from enum import Enum
import re
#import csv
#from pathlib import Path
#import zipfile

from dotenv import load_dotenv

#from variation.query import QueryHandler

#logging.getLogger("root").setLevel(logging.WARNING)

In [131]:
# Location of the ndjson export analysed by this notebook.
# NOTE(review): absolute, machine-specific path - adjust for your environment.
NDJSON_PATH = '/Users/rsaxs014/Downloads/output-variation_identity.ndjson'

# Read inside a context manager so the file handle is closed as soon as the
# records are parsed (a bare open() leaves it open for the life of the kernel).
with open(NDJSON_PATH) as file:
    records = ndjson.load(file)

# Flatten the nested records into one row per record; nested keys become
# dotted column names such as "in.vrs_xform_plan.policy" or "out.type".
df0 = pd.json_normalize(records)

In [176]:
# Work on a copy so the freshly loaded frame (df0) stays untouched and the
# cells below can be re-run without reloading the file.
df = df0.copy()

## Add Supported Status- based on in.vrs_xform_plan.policy

In [213]:
# Give records without a policy the explicit label "None" (a string, not NaN).
df['in.vrs_xform_plan.policy'] = df['in.vrs_xform_plan.policy'].fillna("None")

In [214]:
# Number of records per policy value.
df["in.vrs_xform_plan.policy"].value_counts()

in.vrs_xform_plan.policy
Canonical SPDI                                      2118669
Absolute copy count                                   53263
Copy number change (cn loss|del and cn gain|dup)      27104
NCBI36 genomic only                                    4771
No hgvs or location info                               3089
Genotype/Haplotype                                     1440
Invalid/unsupported hgvs                               1336
Remaining valid hgvs alleles                            941
Min/max copy count range not supported                   14
Name: count, dtype: int64

TODO: look into using a dictionary to replace below

In [215]:
# Lookup table: policy label -> whether variants under that policy are
# expected to be supported. One entry per policy replaces one df.loc
# assignment per policy.
POLICY_SUPPORT_STATUS = {
    "Canonical SPDI": True,
    "Absolute copy count": True,
    "Copy number change (cn loss|del and cn gain|dup)": True,
    "NCBI36 genomic only": False,
    "No hgvs or location info": False,
    "Genotype/Haplotype": False,
    "Invalid/unsupported hgvs": False,
    "Remaining valid hgvs alleles": True,
    "Min/max copy count range not supported": False,
}

# Labels missing from the table are passed through unchanged, so an
# unexpected policy stays visible in value_counts() instead of silently
# turning into True/False or NaN.
df["support_status"] = df["in.vrs_xform_plan.policy"].map(
    lambda policy: POLICY_SUPPORT_STATUS.get(policy, policy)
)


In [216]:
# Tally of supported (True) vs. not supported (False) records.
df['support_status'].value_counts()

support_status
True     2199977
False      10650
Name: count, dtype: int64

## Add Normalization Status- based on out.errors

The errors are stored as a list of values, some of which are strings and others of which are dictionaries (determined by whether the error was handled at the level of the Variation Normalizer or after the normalizer).

The "get_errors" function extracts the text of the error responses for better readability and easier string processing.

In [304]:
def get_errors(errors):
    """Flatten one record's error entries into a single ';'-separated string.

    Parameters
    ----------
    errors : list, str, None or NaN
        The error entries of one record. Each entry is either a plain string
        or a dict. From a dict only the values stored under ``'msg'`` and
        ``'response-errors'`` are used (a string is taken as is, a list is
        joined with ';'); every other key is ignored. Entries of any other
        type are skipped. ``None``, NaN and ``''`` all mean "no errors".

    Returns
    -------
    str
        All extracted messages joined with ';', or ``''`` when there are none.
    """
    # Missing values (None, or the float NaN pandas uses) carry no errors.
    if errors is None or (isinstance(errors, float) and errors != errors):
        return ''
    # A bare string is a single message; iterating it would split it into
    # individual characters.
    if isinstance(errors, str):
        return errors

    wanted_keys = ('msg', 'response-errors')  # only these keys hold message text
    messages = []
    for entry in errors:
        if isinstance(entry, str):
            messages.append(entry)
        elif isinstance(entry, dict):
            for key, value in entry.items():
                if key not in wanted_keys:
                    continue
                if isinstance(value, str):
                    messages.append(value)
                elif isinstance(value, list):
                    # str() keeps the join from failing on non-string items.
                    messages.append(';'.join(str(item) for item in value))
    return ';'.join(messages)

In [305]:
# Records without errors are NaN in 'out.errors'; fillna('') hands get_errors
# an empty string, which it iterates as "no entries" and returns ''.
df['error_string'] = df['out.errors'].fillna('').apply(get_errors)

This is the number of unique error strings

There are many different strings because many of the errors contain specific genomic coordinates, which are unlikely to occur more than once

In [306]:
# Number of distinct error strings.
df['error_string'].nunique()

61185

To get the core error message, the numeric values are replaced with "#"

In [307]:
def reduce_errors(error_string):
    """Collapse an error message to its core text.

    The message is lower-cased and every run of digits is replaced by a
    single '#', so messages that differ only in numbers (for example
    coordinates) reduce to the same string.

    Parameters
    ----------
    error_string : str
        The error text to reduce.

    Returns
    -------
    str
        The lower-cased text with each digit run replaced by '#'.
    """
    # Raw string: '\d' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.sub(r'\d+', '#', error_string.lower())

In [308]:
# Reduced form of every error string: lower-cased, digit runs replaced by '#'.
df['error_string_reduce'] = df['error_string'].apply(reduce_errors)

In [309]:
# Frequency of each reduced error message (the empty string means no error).
df['error_string_reduce'].value_counts()

error_string_reduce
                                                                                                                                                       2141287
variation record was too long to normalize (#), treating as text                                                                                         61464
error returned from variation normalizer;position, #, does not exist on nc_#.#                                                                            5832
error returned from variation normalizer;copy_number_count mode requires `baseline_copies`                                                                1562
unrecognized variation record                                                                                                                              431
error returned from variation normalizer;unable to tokenize g.(#_?)_(?_#)del                                                                                16
error returned from variat

There are Not Supported variants that have no error because they were labeled "Not Supported" manually.

An error ("Not Supported") is entered manually for those variants so that they are not categorized as normalized

In [310]:
# Unsupported records that carry no error text get the placeholder
# "Not Supported", so they are not later mistaken for error-free records.
unsupported_mask = df["support_status"].eq(False)
blank_error_mask = df["error_string_reduce"].eq('')
df.loc[unsupported_mask & blank_error_mask, "error_string_reduce"] = "Not Supported"

Variants with no recorded error have an "error_string_reduce" value equal to the empty string rather than NaN, so the empty string is replaced with NaN.

In [311]:
# Blank strings mean "no error recorded"; turn them into NaN so .isna() can
# flag them. The column has already been through reduce_errors, so applying
# it again row by row is a slow pass whose only remaining effect is
# lower-casing the manually entered "Not Supported" label. The vectorized
# .str.lower() yields the same values, and also tolerates NaN, so this cell
# can be re-run safely.
df['error_string_reduce'] = df['error_string_reduce'].str.lower().replace('', np.nan)

KeyboardInterrupt: 

In [None]:
# Frequency of each reduced error message; NaN (no error) rows are left out
# because value_counts() drops missing values by default.
df['error_string_reduce'].value_counts()

normalize_status
variation record was too long to normalize (#), treating as text                                                                                       61464
not supported                                                                                                                                          10200
error returned from variation normalizer;position, #, does not exist on nc_#.#                                                                          5832
error returned from variation normalizer;copy_number_count mode requires `baseline_copies`                                                              1562
unrecognized variation record                                                                                                                            431
error returned from variation normalizer;unable to tokenize g.(#_?)_(?_#)del                                                                              16
error returned from variation normalizer;

### Mark all empty errors as TRUE (for normalize status) and all variants with an error present as FALSE

In [None]:
# True where no reduced error string is present (NaN), False where one is.
df["normalize_status"] =  df["error_string_reduce"].isna()
df

Unnamed: 0,in.id,in.name,in.subclass_type,in.variation_type,in.xrefs,in.vrs_xform_plan.type,in.vrs_xform_plan.inputs,in.vrs_xform_plan.policy,out.id,out.type,...,out.state.seq_expr.location.start.value,out.state.seq_expr.location.end.type,out.state.seq_expr.location.end.value,out.state.seq_expr.reverse_complement,out.state.count.type,out.state.count.value,support_status,error_string,error_string_reduce,normalize_status
0,16098,"SLC2A2, 1-BP INS, 793C",SimpleAllele,Insertion,OMIM:138160.0009,Text,[id],No hgvs or location info,Text:clinvar:16098,Text,...,,,,,,,False,,,False
1,425693,NM_001204.6(BMPR2):c.77-?_247+?dup,SimpleAllele,Duplication,,Text,[id],No hgvs or location info,Text:clinvar:425693,Text,...,,,,,,,False,,,False
2,90650,NM_000251.2(MSH2):c.1387-?_1510+?del,SimpleAllele,Deletion,,Text,[id],No hgvs or location info,Text:clinvar:90650,Text,...,,,,,,,False,,,False
3,2446408,"CDHR1, 783G-A (rs147346345)",SimpleAllele,single nucleotide variant,OMIM:609502.0005,Text,[id],No hgvs or location info,Text:clinvar:2446408,Text,...,,,,,,,False,,,False
4,14905,"HLA-DRB1, HLA-DRB1*1101",SimpleAllele,Variation,OMIM:142857.0001,Text,[id],No hgvs or location info,Text:clinvar:14905,Text,...,,,,,,,False,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2210622,1464440,NM_001267550.2(TTN):c.29512_29513insGGCCGGGCGC...,SimpleAllele,Insertion,,Text,[id],Invalid/unsupported hgvs,Text:clinvar:1464440,Text,...,,,,,,,False,,,False
2210623,1453116,NM_000051.4(ATM):c.3376_3377insGGCCGGGCGCGGTGG...,SimpleAllele,Insertion,,Text,[id],Invalid/unsupported hgvs,Text:clinvar:1453116,Text,...,,,,,,,False,,,False
2210624,1181160,NM_030962.4(SBF2):c.55+97_55+98insCGGGCGTCGGGGC,SimpleAllele,Microsatellite,,Allele,[canonical_spdi],Canonical SPDI,ga4gh:VA.x2E2GJQA5ovZFJ1SKxg3oHzkUXv5ZGfv,Allele,...,,,,,,,True,,,True
2210625,2202105,NM_153676.4(USH1C):c.496+14_496+15insGTACTCCAT...,SimpleAllele,Microsatellite,,Allele,[canonical_spdi],Canonical SPDI,ga4gh:VA.5jQEJZIbf1ttjbIGJVhfrHR9XcB7vqWD,Allele,...,,,,,,,True,,,True


In [None]:
# Number of records per input variation type.
df["in.variation_type"].value_counts()

in.variation_type
single nucleotide variant    1935386
Deletion                      110936
Duplication                    51808
copy number loss               30643
copy number gain               29863
Microsatellite                 27160
Indel                          11504
Insertion                       9745
Inversion                       1143
Diplotype                        596
Haplotype                        565
Variation                        547
Translocation                    273
CompoundHeterozygote             249
protein only                      95
Complex                           77
Haplotype, single variant         21
Phase unknown                      8
fusion                             6
Distinct chromosomes               1
Tandem duplication                 1
Name: count, dtype: int64

In [None]:
# Keep only the records whose output type is 'Text'.
is_text_output = df['out.type'].eq('Text')
df_text = df.loc[is_text_output]

In [None]:
# normalize_status is a boolean column, so comparing it with the string "NaN"
# can never match and the selection is empty whatever the data holds.
# NOTE(review): presumably the intent was to list Text outputs that have no
# recorded error (error_string_reduce is NaN) - confirm.
df_text.loc[df_text['error_string_reduce'].isna()]

Unnamed: 0,in.id,in.name,in.subclass_type,in.variation_type,in.xrefs,in.vrs_xform_plan.type,in.vrs_xform_plan.inputs,in.vrs_xform_plan.policy,out.id,out.type,...,out.state.seq_expr.location.start.value,out.state.seq_expr.location.end.type,out.state.seq_expr.location.end.value,out.state.seq_expr.reverse_complement,out.state.count.type,out.state.count.value,support_status,error_string,error_string_reduce,normalize_status


##### The cells are the number of variants with each expected behavior and how they actually ended up performing. 
##### So if a variant was in an "expected to pass" category and ends up as text, that is an instance of a normalizer failure on a supported variant

In [None]:
# Count records for every (support_status, policy) pair, with one column per
# output type. Missing values are labelled 'NONE' before grouping so they get
# their own column; combinations that never occur are shown as 0.
group_keys = ['support_status', 'in.vrs_xform_plan.policy', 'out.type']
(
    df[['in.id'] + group_keys]
    .fillna('NONE')
    .groupby(group_keys)
    .count()
    .unstack(level=2)
    .fillna(0)
    .astype(int)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,in.id,in.id,in.id,in.id,in.id
Unnamed: 0_level_1,out.type,Allele,CopyNumberChange,CopyNumberCount,NONE,Text
support_status,in.vrs_xform_plan.policy,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
False,Genotype/Haplotype,0,0,0,0,1440
False,Invalid/unsupported hgvs,0,0,0,5,1331
False,Min/max copy count range not supported,0,0,0,0,14
False,NCBI36 genomic only,0,0,0,0,4771
False,No hgvs or location info,0,0,0,0,3089
True,Absolute copy count,0,0,4880,1573,46810
True,Canonical SPDI,2112837,0,0,5832,0
True,Copy number change (cn loss|del and cn gain|dup),0,12424,0,31,14649
True,Remaining valid hgvs alleles,936,0,0,4,1


## Create groups of variants based on Supported and Normalized Status

In [None]:
# Start from a full copy of df; the next cell narrows it down to the
# supported-and-normalized records.
supported_df= df.copy()

In [None]:
# Group 1: supported records that have no recorded error.
is_supported = supported_df['support_status'] == True
is_normalized = supported_df['normalize_status'] == True
supported_df = supported_df.loc[is_supported & is_normalized]
supported_df

Unnamed: 0,in.id,in.name,in.subclass_type,in.variation_type,in.xrefs,in.vrs_xform_plan.type,in.vrs_xform_plan.inputs,in.vrs_xform_plan.policy,out.id,out.type,...,out.state.seq_expr.location.start.value,out.state.seq_expr.location.end.type,out.state.seq_expr.location.end.value,out.state.seq_expr.reverse_complement,out.state.count.type,out.state.count.value,support_status,error_string,error_string_reduce,normalize_status
149,1676330,NM_054027.6(ANKH):c.259G>A (p.Val87Ile),SimpleAllele,single nucleotide variant,,Allele,[hgvs],Remaining valid hgvs alleles,ga4gh:VA.T3hLVZajyx7AGQKV7la2RgE9CO0Wiv2y,Allele,...,,,,,,,True,,,True
150,1676476,NM_012309.5(SHANK2):c.460C>T (p.Gln154Ter),SimpleAllele,single nucleotide variant,,Allele,[hgvs],Remaining valid hgvs alleles,ga4gh:VA.dZAOK2Sy6aJlXan36nZYUjEJpk5WX8pF,Allele,...,,,,,,,True,,,True
151,1676638,NM_000094.4(COL7A1):c.8729G>T (p.Gly2910Val),SimpleAllele,single nucleotide variant,,Allele,[hgvs],Remaining valid hgvs alleles,ga4gh:VA.aUAN4LCHVZ_CPwAQfkVuX42g_Gi6WlBu,Allele,...,,,,,,,True,,,True
152,1676567,NM_005445.4(SMC3):c.1343dup (p.Glu449fs),SimpleAllele,Duplication,,CopyNumberChange,[hgvs],Copy number change (cn loss|del and cn gain|dup),ga4gh:CX.-9T00EYMyHQKC7DVOenHi-QH8aIOcc0D,CopyNumberChange,...,,,,,,,True,,,True
153,1325687,NM_000059.4(BRCA2):c.3692C>G (p.Thr1231Ser),SimpleAllele,single nucleotide variant,,Allele,[hgvs],Remaining valid hgvs alleles,ga4gh:VA.MLNaXFddWSZRL0vRCHJceHBF09R_JgBt,Allele,...,,,,,,,True,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2210620,1496502,NM_007255.3(B4GALT7):c.881_882insTGAGGTGGATTAA...,SimpleAllele,Insertion,,Allele,[canonical_spdi],Canonical SPDI,ga4gh:VA.P-rdDkBZMl4kwW-ZHmsMOy0ik5MKa4LI,Allele,...,,,,,,,True,,,True
2210621,715720,NM_033026.6(PCLO):c.3300+7_3300+8insTTTATATATA...,SimpleAllele,Insertion,dbSNP:746054139,Allele,[canonical_spdi],Canonical SPDI,ga4gh:VA._iakrjaN6VkKr1AGuegIUY4vKZxnQVkT,Allele,...,,,,,,,True,,,True
2210624,1181160,NM_030962.4(SBF2):c.55+97_55+98insCGGGCGTCGGGGC,SimpleAllele,Microsatellite,,Allele,[canonical_spdi],Canonical SPDI,ga4gh:VA.x2E2GJQA5ovZFJ1SKxg3oHzkUXv5ZGfv,Allele,...,,,,,,,True,,,True
2210625,2202105,NM_153676.4(USH1C):c.496+14_496+15insGTACTCCAT...,SimpleAllele,Microsatellite,,Allele,[canonical_spdi],Canonical SPDI,ga4gh:VA.5jQEJZIbf1ttjbIGJVhfrHR9XcB7vqWD,Allele,...,,,,,,,True,,,True


In [None]:
# Check: after filtering, only support_status == True rows should remain.
supported_df['support_status'].value_counts()

support_status
True    2131087
Name: count, dtype: int64

In [None]:
# Start from a full copy of df; the next cell narrows it down to the
# supported records that have an error.
supported_not_normalized_df =  df.copy()

In [None]:
# Group 2: supported records that do have a recorded error.
expected_to_pass = supported_not_normalized_df['support_status'] == True
has_error = supported_not_normalized_df['normalize_status'] == False
supported_not_normalized_df = supported_not_normalized_df.loc[expected_to_pass & has_error]
supported_not_normalized_df

Unnamed: 0,in.id,in.name,in.subclass_type,in.variation_type,in.xrefs,in.vrs_xform_plan.type,in.vrs_xform_plan.inputs,in.vrs_xform_plan.policy,out.id,out.type,...,out.state.seq_expr.location.start.value,out.state.seq_expr.location.end.type,out.state.seq_expr.location.end.value,out.state.seq_expr.reverse_complement,out.state.count.type,out.state.count.value,support_status,error_string,error_string_reduce,normalize_status
166,1340674,GRCh37/hg19 7q31.33-32.1(chr7:127050634-127826...,SimpleAllele,copy number gain,,CopyNumberCount,"[hgvs, absolute_copies]",Absolute copy count,Text:clinvar:1340674,Text,...,,,,,,,True,Variation record was too long to normalize (77...,variation record was too long to normalize (#)...,False
169,1340493,GRCh37/hg19 12p12.3(chr12:17595624-18236175)x3,SimpleAllele,copy number gain,,CopyNumberCount,"[hgvs, absolute_copies]",Absolute copy count,Text:clinvar:1340493,Text,...,,,,,,,True,Variation record was too long to normalize (64...,variation record was too long to normalize (#)...,False
171,814630,GRCh37/hg19 4q33-34.1(chr4:171505226-173149981)x3,SimpleAllele,copy number gain,,CopyNumberCount,"[hgvs, absolute_copies]",Absolute copy count,Text:clinvar:814630,Text,...,,,,,,,True,Variation record was too long to normalize (16...,variation record was too long to normalize (#)...,False
172,1808660,GRCh37/hg19 2p16.3(chr2:51009232-51492867)x1,SimpleAllele,copy number loss,,CopyNumberCount,"[hgvs, absolute_copies]",Absolute copy count,Text:clinvar:1808660,Text,...,,,,,,,True,Variation record was too long to normalize (48...,variation record was too long to normalize (#)...,False
173,980505,GRCh37/hg19 3p14.2(chr3:62894703-63320384)x3,SimpleAllele,copy number gain,,CopyNumberCount,"[hgvs, absolute_copies]",Absolute copy count,Text:clinvar:980505,Text,...,,,,,,,True,Variation record was too long to normalize (42...,variation record was too long to normalize (#)...,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2210611,591342,NM_001377.3(DYNC2H1):c.8243_8244insCTAATTCTTA ...,SimpleAllele,Insertion,dbSNP:1565393163,Allele,[canonical_spdi],Canonical SPDI,591342,,...,,,,,,,True,Error returned from variation normalizer;Posit...,error returned from variation normalizer;posit...,False
2210612,1235939,NM_002693.3(POLG):c.151_152insCAGCAG (p.Gln51d...,SimpleAllele,Insertion,,Allele,[canonical_spdi],Canonical SPDI,1235939,,...,,,,,,,True,Error returned from variation normalizer;Posit...,error returned from variation normalizer;posit...,False
2210615,1801441,NM_015189.3(EXOC6B):c.2197-66917_2197-66916ins...,SimpleAllele,Insertion,,Allele,[canonical_spdi],Canonical SPDI,1801441,,...,,,,,,,True,Error returned from variation normalizer;Posit...,error returned from variation normalizer;posit...,False
2210616,491194,NM_000059.4(BRCA2):c.9501+7_9501+8insAGGTAAGGT...,SimpleAllele,Insertion,"ClinGen:CA658683823,dbSNP:1555289623",Allele,[canonical_spdi],Canonical SPDI,491194,,...,,,,,,,True,Error returned from variation normalizer;Posit...,error returned from variation normalizer;posit...,False


In [None]:
# Start from a full copy of df; the next cell narrows it down to the
# not-supported records.
not_supported_df =  df.copy()

In [None]:
# Group 3: records that are not supported and are flagged as not normalized.
flagged_unsupported = not_supported_df['support_status'] == False
flagged_not_normalized = not_supported_df['normalize_status'] == False
not_supported_df = not_supported_df.loc[flagged_unsupported & flagged_not_normalized]
not_supported_df

Unnamed: 0,in.id,in.name,in.subclass_type,in.variation_type,in.xrefs,in.vrs_xform_plan.type,in.vrs_xform_plan.inputs,in.vrs_xform_plan.policy,out.id,out.type,...,out.state.seq_expr.location.start.value,out.state.seq_expr.location.end.type,out.state.seq_expr.location.end.value,out.state.seq_expr.reverse_complement,out.state.count.type,out.state.count.value,support_status,error_string,error_string_reduce,normalize_status
0,16098,"SLC2A2, 1-BP INS, 793C",SimpleAllele,Insertion,OMIM:138160.0009,Text,[id],No hgvs or location info,Text:clinvar:16098,Text,...,,,,,,,False,,,False
1,425693,NM_001204.6(BMPR2):c.77-?_247+?dup,SimpleAllele,Duplication,,Text,[id],No hgvs or location info,Text:clinvar:425693,Text,...,,,,,,,False,,,False
2,90650,NM_000251.2(MSH2):c.1387-?_1510+?del,SimpleAllele,Deletion,,Text,[id],No hgvs or location info,Text:clinvar:90650,Text,...,,,,,,,False,,,False
3,2446408,"CDHR1, 783G-A (rs147346345)",SimpleAllele,single nucleotide variant,OMIM:609502.0005,Text,[id],No hgvs or location info,Text:clinvar:2446408,Text,...,,,,,,,False,,,False
4,14905,"HLA-DRB1, HLA-DRB1*1101",SimpleAllele,Variation,OMIM:142857.0001,Text,[id],No hgvs or location info,Text:clinvar:14905,Text,...,,,,,,,False,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2199348,1418992,NM_002076.4(GNS):c.841_842insTTTTTTTTTTTTTTTTT...,SimpleAllele,Insertion,,Text,[id],Invalid/unsupported hgvs,Text:clinvar:1418992,Text,...,,,,,,,False,,,False
2199349,2134754,NM_152564.5(VPS13B):c.5627_5628insTTTTTTTTTTTT...,SimpleAllele,Insertion,,Text,[id],Invalid/unsupported hgvs,Text:clinvar:2134754,Text,...,,,,,,,False,,,False
2210618,1513408,NM_024928.5(STN1):c.340_352AAG[2]CTACAAGGCCGGG...,SimpleAllele,Insertion,,Text,[id],Invalid/unsupported hgvs,Text:clinvar:1513408,Text,...,,,,,,,False,,,False
2210622,1464440,NM_001267550.2(TTN):c.29512_29513insGGCCGGGCGC...,SimpleAllele,Insertion,,Text,[id],Invalid/unsupported hgvs,Text:clinvar:1464440,Text,...,,,,,,,False,,,False


Sanity check: making sure there are no unsupported variants that have been marked as normalized

In [None]:
# Start from a full copy of df; the next cell narrows it down to records
# that are not supported yet flagged as normalized.
not_supported_but_normalized_df =  df.copy()

In [None]:
# Group 4: records that are not supported but carry no recorded error.
lacks_support = not_supported_but_normalized_df['support_status'] == False
lacks_error = not_supported_but_normalized_df['normalize_status'] == True
not_supported_but_normalized_df = not_supported_but_normalized_df.loc[lacks_support & lacks_error]
not_supported_but_normalized_df

Unnamed: 0,in.id,in.name,in.subclass_type,in.variation_type,in.xrefs,in.vrs_xform_plan.type,in.vrs_xform_plan.inputs,in.vrs_xform_plan.policy,out.id,out.type,...,out.state.seq_expr.location.start.value,out.state.seq_expr.location.end.type,out.state.seq_expr.location.end.value,out.state.seq_expr.reverse_complement,out.state.count.type,out.state.count.value,support_status,error_string,error_string_reduce,normalize_status


In [None]:
# Row count of each of the four groups built above.
num_supported = supported_df.shape[0]
num_supported_not_normalized = supported_not_normalized_df.shape[0]
num_not_supported_but_normalized = not_supported_but_normalized_df.shape[0]
num_not_supported = not_supported_df.shape[0]

In [None]:
# One count per line, in the order: supported, supported but not normalized,
# not supported but normalized, not supported.
print(
    num_supported,
    num_supported_not_normalized,
    num_not_supported_but_normalized,
    num_not_supported,
    sep='\n',
)

2131087
68890
0
10650
