In [1]:
# Load environment variables. This MUST be the first cell.
from dotenv import load_dotenv

load_dotenv("../../.env.shared")

True

In [2]:
import gzip
import numbers
import os
import re
import sys

import ndjson
import pandas as pd
from tqdm.notebook import tqdm

# Put the parent of the current working directory on the import path so that the
# shared `utils` module imported below can be resolved.
analysis_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the append so that re-running this cell does not add duplicate sys.path entries.
if analysis_dir not in sys.path:
    sys.path.append(analysis_dir)

from utils import get_errors  # noqa: E402

In [3]:
pd.set_option("display.max_columns", 200)

In [4]:
### Load the normalizer output for ClinVar variants, located in the "clinvar" directory of this repo
### (a gzip-compressed file that is read with ndjson into the `records` list)

clinvar_jsonl_path = "../clinvar/vi-normalized-with-liftover.jsonl.gz"
with gzip.open(clinvar_jsonl_path, "rb") as clinvar_handle:
    records = ndjson.load(clinvar_handle)

In [5]:
len(records)

3757467

In [6]:
## Read the ClinVar normalizer responses into a pandas dataframe for analysis.
## The records are flattened batch by batch so that tqdm can report progress.

batch_size = 100000
# Ceiling division: avoids a trailing empty batch when len(records) is an exact
# multiple of batch_size. The max(1, ...) keeps one (empty) batch for empty input
# so that pd.concat always receives at least one frame.
n_batches = max(1, -(-len(records) // batch_size))

df0 = pd.concat(
    [
        pd.json_normalize(records[k * batch_size : (k + 1) * batch_size])
        for k in tqdm(range(n_batches))
    ],
    # Every batch frame is indexed from 0, so without this the combined frame would
    # carry repeated index labels; ignore_index assigns unique labels 0..n-1.
    ignore_index=True,
)

  0%|          | 0/38 [00:00<?, ?it/s]

In [7]:
### re-run to re-initialize the ClinVar dataframe without re-running the above cell to read in the full dataframe
df = df0.copy()

#  Identify CNVs in ClinVar

In [8]:
### different predicted behavior of variants based on variant type/what information provided by ClinVar
df["in.vrs_class"].value_counts()

in.vrs_class
Allele              3675021
CopyNumberCount       44901
CopyNumberChange      34074
Unknown                1713
Not Available          1112
Haplotype               646
Name: count, dtype: int64

### Restrict to copy number variants. 
ClinVar variant types were further specified and identified in the ```in.vrs_class``` field before running the normalizer on ClinVar. 
For our analysis, CNVs are variants with ```in.vrs_class``` equal to one of  
* ```CopyNumberCount```
* ```CopyNumberChange```

In [9]:
# Keep only copy number variants: rows whose `in.vrs_class` is exactly one of the two
# CNV classes. An exact membership test (instead of a substring match on "copy")
# follows the stated rule and evaluates to False, not NaN, for missing values.
CNV_VRS_CLASSES = ["CopyNumberCount", "CopyNumberChange"]
df = df[df["in.vrs_class"].isin(CNV_VRS_CLASSES)].copy()

In [10]:
len(df)

78975

### Remove variants which failed to normalize

In [11]:
### Errors are stored as a list of values, some of which are strings and others dictionaries (determined by whether the error was handled at the level of the Variation Normalizer or after the normalizer).
### `get_errors` (imported from utils) extracts the text error responses for better readability and ease of string processing.
### To get the core error message, `reduce_errors` below simply strips all the numeric values out and replaces them with "#".
def reduce_errors(error_string: str) -> str:
    """Collapse an error message to its core text.

    The message is lowercased and every run of digits is replaced by a single "#",
    so messages that differ only in their numeric values reduce to the same string.

    :param error_string: The error message text
    :return: The lowercased message with each run of digits replaced by "#"
    """
    return re.sub(r"\d+", "#", error_string.lower())

In [12]:
### Build the error columns: `get_errors` is applied to each `out.errors` value (missing
### values are first replaced by an empty string), then `reduce_errors` masks the digits
errors_filled = df["out.errors"].fillna("")
df["error_string"] = errors_filled.apply(get_errors)
df["error_string_reduce"] = df["error_string"].apply(reduce_errors)

### Remove variants which failed to normalize 
Due to
* liftover error
* classification error
* tokenization error
* not supported by normalizer

In [13]:
# Keep only the rows whose reduced error string is empty.
df = df.loc[df["error_string_reduce"].eq("")].copy()

In [14]:
len(df)

71730

# Calculate start/stop positions to compare to NCH variants
### What start/stop position fields are available
* ```out.location.start``` is the normalized start value when the start is specified (similarly for ```out.location.end```)
* If the start is a range, we extract values from the range (similarly for ```end```)

### Our policy for choosing start/stop positions: Prioritize the absolute start and stop position (if known), otherwise inner, then outer 
* choose the specified normalized start value when available; otherwise the inner start (i.e. the max start); otherwise the outer start (i.e. the min start)
* similarly, choose the normalized end value when available; otherwise the inner stop (i.e. the min stop); otherwise the outer stop (i.e. the max stop)

In [15]:
def get_location_interval_endpoint(
    loc_interval: int | list, 
    endpoint=max #max or min
) -> int:
    """Extract the location interval endpoint from a provided location
    
    :param loc_interval: An integer or list representing the interval
    :param endpoint: A callable describing if the max or min from the interval
        should be used
    :return: An integer representing the location interval endpoint
    """
    if type(loc_interval) is int:
        return loc_interval
    if type(loc_interval) is list:
        return endpoint(set(loc_interval) - {None})

In [16]:
# Start: the larger known bound of a start range is the inner start, so select with max.
df["start_38"] = df["out.location.start"].apply(
    get_location_interval_endpoint, endpoint=max
)

# Stop: the smaller known bound of an end range is the inner stop, so select with min.
df["stop_38"] = df["out.location.end"].apply(
    get_location_interval_endpoint, endpoint=min
)

In [17]:
### Sanity check: every variant should have both a start and a stop position after applying
### our chosen preference rule, so this selection is expected to be empty

location_check_columns = [
    "in.assembly_version",
    "out.location.start",
    "out.location.end",
    "in.range_copies",
    "in.absolute_copies",
    "out.copies",
]
missing_position = df[["start_38", "stop_38"]].isna().any(axis=1)
df.loc[missing_position, location_check_columns]

Unnamed: 0,in.assembly_version,out.location.start,out.location.end,in.range_copies,in.absolute_copies,out.copies


### Restrict to CNVs with calculated location information

In [18]:
df.head()

Unnamed: 0,out,in.variation_id,in.name,in.vrs_class,in.range_copies,in.issue,in.variation_type,in.subclass_type,in.cytogenetic,in.mappings,in.accession,in.fmt,in.source,in.precedence,out.errors,in.copy_change_type,in.assembly_version,in.chr,in.variant_length,out.id,out.type,out.digest,out.location.id,out.location.type,out.location.digest,out.location.sequenceReference.type,out.location.sequenceReference.refgetAccession,out.location.start,out.location.end,out.copyChange,out.state.type,out.state.length,out.state.sequence,out.state.repeatSubunitLength,out.extensions,in.absolute_copies,out.copies,error_string,error_string_reduce,start_38,stop_38
30,,3247636,NC_000001.10:g.(?_100381624)_(100382277_?)del,CopyNumberChange,[],,Deletion,SimpleAllele,1p21.2,[],NC_000001.10,hgvs,NC_000001.10:g.(?_100381624)_(100382277_?)del,2,,loss,37,1,654,ga4gh:CX.ZuUeYk-wKkSruFZ3e2dh_UXdF_Z26bS7,CopyNumberChange,ZuUeYk-wKkSruFZ3e2dh_UXdF_Z26bS7,ga4gh:SL.cy-yXgOqImqYmkZASlKTDG-UKBs0B2mW,SequenceLocation,cy-yXgOqImqYmkZASlKTDG-UKBs0B2mW,SequenceReference,SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO,"[None, 99916067]","[99916721, None]",loss,,,,,,,,,,99916067,99916721
32,,2425513,NC_000001.10:g.(?_153963273)_(154580482_?)del,CopyNumberChange,[],,Deletion,SimpleAllele,1q21.3,[],NC_000001.10,hgvs,NC_000001.10:g.(?_153963273)_(154580482_?)del,2,,loss,37,1,617210,ga4gh:CX.mmX_J3BieQI7iYl1G3bfQpvZvx-AMusn,CopyNumberChange,mmX_J3BieQI7iYl1G3bfQpvZvx-AMusn,ga4gh:SL.PlrS9sDMxKG1Z9YVW9eRrWmT_cFXU7f_,SequenceLocation,PlrS9sDMxKG1Z9YVW9eRrWmT_cFXU7f_,SequenceReference,SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO,"[None, 153990796]","[154608006, None]",loss,,,,,,,,,,153990796,154608006
33,,1067607,NC_000001.10:g.(?_156104586)_(156108907_?)del,CopyNumberChange,[],,Deletion,SimpleAllele,1q22,[],NC_000001.10,hgvs,NC_000001.10:g.(?_156104586)_(156108907_?)del,2,,loss,37,1,4322,ga4gh:CX.8F4hrGUG68YFpcJCW2Aj-iKpxwbKXbG2,CopyNumberChange,8F4hrGUG68YFpcJCW2Aj-iKpxwbKXbG2,ga4gh:SL.AAxRpLgK_6iqDk7i4_XNWRxh04YhVIij,SequenceLocation,AAxRpLgK_6iqDk7i4_XNWRxh04YhVIij,SequenceReference,SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO,"[None, 156134794]","[156139116, None]",loss,,,,,,,,,,156134794,156139116
34,,3247983,NC_000001.10:g.(?_169446833)_(169454047_?)del,CopyNumberChange,[],,Deletion,SimpleAllele,1q24.2,[],NC_000001.10,hgvs,NC_000001.10:g.(?_169446833)_(169454047_?)del,2,,loss,37,1,7215,ga4gh:CX.soEykkuwN3ddRcRc2N4G7VvKCJqym8w9,CopyNumberChange,soEykkuwN3ddRcRc2N4G7VvKCJqym8w9,ga4gh:SL.E-GzdBhCBCXrzEOH2Jh9A5zhKdlr0hKa,SequenceLocation,E-GzdBhCBCXrzEOH2Jh9A5zhKdlr0hKa,SequenceReference,SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO,"[None, 169477594]","[169484809, None]",loss,,,,,,,,,,169477594,169484809
35,,3247768,NC_000001.10:g.(?_183538257)_(183543776_?)del,CopyNumberChange,[],,Deletion,SimpleAllele,1q25.3,[],NC_000001.10,hgvs,NC_000001.10:g.(?_183538257)_(183543776_?)del,2,,loss,37,1,5520,ga4gh:CX.wgUo3HPmYiknojW5Hx6w_b-wZKquqOGB,CopyNumberChange,wgUo3HPmYiknojW5Hx6w_b-wZKquqOGB,ga4gh:SL.MAXLYdyIQHFsrwpEuP4pJOo7wwi0PDOu,SequenceLocation,MAXLYdyIQHFsrwpEuP4pJOo7wwi0PDOu,SequenceReference,SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO,"[None, 183569121]","[183574641, None]",loss,,,,,,,,,,183569121,183574641


In [19]:
## Keep only the fields needed for export and downstream analyses, then shorten each column
## name to its last dot-separated part (dropping the "in." / "out." prefix)

export_columns = [
    "in.variation_id",
    "in.name",
    "in.variation_type",
    "in.assembly_version",
    "in.chr",
    "in.cytogenetic",
    "start_38",
    "stop_38",
    "in.range_copies",
    "in.absolute_copies",
    "out.copies",
]
df = df[export_columns]
df = df.rename(columns=lambda col: col.rsplit(".", 1)[-1])
df

Unnamed: 0,variation_id,name,variation_type,assembly_version,chr,cytogenetic,start_38,stop_38,range_copies,absolute_copies,copies
30,3247636,NC_000001.10:g.(?_100381624)_(100382277_?)del,Deletion,37,1,1p21.2,99916067,99916721,[],,
32,2425513,NC_000001.10:g.(?_153963273)_(154580482_?)del,Deletion,37,1,1q21.3,153990796,154608006,[],,
33,1067607,NC_000001.10:g.(?_156104586)_(156108907_?)del,Deletion,37,1,1q22,156134794,156139116,[],,
34,3247983,NC_000001.10:g.(?_169446833)_(169454047_?)del,Deletion,37,1,1q24.2,169477594,169484809,[],,
35,3247768,NC_000001.10:g.(?_183538257)_(183543776_?)del,Deletion,37,1,1q25.3,183569121,183574641,[],,
...,...,...,...,...,...,...,...,...,...,...,...
89476,59269,GRCh38/hg38 Xp11.4-11.3(chrX:41823849-44240337)x1,copy number loss,38,X,Xp11.4-11.3,41823848,44240337,[],1,1.0
89477,148019,GRCh38/hg38 Xq13.3-28(chrX:75086417-156022206)x1,copy number loss,38,X,Xq13.3-28,75086416,156022206,[],1,1.0
89478,59236,GRCh38/hg38 Xp22.31-22.2(chrX:9540020-13128124)x1,copy number loss,38,X,Xp22.31-22.2,9540019,13128124,[],1,1.0
89479,2579183,GRCh38/hg38 Xq24(chrX:119582581-119589158)x0,copy number loss,38,X,Xq24,119582580,119589158,[],0,0.0


In [20]:
# Write the cleaned CNV table as a gzip-compressed CSV (row index omitted).
# Create the output directory first so the export also works when it does not exist yet.
os.makedirs("cnv_data", exist_ok=True)
df.to_csv("cnv_data/ClinVar-CNVs-normalized.csv.gzip", index=False, compression="gzip")