In [1]:
# Load environment variables. This MUST be the first cell.
from dotenv import load_dotenv

load_dotenv("../../.env.shared")

True

In [2]:
import json  # noqa: E402
import re  # noqa: E402
import logging

import pandas as pd  # noqa: E402
from tqdm.notebook import tqdm  # noqa: E402
from variation.main import parsed_to_cn_var  # noqa: E402
from variation.schemas.copy_number_schema import ParsedToCnVarQuery  # noqa: E402

# Send log records to a file in the current working directory.
# No `level` is passed, so the root logger keeps its default threshold.
log_filename = "prep_normalize_cnv.log"
log_format = "[%(asctime)s] - %(name)s - %(levelname)s : %(message)s"
logging.basicConfig(format=log_format, filename=log_filename)

  import pkg_resources


***Using Gene Database Endpoint: http://localhost:8000***


In [3]:
# Download cool-seq-tool files. This must be done before initializing QueryHandler
import os 
import sys

# Put the directory two levels up on sys.path (once) so the local `analysis`
# package can be imported from this notebook.
module_path = os.path.abspath("../..")
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)

from analysis.download_cool_seq_tool_files import (  # noqa: E402
    download_cool_seq_tool_files,
)

download_cool_seq_tool_files(is_docker_env=False)

In [4]:
pd.set_option("display.max_columns", 100)

In [5]:
MANUSCRIPT_S3_URL = "https://nch-igm-wagner-lab-public.s3.us-east-2.amazonaws.com/variation-normalizer-manuscript/2025"

In [6]:
# Download NCH microarray CNV data from s3 bucket

from pathlib import Path
import requests

def download_s3(url: str, outfile_path: Path, timeout: float = 60.0) -> None:
    """Download an object from a public s3 bucket and stream it to disk.

    :param url: URL for file in s3 bucket
    :param outfile_path: Path where file should be saved
    :param timeout: Seconds to wait for the server to connect or to send data
        before giving up. Without a timeout, ``requests`` waits indefinitely
        and a stalled connection would hang the notebook.
    :raises requests.HTTPError: If the server answers with an error status
    :raises requests.Timeout: If the server does not respond within ``timeout``
    """
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(outfile_path, "wb") as h:
            # Stream in fixed-size chunks so the whole file is never held in memory.
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    h.write(chunk)
# Local directory that receives the downloaded CNV file (created if missing).
path = "cnv_data"
Path(path).mkdir(exist_ok=True)

# Fetch the NCH microarray CNV file from the manuscript bucket.
url = MANUSCRIPT_S3_URL + "/cnv_data/NCH-microarray-CNVs.csv"
outfile_path = Path(path) / "NCH-microarray-CNVs.csv"
download_s3(url, outfile_path)

In [7]:
### Load NCH microarray CNV data
da = pd.read_csv("cnv_data/NCH-microarray-CNVs.csv")

In [8]:
da.head()

Unnamed: 0,variant,build
0,7q35(146581450_146661741)x3,GRCh37
1,22q11.21q11.23(21798705_23751189)x3,GRCh37
2,18p11.32(146484_408044)x1,GRCh37
3,3q25.1(150353937_150944378)x1,GRCh37
4,16p12.1(27864452_27995317)x3,GRCh37


In [9]:
len(da)

19106

### Remove duplicates

In [10]:
da = da.drop_duplicates("variant")
len(da)

14710

## Compute and check chromosome values

In [11]:
da["chromosome"] = da["variant"].apply(lambda v: re.findall(r"([\dXY]+)[pq]", v)[0])

In [12]:
da["chromosome"].unique()

array(['7', '22', '18', '3', '16', 'X', '2', '17', '1', '20', '5', '6',
       '9', '14', '11', '10', '13', '4', '8', '12', '21', '15', '19', 'Y',
       '1919'], dtype=object)

In [13]:
# clean incorrect values
da["chromosome"] = da["chromosome"].replace("1919", "19")

## Compute start/stop values

In [14]:
# Start coordinate: the first run of digits/commas right after an opening
# parenthesis, with the thousands separators stripped before casting.
start_text = da["variant"].map(
    lambda variant: re.findall(r"\(([\d,]+)", variant)[0].replace(",", "")
)
da["start"] = start_text.astype(int)

In [15]:
# Stop coordinate: the last run of digits/commas directly before a closing
# parenthesis, with the thousands separators stripped before casting.
stop_text = da["variant"].map(
    lambda variant: re.findall(r"([\d,]+)\)", variant)[-1].replace(",", "")
)
da["stop"] = stop_text.astype(int)

### Identify errors in start/stop values

In [16]:
da["var_len"] = da["stop"] - da["start"] + 1

In [17]:
def count_positions(v: str) -> int:
    """Count the numeric positions in the first parenthesized range of an ISCN string.

    A well-formed variant carries exactly two positions (start and stop); any
    other count points to a malformed entry.

    :param v: ISCN nomenclature string, e.g. ``7q35(146581450_146661741)x3``
    :return: Number of digit runs left once commas have been removed
    :raises IndexError: If ``v`` has no parenthesized coordinate range
    """
    first_range = re.findall(r"\([\d,\-\_]+\)", v)[0]
    without_commas = first_range.replace(",", "")
    return sum(1 for _match in re.finditer(r"\d+", without_commas))


# Record how many coordinates each variant string contains so that malformed
# entries can be inspected and dropped in the following cells.
da["#positions"] = da["variant"].apply(count_positions)
# NOTE(review): the helper column is deleted only when every variant has exactly
# ONE position, although the expected count is 2 and later cells filter on
# da["#positions"] unconditionally. Confirm the intended condition.
if list(da["#positions"].unique()) == [1]:
    del da["#positions"]

#### Remove malformed variants: incorrect number of positions
(these all appear to be data entry errors; not all have a clear resolution, so we will drop them from this analysis)

In [18]:
da[(da["#positions"] != 2)]

Unnamed: 0,variant,build,chromosome,start,stop,var_len,#positions
5251,"13q31.3(90,653,47091,471,740)x3",GRCh38,13,9065347091471740,9065347091471740,1,1
5802,"15q21.3(55,499,672,55,593,588)x1",GRCh38,15,5549967255593588,5549967255593588,1,1
5940,"1p36.33p34.3(1,341,185-35,888,147-35,888,147)x...",GRCh38,1,1341185,35888147,34546963,3
12101,"1p33(47,070,474-47,282-269)x3",GRCh37,1,47070474,269,-47070204,3
13757,"1q21.1(144,998,070-146,179-215)x1",GRCh37,1,144998070,215,-144997854,3
15978,"12q13.12(49,370,149-49-678,707)x3",GRCh38,12,49370149,678707,-48691441,3
16842,"20q13.2q13.32(52,096,110-57-389,480)x2 hmz",GRCh38,20,52096110,389480,-51706629,3
17398,"8q24.21q24.22(129,951,769-133-621,197)x2 hmz",GRCh38,8,129951769,621197,-129330571,3


In [19]:
da = da[(da["#positions"] == 2)].copy()
len(da)

14702

#### Remove malformed variants: start > stop
(these all appear to be data entry errors; not all have a clear resolution, so we will drop them from this analysis)

In [20]:
da[(da["var_len"] < 0)].drop_duplicates("variant")

Unnamed: 0,variant,build,chromosome,start,stop,var_len,#positions
4295,"6q22.1q23.2(1117,662,682-134,504,204)x2 hmz",GRCh38,6,1117662682,134504204,-983158477,2
4963,"6q16.1(95,032,6225-95,297,694)x3",GRCh38,6,950326225,95297694,-855028530,2
6081,10q22.3(793155275_79370971)x1,GRCh37,10,793155275,79370971,-713784303,2
6192,22q13.31q13.3(345611578_51178150)x1,GRCh37,22,345611578,51178150,-294433427,2
8304,9p24.1(9014741_5165768)x1,GRCh37,9,9014741,5165768,-3848972,2
10562,"9q22.2(914,974,185-92,471,302)x1",GRCh37,9,914974185,92471302,-822502882,2
13673,"15q26.3(98,111,496-97,413,981)x3",GRCh37,15,98111496,97413981,-697514,2
14905,"2q34(212,681,263-21,889,508)x1",GRCh37,2,212681263,21889508,-190791754,2
14938,"8q11.1q11.21(47,042,437-48,401,60)x3",GRCh37,8,47042437,4840160,-42202276,2
15141,"5q35.3(180,380,290-100,448,046)x3",GRCh37,5,180380290,100448046,-79932243,2


In [21]:
da = da[da["var_len"] >= 0].copy()
len(da)

14683

## Compute copy number/ranges for each variant

In [22]:
def _copy_number_bounds(copy_number: str) -> tuple:
    """Return the ``(min, max)`` copies encoded by a copy-number token.

    A token is either a single count (``"3"``) or a range whose two ends are
    joined by ``~`` or ``-`` (``"3~4"``, ``"0-2"``). The token is split on the
    separator instead of reading its first/last character, so multi-digit
    counts such as ``"10"`` or ``"8~12"`` are parsed as whole numbers.

    :param copy_number: Copy-number token extracted from the variant string
    :return: Tuple of (lowest, highest) copy number; equal for a single count
    :raises ValueError: If the token is empty or has a non-numeric part
    """
    bounds = [int(part) for part in re.split(r"[~\-]", copy_number)]
    return bounds[0], bounds[-1]


# The copy-number token is whatever digits/range characters follow ")x".
da["copy_number"] = da["variant"].apply(
    lambda v: re.findall(r"\) ?x?[ ]?([\d~\-]*)", v)[0]
)
da["copy_number_min"] = da["copy_number"].apply(lambda x: _copy_number_bounds(x)[0])
da["copy_number_max"] = da["copy_number"].apply(lambda x: _copy_number_bounds(x)[1])

### Check copy number values

In [23]:
sorted(set(da["copy_number"]))

['0',
 '0-2',
 '0~1',
 '1',
 '1-2',
 '1~2',
 '1~3',
 '2',
 '2-3',
 '2~3',
 '3',
 '3~4',
 '4',
 '4~5',
 '5~6']

In [24]:
sorted(set(da["copy_number_min"]))

[0, 1, 2, 3, 4, 5]

In [25]:
sorted(set(da["copy_number_max"]))

[0, 1, 2, 3, 4, 5, 6]

### Regions of Homozygosity ("hmz") variants not to be considered in this analysis
#### How many variants in our data with CN=2 are autosomal and not regions of homozygosity?

In [26]:
da["region_of_homozygosity"] = da["variant"].str.contains("hmz")

In [27]:
da["region_of_homozygosity"].value_counts()

region_of_homozygosity
False    9017
True     5666
Name: count, dtype: int64

In [28]:
da = da[~da["region_of_homozygosity"]].copy()
len(da)

9017

In [29]:
da[(da["copy_number"] == "2") & (~da["chromosome"].isin(["X", "Y"]))]

Unnamed: 0,variant,build,chromosome,start,stop,var_len,#positions,copy_number,copy_number_min,copy_number_max,region_of_homozygosity
151,2p13.2p12(73227186_79042754)x2,GRCh37,2,73227186,79042754,5815569,2,2,2,2,False
4144,"7p22.2p12.3(3,264,728-46,071,829)x2",GRCh38,7,3264728,46071829,42807102,2,2,2,2,False
4399,"3p25.3p24.1(10,514,008-29,532,824)x2",GRCh38,3,10514008,29532824,19018817,2,2,2,2,False
5317,"8q12.3q21.13(64,242,384-83,258,323)x2",GRCh38,8,64242384,83258323,19015940,2,2,2,2,False
5338,"20q11.21q13.2(29,991,219-50,728,172)x2",GRCh38,20,29991219,50728172,20736954,2,2,2,2,False
5340,"10p15.3p15.1(307,807-5,146,667)x2",GRCh38,10,307807,5146667,4838861,2,2,2,2,False
7198,2q11.2q12.2(98703659_106160498)x2,GRCh37,2,98703659,106160498,7456840,2,2,2,2,False
7596,12q23.3q24.31(108904090_124118489)x2,GRCh37,12,108904090,124118489,15214400,2,2,2,2,False
15220,"11q24.1(122,984,093-123,106,251)x2",GRCh37,11,122984093,123106251,122159,2,2,2,2,False
15416,"7p12.3p11.2(46,591,048-57,560,040)x2",GRCh38,7,46591048,57560040,10968993,2,2,2,2,False


#### After inspection of the original reports, it appears that these are all actually regions of homozygosity; minor typos and defects in the regex matching strategy caused them not to be recognized as such
* "hmm" instead of "hmz"
* "," between "x2" and rest of variant, but "," was used as a tokenizing character between variant strings

In [30]:
da = da[(da["copy_number"] != "2") | (da["chromosome"].isin(["X", "Y"]))].copy()
len(da)

9000

### Remove any variants in build GRCh36

In [31]:
da = da[da["build"].isin(["GRCh37", "GRCh38"])]
len(da)

8989

In [32]:
# Renumber the rows 0..n-1 after filtering and discard the old index labels.
da = da.reset_index(drop=True)

# Normalize NCH CNVs

In [33]:
### this cell runs the normalizer on every cleaned CNV and saves the raw output
### runs in ~40s
### skip re-running by commenting out this cell


def _build_query_args(cnv_row) -> dict:
    """Assemble the ParsedToCnVarQuery keyword arguments for one CNV row.

    Key insertion order is deliberate: it determines the order of the
    ``query_args.*`` fields stored with each response.

    :param cnv_row: Row of ``da`` holding build, chromosome, start/stop and
        copy-number bounds
    :return: Keyword arguments for ``ParsedToCnVarQuery``
    """
    query = {
        "assembly": cnv_row["build"],
        "chromosome": f"chr{cnv_row['chromosome']}",
        "start0": int(cnv_row["start"]),
        "start_pos_type": "number",
        "end0": int(cnv_row["stop"]),
        "end_pos_type": "number",
        "copies0": int(cnv_row["copy_number_min"]),
    }
    if cnv_row["copy_number_min"] != cnv_row["copy_number_max"]:
        ## copy number is a range e.g. 3~4
        query["copies1"] = int(cnv_row["copy_number_max"])
        query["copies_type"] = "definite_range"
    else:
        ## copy number is unambiguous, not a range
        query["copies_type"] = "number"
    query["do_liftover"] = True
    query["untranslatable_returns_text"] = False
    return query


norm_responses = []
for row_label, cnv in tqdm(da.iterrows(), total=len(da)):
    args = _build_query_args(cnv)
    normalized = parsed_to_cn_var(ParsedToCnVarQuery(**args))
    resp = json.loads(normalized.model_dump_json(exclude_none=True))

    ## original row first, then the normalizer response, then the query echo
    rec = {**dict(cnv), **resp}
    rec.update({f"query_args.{arg}": val for arg, val in args.items()})
    norm_responses.append(rec)


## save raw normalizer output
with open("cnv_data/NCH-normalizer-results.json", "w") as f:
    json.dump(norm_responses, f)

  0%|          | 0/8989 [00:00<?, ?it/s]

In [34]:
with open("cnv_data/NCH-normalizer-results.json", "r") as f:
    norm_responses = json.load(f)

In [35]:
### read normalizer responses into a dataframe for analysis
dr = pd.json_normalize(norm_responses)
dr.head()

Unnamed: 0,variant,build,chromosome,start,stop,var_len,#positions,copy_number,copy_number_min,copy_number_max,region_of_homozygosity,warnings,query_args.assembly,query_args.chromosome,query_args.start0,query_args.start_pos_type,query_args.end0,query_args.end_pos_type,query_args.copies0,query_args.copies_type,query_args.do_liftover,query_args.untranslatable_returns_text,service_meta_.name,service_meta_.version,service_meta_.response_datetime,service_meta_.url,copy_number_count.id,copy_number_count.type,copy_number_count.digest,copy_number_count.location.id,copy_number_count.location.type,copy_number_count.location.digest,copy_number_count.location.sequenceReference.type,copy_number_count.location.sequenceReference.refgetAccession,copy_number_count.location.start,copy_number_count.location.end,copy_number_count.copies,query_args.copies1
0,7q35(146581450_146661741)x3,GRCh37,7,146581450,146661741,80292,2,3,3,3,False,[],GRCh37,chr7,146581450,number,146661741,number,3,number,True,False,variation-normalizer,0.15.0,2025-07-28T17:53:11.575012Z,https://github.com/cancervariants/variation-no...,ga4gh:CN.MJuX6gNGt0BJ1iHpVj_UVxBBya0Sp8n_,CopyNumberCount,MJuX6gNGt0BJ1iHpVj_UVxBBya0Sp8n_,ga4gh:SL.qmNJl0mIgQ5mtMf-9CLxIelFedebRN4-,SequenceLocation,qmNJl0mIgQ5mtMf-9CLxIelFedebRN4-,SequenceReference,SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul,146884357.0,146964649.0,3,
1,22q11.21q11.23(21798705_23751189)x3,GRCh37,22,21798705,23751189,1952485,2,3,3,3,False,[],GRCh37,chr22,21798705,number,23751189,number,3,number,True,False,variation-normalizer,0.15.0,2025-07-28T17:53:11.576953Z,https://github.com/cancervariants/variation-no...,ga4gh:CN.FluTAMBMb2-R0oBoWMz8R40Qn2Dj6KXP,CopyNumberCount,FluTAMBMb2-R0oBoWMz8R40Qn2Dj6KXP,ga4gh:SL.JiiE178XNyrPA_QIdD0_O8fllMElI4Fj,SequenceLocation,JiiE178XNyrPA_QIdD0_O8fllMElI4Fj,SequenceReference,SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ,21444415.0,23409002.0,3,
2,18p11.32(146484_408044)x1,GRCh37,18,146484,408044,261561,2,1,1,1,False,[],GRCh37,chr18,146484,number,408044,number,1,number,True,False,variation-normalizer,0.15.0,2025-07-28T17:53:11.579229Z,https://github.com/cancervariants/variation-no...,ga4gh:CN.B85IimabYh-WJ-r_tN6nmPkSCl9_rgWc,CopyNumberCount,B85IimabYh-WJ-r_tN6nmPkSCl9_rgWc,ga4gh:SL.Cu3iBhfqtXsd_cVsHtCdq325C6oAd0NH,SequenceLocation,Cu3iBhfqtXsd_cVsHtCdq325C6oAd0NH,SequenceReference,SQ.vWwFhJ5lQDMhh-czg06YtlWqu0lvFAZV,146483.0,408044.0,1,
3,3q25.1(150353937_150944378)x1,GRCh37,3,150353937,150944378,590442,2,1,1,1,False,[],GRCh37,chr3,150353937,number,150944378,number,1,number,True,False,variation-normalizer,0.15.0,2025-07-28T17:53:11.580966Z,https://github.com/cancervariants/variation-no...,ga4gh:CN.zS1jV37WemE7wBrf4Bkdt4VSxyNHUp_N,CopyNumberCount,zS1jV37WemE7wBrf4Bkdt4VSxyNHUp_N,ga4gh:SL.jQiTZDtAM_PWPlTA3l-BnS9UKIuLPiOs,SequenceLocation,jQiTZDtAM_PWPlTA3l-BnS9UKIuLPiOs,SequenceReference,SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX,150636149.0,151226590.0,1,
4,16p12.1(27864452_27995317)x3,GRCh37,16,27864452,27995317,130866,2,3,3,3,False,[],GRCh37,chr16,27864452,number,27995317,number,3,number,True,False,variation-normalizer,0.15.0,2025-07-28T17:53:11.582848Z,https://github.com/cancervariants/variation-no...,ga4gh:CN.Wqsbt4qGYELdnzSkqx0AOr_tpa8zJ8N1,CopyNumberCount,Wqsbt4qGYELdnzSkqx0AOr_tpa8zJ8N1,ga4gh:SL.MNRSCufvIZcW7fm8nOkL2aujzX1MmOFH,SequenceLocation,MNRSCufvIZcW7fm8nOkL2aujzX1MmOFH,SequenceReference,SQ.yC_0RBj3fgBlvgyAuycbzdubtLxq-rE0,27853130.0,27983996.0,3,


In [36]:
## warnings were stored in a list; convert to a string for readability/ease of processing
dr["warning_string"] = dr["warnings"].apply(";".join)

In [37]:
### remove numbers/identifiers of specific chromosome/positions/variants from warning strings to consolidate into a small number of error types


def reduce_warning(warning: str) -> str:
    """Collapse a normalizer warning into a broad error category.

    Sequence identifiers, chromosome names and positions are replaced with
    placeholders so that warnings differing only in those details share one
    label.

    :param warning: Warning text for one variant; empty when there were none
    :return: ``"Success"`` for an empty warning, ``"Pydantic Validation Error"``
        for a Pydantic validation message, otherwise the reduced warning text
    """
    warning = re.sub(r"ga4gh\:SQ\.[A-Za-z0-9\-\_]+", "ga4gh:SQ.*", warning)
    warning = re.sub(r"chr[XY\d]+", "chr#", warning)
    warning = re.sub(r"\(\d+\)", "#", warning)
    warning = re.sub(r"pos \d+", "pos #", warning)
    # Pydantic writes "1 validation error for ..." (singular) when there is a
    # single error, so the trailing "s" must be optional.
    if re.match(r"\d+ validation errors? for", warning):
        return "Pydantic Validation Error"

    return warning or "Success"


dr["warning_string_reduce"] = dr["warning_string"].apply(reduce_warning)
dr["warning_string_reduce"].value_counts()

Success                                  8718
Unable to liftover: chr# with pos #       208
Position # is not valid on ga4gh:SQ.*      63
Name: count, dtype: int64

In [38]:
### get the normalized lifted over coordinates
# NOTE: assigning a Series aligns on index labels, not on row position. This
# works because `da` was renumbered 0..n-1 before normalization and `dr` holds
# one record per `da` row in the same order, so label i in `dr` is row i of `da`.
# Rows whose response carries no `copy_number_count.location` fields are left
# as NaN here.
da["start_38"] = dr["copy_number_count.location.start"]
da["stop_38"] = dr["copy_number_count.location.end"]

In [39]:
### keep only variants with both normalized coordinates present, then cast
### those coordinates to integers
da = da.dropna(subset=["start_38", "stop_38"]).astype(
    {"start_38": int, "stop_38": int}
)

In [40]:
len(da)

8718

In [41]:
# Columns written to the cleaned CSV, in output order.
export_columns = [
    "variant",
    "build",
    "chromosome",
    "start",
    "stop",
    "start_38",
    "stop_38",
    "copy_number",
    "copy_number_min",
    "copy_number_max",
]
# Rows with a missing value in any exported column are left out of the file.
cleaned = da.loc[:, export_columns].dropna()
cleaned.to_csv("cnv_data/NCH-microarray-CNVs-cleaned.csv", index=False)