# Analysis for CIViC data

This notebook analyzes CIViC variant data to measure how many accepted and submitted CIViC variants the Variation Normalizer can normalize.

In [1]:
from dotenv import load_dotenv

# Load the shared environment file before the remaining imports run.
# NOTE(review): presumably the gene/variation packages read their settings from
# the environment — confirm.
load_dotenv("../../../.env.shared")

True

In [2]:
import os
import sys
import logging
import re
import csv
from typing import Dict, Set, Tuple, Optional

from civicpy import civic as civicpy
from gene.database import create_db
from gene.query import QueryHandler as GeneQueryHandler
from variation.query import QueryHandler
from variation.schemas.service_schema import ClinVarAssembly

# Put the directory two levels up on the import path (once) so the local helper
# modules imported below can be found
module_path = os.path.abspath("../..")
if module_path not in sys.path:
    sys.path.append(module_path)

from download_cool_seq_tool_files import download_cool_seq_tool_files  # noqa: E402
from utils import load_civicpy_cache, NotSupportedVariantCategory  # noqa: E402

# Route log records to a file; the path is relative, so it is created in the
# current working directory
log_filename = "civic-variation-analysis.log"
log_format = "[%(asctime)s] - %(name)s - %(levelname)s : %(message)s"
logging.basicConfig(filename=log_filename, format=log_format)

  import pkg_resources


In [3]:
# Fetch the files cool-seq-tool needs, using the non-Docker configuration.
# NOTE(review): what exactly is downloaded is decided inside
# download_cool_seq_tool_files — confirm there.
download_cool_seq_tool_files(is_docker_env=False)

In [4]:
# Build the gene normalizer handler on top of its database, then pass that same
# instance to the variation normalizer's QueryHandler. `gene_query_handler` is also
# used directly later on for gene-name lookups.
gene_query_handler = GeneQueryHandler(create_db())
query_handler = QueryHandler(gene_query_handler=gene_query_handler)

***Using Gene Database Endpoint: http://localhost:8000***


In [5]:
# Uncomment the line below to fetch the latest CIViC data, and comment out the next cell
# civicpy.update_cache(from_remote_cache=False)

In [6]:
# Use the latest civicpy cache that has been pushed to the repo (the cell output
# names the cache file that was picked)
load_civicpy_cache()

Using cache-20250717.pkl for civicpy cache


In [7]:
# Pull every CIViC variant whose status is accepted or submitted. Their count is the
# denominator for the "of the total accepted and submitted variants" percentages
# reported further down.
acc_sub_variants = civicpy.get_all_variants(include_status=["accepted", "submitted"])
len_acc_sub_variants = len(acc_sub_variants)
f"Total Number of accepted and submitted variants in CIViC: {len_acc_sub_variants}"

'Total Number of accepted and submitted variants in CIViC: 3845'

We do not attempt to normalize variants categorized as `NotSupportedVariantCategory.TRANSCRIPT_VAR` because no input query is available for them. These are CIViC variants handled as genomic queries ("c." in the variant name) that have neither a genomic HGVS expression nor a complete set of representative coordinates.

Below are variant name fragments in CIViC that we know the Variation Normalizer cannot support.

In [8]:
# Keyword lookup consumed by get_not_supported_categories. Each category maps to a
# set of lowercase fragments; a variant is tagged with the category when ANY fragment
# occurs as a substring of its lowercased name. Because the test is a substring
# match, short entries such as "fs", "dup", "gain" or "exon" match very broadly.
not_supported = {
    NotSupportedVariantCategory.EXPRESSION: {
        "overexpression",
        "expression",
        "underexpression",
        "serum levels",
        "transcription levels",
        "autocrine activation",
        "tnc-l",
        "top2a/90",
        "low ratio of vegf165b/vegftotal",
        "lgr5fl",
    },
    NotSupportedVariantCategory.EPIGENETIC_MODIFICATION: {
        "methylation",
        "promoter hypermethylation",
        "promoter methylation",
        "phosphorylation",
    },
    NotSupportedVariantCategory.SEQUENCE_VARS: {
        "frameshift truncation",
        "frameshift",
        "frame shift",
        "fs",
        "truncating mutation",
        "1100delc",
        "deletion (p.k227_t233del)",
        "y646f, y646n, y646s, y646h, y646c, a682g, a692v",
        "loss-of-modification",
        "t17 deletion",
        "ex19 del l858r",
        "single nucleotide variant",
    },
    NotSupportedVariantCategory.GENE_FUNC: {
        "gain of function",
        "gain-of-function",
        "loss of function",
        "loss-of-function",
        "activating mutation",
        "tkd mutation",
        "inactivation",
        "null",
        "viii",
    },
    NotSupportedVariantCategory.REARRANGEMENTS: {
        "translocation",
        "rearrangement",
        "double ph",
        "alu insertion",
        "exon 20 insertion",
        "internal tandem duplications",
        "tandem repeat",
        "itd",
        "d842_h845deldimh",
        "k558np",
        "exon",
    },
    NotSupportedVariantCategory.COPY_NUMBER: {
        "copy number",
        "repeat",
        "dup",
        "non-amplification",
        "gain",
    },
    NotSupportedVariantCategory.OTHER: {
        "cytoplasmic mislocalization",
        "alternative transcript",
        "rare mutation",
        "splice",
        "splicing",
        "ceacam1-l",
        "ceacam1-s",
        "δ",  # lowercase form of Δ, since names are lowercased before matching
        "delta",
        "beta",
        "ivs2+1g>a",
        "ivs20, a-g, -2",
        "deprecated",
        "point mutations",
        "conserved domain mut",
        "cis double mutants",
        "gbrcam",
        "kras4a",
        "kras4b",
        "e151int",
        "delnvtap",
    },
    NotSupportedVariantCategory.GENOTYPES_AND_HAPLOTYPES: {
        "diplotypes",
        "wild type",
        "wildtype",
        "p61braf(v600e)",
        "loss of heterozygosity",
        "biallelic inactivation",
        "bi-allelic inactivation",
        "homozygosity",
        "loh",
        "single allele deletion",
    },
    NotSupportedVariantCategory.REGION_DEFINED_VAR: {
        "deleterious mutation",
        "domain mutation",
        "polymorphism",
        "non-p-loop mutation",
        "p-loop mutation",
        "3' utr mutation",
        "3-prime utr mutation",
        "alteration",
        "promoter mutation",
        "non-v600",
    },
}

Create functions to be used later

In [9]:
def _total_counts() -> dict:
    """Build a fresh, zeroed tally for protein and genomic variants.

    :return: Dictionary keyed by query type ("protein", then "genomic"); each value
        is its own dictionary with "accepted", "submitted" and "count" set to 0
    """
    return {
        query_type: {"accepted": 0, "submitted": 0, "count": 0}
        for query_type in ("protein", "genomic")
    }


def is_accepted_variant(v: civicpy.Variant) -> bool:
    """Tell whether a variant counts as accepted.

    A variant is treated as accepted as soon as any evidence item on any of its
    molecular profiles has the status "accepted".

    :param v: CIViC variant
    :return: `True` if considered accepted variant. `False` otherwise.
    """
    return any(
        evidence_item.status == "accepted"
        for molecular_profile in v.molecular_profiles
        for evidence_item in molecular_profile.evidence_items
    )


def get_variant_name_and_type(variant: civicpy.Variant) -> Tuple[Optional[str], str]:
    """Derive the normalizer query and query type for a CIViC variant.

    Names containing "c." are handled as genomic queries: the first HGVS expression
    containing "g." is used, falling back to a gnomAD-VCF-style string
    (``chromosome-start-ref-alt``) built from the variant coordinates. All other
    names are returned stripped, as protein queries.

    :param variant: CIViC variant record
    :return: Tuple containing the variant name to use in variation-normalizer
        (``None`` when no genomic query could be built) and whether it is a
        'protein' or 'genomic' variant
    """
    if "c." not in variant.name:
        return variant.name.strip(), "protein"

    # Preferred source: a genomic HGVS expression
    genomic_hgvs = [expr for expr in variant.hgvs_expressions if "g." in expr]
    if genomic_hgvs:
        return genomic_hgvs[0], "genomic"

    # Fallback: gnomAD VCF string, only when every coordinate piece is present
    chrom = variant.coordinates.chromosome
    start = variant.coordinates.start
    ref_bases = variant.coordinates.reference_bases
    alt_bases = variant.coordinates.variant_bases
    if chrom and start and ref_bases and alt_bases:
        return f"{chrom}-{start}-{ref_bases}-{alt_bases}", "genomic"

    return None, "genomic"


def translate_from_genomic(genomic_query: str) -> Dict:
    """Fall back to vrs-python ``translate_from`` for a genomic query.

    The query is translated against the GRCh37 assembly. Any exception raised by the
    translator is captured and reported as text rather than propagated.

    :param genomic_query: Genomic query (hgvs or gnomad VCF)
    :return: Dictionary with "vrs_id" (set on success, else ``None``) and "error"
        (the exception text on failure, else ``None``)
    """
    try:
        translated = query_handler.vrs_python_tlr.translate_from(
            genomic_query, assembly_name="GRCh37"
        )
    except Exception as exc:
        return {"vrs_id": None, "error": str(exc)}

    return {"vrs_id": translated.id, "error": None}


def get_not_supported_categories(
    v_name: str, variant: civicpy.Variant
) -> Set[NotSupportedVariantCategory]:
    """Work out which unsupported category, if any, a CIViC variant falls into.

    Factor variants, fusion variants and a few whole-name matches are decided
    immediately. Every other name is tested against a series of regular expressions
    and the ``not_supported`` keyword sets; when more than one category matches, the
    result collapses to ``NotSupportedVariantCategory.OTHER``.

    :param v_name: Variant query to provide to the variation-normalizer
    :param variant: CIViC variant record
    :return: Empty set if the variant looks supported, otherwise a set holding
        exactly one NotSupportedVariantCategory
    """
    name = v_name.lower()
    category = NotSupportedVariantCategory

    # --- Decisions made from the subtype or the whole name: one category each ---
    subtype = variant.subtype
    if subtype == "factor_variant":
        if variant.factor.name == "C19MC":
            return {category.REGION_DEFINED_VAR}
        return {category.GENOME_FEATURES}
    if subtype == "fusion_variant":
        return {category.FUSION}
    if name in {"loss", "deletion"}:
        return {category.GENE_FUNC}

    is_generic_mutation = name in {"mutation", "mutations", "snp"}
    is_gene_mutation = (
        hasattr(variant, "gene") and name == f"{variant.gene.name.lower()} mutation"
    )
    if is_generic_mutation or is_gene_mutation:
        return {category.REGION_DEFINED_VAR}

    def matches_any(*patterns: str) -> bool:
        """Return True when any pattern matches at the start of the lowercased name"""
        return any(re.match(pattern, name) for pattern in patterns)

    # --- Pattern-based checks: several may fire for the same name ---
    found = set()

    # "<gene> deletion and mutation", where the first word resolves to a gene
    if name.endswith("deletion and mutation"):
        words = v_name.split()
        if len(words) == 4 and gene_query_handler.normalize(words[0]).match_type > 0:
            found.add(category.REGION_DEFINED_VAR)

    if matches_any(r"intron\s\d+\smutation"):  # ex: Intron 6 Mutation
        found.add(category.REGION_DEFINED_VAR)

    if "exon" in name or matches_any(
        r"t\(.*\)\(.*\)",  # ex: t(1;3)(p36.3;p25)
        r".*ins$",  # ex: P780INS, L78_Q79ins
        r"\w+_?\w+>\w+",  # ex: 56_61QKQKVG>R, E746_T751>I, N771>GY
        r"\d+kb\sdeletion",  # ex: 10kb Deletion
        r"partial\sdeletion\sof\s\d+(.\d+)?\skb",  # ex: Partial deletion of 0.7 Kb
        r"\d+(p|q)\d+(.\d+)?-\d+(.\d+)?\s\d+mb del",  # ex: 3p26.3-25.3 11Mb del
    ):
        found.add(category.REARRANGEMENTS)

    if matches_any(
        r"^rs\d+",  # ex: RS11623866
        r"class\s\d+\smutation",  # ex: Class 3 Mutation
    ):
        found.add(category.OTHER)

    if matches_any(r"cd\d+v?\d+"):  # cd44, cd44v6
        found.add(category.EXPRESSION)

    if matches_any(
        r"\w+\d+$",  # ex: V600
        r"\w+\d+\w+\/\w+$",  # ex: S893A/T
        r"[a-z]+\d+[a-z]+\sand\s[a-z]+\d+[a-z|*]+",  # ex: E2014K and E2419K
        r"[a-z]+\d+\s&\s[a-z]+\d+",  # ex: D835 & I836
        r"[a-z]+\d+[a-z]+\sor\s[a-z]+\d+[a-z]+",  # ex: H1047L or H1047R
        r"\w+\d+\smutations",  # ex: E1813 mutations
        r"\d+\s\((c|a|g|t)+-(c|a|g|t)+\)",  # ex: 235 (CAG-TAG)
        r"del\s\d+-\d+",  # ex: DEL 485-490
    ):
        found.add(category.SEQUENCE_VARS)

    # ex: GRCh37/hg19 11q14.3(chr11:88960991-88961138)x160
    if matches_any(r"grch3(7|8)\/hg\d+\s\w+.?\d*\(chr\w+:\d+-\d+\)x\d+"):
        found.add(category.COPY_NUMBER)

    if matches_any(
        r"\w+[^fs]\*\d+$",  # ex: UGT1A1*28
        r"^\*02:(?:0[1-3]|06)p$",
    ):
        found.add(category.GENOTYPES_AND_HAPLOTYPES)

    # Keyword sets: a category applies when any of its fragments is in the name
    found.update(
        keyword_category
        for keyword_category, fragments in not_supported.items()
        if any(fragment in name for fragment in fragments)
    )

    # Those with multiple categories will be classified as other
    return {category.OTHER} if len(found) > 1 else found

Create files

In [10]:
def _open_tsv_writer(path: str, header: list) -> Tuple:
    """Open a TSV output file and write its header row.

    The file is opened with ``newline=""``, as the csv module requires for files
    handed to ``csv.writer``; without it, platforms that translate line endings can
    end up with an extra ``\\r`` on every row.

    :param path: Path of the TSV file to create (an existing file is overwritten)
    :param header: Column names written as the first row
    :return: Tuple of the open file handle and the tab-delimited csv writer bound to
        it. The caller is responsible for closing the file handle.
    """
    file_handle = open(path, "w", newline="")
    tsv_writer = csv.writer(file_handle, delimiter="\t")
    tsv_writer.writerow(header)
    return file_handle, tsv_writer


# The handles below stay open while the variants are processed and are closed in
# the "Close all files" cell.

# This file contains protein queries (gene + variant_name) we SHOULD be able to
# normalize
protein_variants_wf, protein_variants_wr = _open_tsv_writer(
    "should_be_able_to_normalize_protein_variant_queries.tsv",
    [
        "variant_id",
        "gene_name",
        "variant_name",
        "variant_accepted",
        "civic_variant_types",
    ],
)

# This file contains genomic queries (genomic HGVS expressions or gnomad VCF) we SHOULD
# be able to normalize
genomic_variants_wf, genomic_variants_wr = _open_tsv_writer(
    "should_be_able_to_normalize_genomic_variant_queries.tsv",
    ["variant_id", "hgvs_g", "variant_accepted", "civic_variant_types"],
)

# This file contains CIViC Variants we do not currently support in Variation Normalizer.
# In these cases, we do not even attempt to try to normalize
not_supported_wf, not_supported_wr = _open_tsv_writer(
    "not_supported_variants.tsv",
    [
        "variant_id",
        "gene_name",
        "variant_name",
        "civic_variant_types",
        "category",
        "variant_accepted",
    ],
)

# This file contains CIViC Variant queries that were run through Variation Normalizer,
# but failed to normalize.
unable_to_normalize_wf, unable_to_normalize_wr = _open_tsv_writer(
    "unable_to_normalize_queries.tsv",
    [
        "variant_id",
        "query",
        "query_type",
        "variant_accepted",
        "civic_variant_types",
        "exception_raised",
        "message",
        "warnings",
    ],
)

# This file contains CIViC Variant queries that were run through variation normalizer,
# and successfully normalized
able_to_normalize_wf, able_to_normalize_wr = _open_tsv_writer(
    "able_to_normalize_queries.tsv",
    [
        "variant_id",
        "query",
        "query_type",
        "variant_accepted",
        "civic_variant_types",
        "vrs_id",
        "succeeded_endpoint",
    ],
)

92

Create variables to store information, such as counts, that will be used later

In [11]:
# Tally of unsupported variants, keyed by category member name and starting at zero
variant_category_counts = dict.fromkeys(NotSupportedVariantCategory.__members__, 0)

# Running totals, one independent tally per outcome
should_be_able_to_normalize_total = _total_counts()
can_normalize_total = _total_counts()
unable_to_normalize_total = _total_counts()
exception_total = _total_counts()
genomic_translate_from_success = 0

# Maps each query string to the variant IDs that produced it, so that duplicate
# queries can be found afterwards
queries_found = {}

In [12]:
for variant in acc_sub_variants:
    civic_variant_types = (
        ";".join([v.name for v in variant.variant_types]) or "Not provided"
    )

    # if a variant has at least one EID in an accepted status, it counts towards
    # “accepted”, because that indicates review and approval of the variant as part
    # of the evidence review
    is_accepted = is_accepted_variant(variant)
    accepted_key = (
        "accepted" if is_accepted else "submitted"
    )  # used in total counts dicts

    # Get variant name and type
    v_name, v_q_type = get_variant_name_and_type(variant)

    try:
        gene_name = variant.gene.name.strip()
    except AttributeError:
        gene_name = None

    if not v_name:
        variant_category_name = NotSupportedVariantCategory.TRANSCRIPT_VAR
        variant_category_counts[variant_category_name.name] += 1
        not_supported_wr.writerow(
            [
                variant.id,
                gene_name,
                None,
                civic_variant_types,
                variant_category_name,
                is_accepted,
            ]
        )
        continue

    # Determine if variant is not supported
    not_supported_categories = get_not_supported_categories(v_name, variant)

    if len(not_supported_categories) == 1:
        variant_category_name = not_supported_categories.pop()
        variant_category_counts[variant_category_name.name] += 1
        not_supported_wr.writerow(
            [
                variant.id,
                gene_name,
                variant.name,
                civic_variant_types,
                variant_category_name,
                is_accepted,
            ]
        )
        continue

    # We should support this, so we need to query the variation normalizer
    if v_q_type == "protein":
        q = f"{gene_name} {v_name}"
        protein_variants_wr.writerow(
            [variant.id, gene_name, v_name, is_accepted, civic_variant_types]
        )
    else:
        q = v_name
        genomic_variants_wr.writerow([variant.id, q, is_accepted, civic_variant_types])

    should_be_able_to_normalize_total[v_q_type]["count"] += 1
    should_be_able_to_normalize_total[v_q_type][accepted_key] += 1

    if q in queries_found:
        queries_found[q].append(variant.id)
    else:
        queries_found[q] = [variant.id]

    try:
        variation_norm_resp = await query_handler.normalize_handler.normalize(
            q, input_assembly=ClinVarAssembly.GRCH37
        )
        if not variation_norm_resp.variation:
            if v_q_type == "protein" and len(v_name.split()) == 1:
                # Determine if fusion or gene name (which actually aren't supported)
                if "-" in v_name:
                    # could be {gene}-{gene}
                    genes = v_name.split("-")
                    variant_category_name = NotSupportedVariantCategory.FUSION
                else:
                    # Just a gene name
                    genes = [v_name]
                    variant_category_name = NotSupportedVariantCategory.OTHER

                is_genes = True
                for g in genes:
                    if gene_query_handler.normalize(g).match_type == 0:
                        # not a gene
                        is_genes = False
                        break

                if is_genes:
                    variant_category_counts[variant_category_name.name] += 1
                    not_supported_wr.writerow(
                        [
                            variant.id,
                            gene_name,
                            variant.name,
                            civic_variant_types,
                            variant_category_name,
                            is_accepted,
                        ]
                    )
                    continue

            no_vrs_id = True
            warnings = variation_norm_resp.warnings or []

            # Try running genomic queries through vrs-python translate from
            if v_q_type == "genomic":
                genomic_resp = translate_from_genomic(q)

                if genomic_resp["vrs_id"]:
                    genomic_translate_from_success += 1
                    no_vrs_id = False
                    vrs_id = genomic_resp["vrs_id"]
                else:
                    warnings.append(genomic_resp["error"])

            if no_vrs_id:
                unable_to_normalize_wr.writerow(
                    [
                        variant.id,
                        q,
                        v_q_type,
                        is_accepted,
                        civic_variant_types,
                        False,
                        "unable to normalize",
                        sorted(warnings),
                    ]
                )
                unable_to_normalize_total[v_q_type]["count"] += 1
                unable_to_normalize_total[v_q_type][accepted_key] += 1
            else:
                can_normalize_total[v_q_type]["count"] += 1
                can_normalize_total[v_q_type][accepted_key] += 1
                able_to_normalize_wr.writerow(
                    [
                        variant.id,
                        q,
                        v_q_type,
                        is_accepted,
                        civic_variant_types,
                        vrs_id,
                        "translate_from",
                    ]
                )
        else:
            can_normalize_total[v_q_type]["count"] += 1
            can_normalize_total[v_q_type][accepted_key] += 1
            vrs_id = variation_norm_resp.variation.id
            able_to_normalize_wr.writerow(
                [
                    variant.id,
                    q,
                    v_q_type,
                    is_accepted,
                    civic_variant_types,
                    vrs_id,
                    "normalize",
                ]
            )
    except Exception as e:
        warnings = [str(e)]

        # Try running genomic queries through vrs-python translate from
        if v_q_type == "genomic":
            genomic_resp = translate_from_genomic(q)

            if genomic_resp["vrs_id"]:
                vrs_id = genomic_resp["vrs_id"]
            else:
                vrs_id = None
                warnings.append(genomic_resp["error"])

            if vrs_id:
                genomic_translate_from_success += 1
                can_normalize_total[v_q_type]["count"] += 1
                can_normalize_total[v_q_type][accepted_key] += 1
                able_to_normalize_wr.writerow(
                    [
                        variant.id,
                        q,
                        v_q_type,
                        is_accepted,
                        civic_variant_types,
                        vrs_id,
                        "translate_from",
                    ]
                )
                continue

        unable_to_normalize_wr.writerow(
            [
                variant.id,
                q,
                v_q_type,
                is_accepted,
                civic_variant_types,
                True,
                sorted(warnings),
                None,
            ]
        )
        exception_total[v_q_type]["count"] += 1
        exception_total[v_q_type][accepted_key] += 1

In [13]:
# Close every output file, in the order they were opened
for output_file in (
    protein_variants_wf,
    genomic_variants_wf,
    not_supported_wf,
    unable_to_normalize_wf,
    able_to_normalize_wf,
):
    output_file.close()

## Variants we do not support

In [14]:
# Total number of unsupported variants: the per-category tallies added together
do_not_support_total_sum = sum(variant_category_counts.values())

Below is the total number of variants in each category that we do not support

In [15]:
# Same tallies, ordered from the most to the least frequent category (ties keep
# their original order)
sorted_variant_cat_counts = {
    category_name: variant_category_counts[category_name]
    for category_name in sorted(
        variant_category_counts, key=variant_category_counts.get, reverse=True
    )
}
sorted_variant_cat_counts

{'TRANSCRIPT_VAR': 362,
 'FUSION': 313,
 'EXPRESSION': 293,
 'REGION_DEFINED_VAR': 252,
 'SEQUENCE_VARS': 130,
 'REARRANGEMENTS': 121,
 'GENE_FUNC': 111,
 'OTHER': 79,
 'COPY_NUMBER': 32,
 'GENOTYPES_AND_HAPLOTYPES': 22,
 'EPIGENETIC_MODIFICATION': 14,
 'GENOME_FEATURES': 10}

Below is the total percentage of variants for each category that we do not support

In [16]:
# Each category's share of all accepted and submitted variants, as a percentage
{
    category_name: f"{category_count / len_acc_sub_variants * 100:.2f}%"
    for category_name, category_count in sorted_variant_cat_counts.items()
}

{'TRANSCRIPT_VAR': '9.41%',
 'FUSION': '8.14%',
 'EXPRESSION': '7.62%',
 'REGION_DEFINED_VAR': '6.55%',
 'SEQUENCE_VARS': '3.38%',
 'REARRANGEMENTS': '3.15%',
 'GENE_FUNC': '2.89%',
 'OTHER': '2.05%',
 'COPY_NUMBER': '0.83%',
 'GENOTYPES_AND_HAPLOTYPES': '0.57%',
 'EPIGENETIC_MODIFICATION': '0.36%',
 'GENOME_FEATURES': '0.26%'}

In [17]:
# Share of all accepted + submitted variants that fall into an unsupported category
f"The Variation Normalizer does not support {do_not_support_total_sum / len_acc_sub_variants * 100:.2f}% of the total accepted and submitted variants"

'The Variation Normalizer does not support 45.23% of the total accepted and submitted variants'

In [18]:
# Absolute number of unsupported variants, all categories combined
f"Total number of variants we do not support in the Variation Normalizer: {do_not_support_total_sum}"

'Total number of variants we do not support in the Variation Normalizer: 1739'

## Variants we should be able to normalize

In [19]:
# Accepted / submitted / overall tallies, per query type, of the variants classified
# as ones we should be able to normalize
should_be_able_to_normalize_total

{'protein': {'accepted': 747, 'submitted': 940, 'count': 1687},
 'genomic': {'accepted': 246, 'submitted': 177, 'count': 423}}

In [20]:
# Protein and genomic variants combined
should_be_able_to_normalize_total_sum = sum(
    should_be_able_to_normalize_total[query_type]["count"]
    for query_type in ("protein", "genomic")
)
should_be_able_to_normalize_total_sum

2110

In [21]:
# Accepted variants among those we should be able to normalize, both query types
should_be_able_to_normalize_total_accepted = sum(
    should_be_able_to_normalize_total[query_type]["accepted"]
    for query_type in ("protein", "genomic")
)
f"{should_be_able_to_normalize_total_accepted / should_be_able_to_normalize_total_sum * 100:.2f}% of these are accepted variants"

'47.06% of these are accepted variants'

In [22]:
# Submitted variants among those we should be able to normalize, both query types
should_be_able_to_normalize_total_not_accepted = sum(
    should_be_able_to_normalize_total[query_type]["submitted"]
    for query_type in ("protein", "genomic")
)
f"{should_be_able_to_normalize_total_not_accepted / should_be_able_to_normalize_total_sum * 100:.2f}% of these are submitted variants"

'52.94% of these are submitted variants'

In [23]:
# Share of all accepted + submitted variants that we expect to be able to normalize
f"The Variation Normalizer SHOULD be able to normalize {should_be_able_to_normalize_total_sum / len_acc_sub_variants * 100:.2f}% of the total accepted and submitted variants"

'The Variation Normalizer SHOULD be able to normalize 54.88% of the total accepted and submitted variants'

## Variants we were not able to normalize

Either due to a bug or an unsupported query type in Variation Normalizer

In [24]:
# Tallies of queries that ran without raising but yielded no variation (and, for
# genomic queries, no vrs-python translation either)
unable_to_normalize_total

{'protein': {'accepted': 15, 'submitted': 76, 'count': 91},
 'genomic': {'accepted': 0, 'submitted': 0, 'count': 0}}

In [25]:
# Protein and genomic failures combined
unable_to_normalize_total_sum = sum(
    unable_to_normalize_total[query_type]["count"]
    for query_type in ("protein", "genomic")
)
unable_to_normalize_total_sum

91

In [26]:
# Accepted variants among the queries that failed to normalize
unable_to_normalize_total_accepted = (
    unable_to_normalize_total["protein"]["accepted"]
    + unable_to_normalize_total["genomic"]["accepted"]
)
# When every query normalized, the denominator is 0; report 0.00% instead of letting
# the division raise ZeroDivisionError and stop the notebook
unable_to_normalize_pct_accepted = (
    unable_to_normalize_total_accepted / unable_to_normalize_total_sum * 100
    if unable_to_normalize_total_sum
    else 0.0
)
f"{unable_to_normalize_pct_accepted:.2f}% of these are accepted variants"

'16.48% of these are accepted variants'

In [27]:
# Submitted variants among the queries that failed to normalize
unable_to_normalize_total_not_accepted = (
    unable_to_normalize_total["protein"]["submitted"]
    + unable_to_normalize_total["genomic"]["submitted"]
)
# When every query normalized, the denominator is 0; report 0.00% instead of letting
# the division raise ZeroDivisionError and stop the notebook
unable_to_normalize_pct_not_accepted = (
    unable_to_normalize_total_not_accepted / unable_to_normalize_total_sum * 100
    if unable_to_normalize_total_sum
    else 0.0
)
f"{unable_to_normalize_pct_not_accepted:.2f}% of these are submitted variants"

'83.52% of these are submitted variants'

In [28]:
# Share of all accepted + submitted variants that failed to normalize (queries that
# raised an exception are tallied separately in exception_total)
f"The Variation Normalizer was unable to normalize {unable_to_normalize_total_sum / len_acc_sub_variants * 100:.2f}% of the total accepted and submitted variants"

'The Variation Normalizer was unable to normalize 2.37% of the total accepted and submitted variants'

## Breakdown of the variants we weren't able to normalize

In this section, we break down the reasons why we weren't able to normalize variants.

In [29]:
# Break the failed queries down by the reason given in their "warnings" column.
# The file also holds the queries that raised an exception; those are reported in
# their own section and have an empty "warnings" column, so they are skipped here
# rather than being counted as "other".
unable_to_tokenize = 0
unable_to_find_valid = 0
other = 0
# newline="" is what the csv module expects for files passed to a reader
with open("unable_to_normalize_queries.tsv", "r", newline="") as f:
    # DictReader consumes the header row and exposes the columns by name
    for row in csv.DictReader(f, delimiter="\t"):
        if row["exception_raised"] == "True":
            continue

        warnings_text = row["warnings"]
        if "Unable to find valid result" in warnings_text:
            unable_to_find_valid += 1
        elif "Unable to tokenize" in warnings_text:
            unable_to_tokenize += 1
        else:
            other += 1

### Due to not passing validation checks

The Variation Normalizer performs validation checks on the input query (such as reference sequence). If these validation checks fail, then the input query will fail to normalize.

In [30]:
# Failed queries whose warnings contain "Unable to find valid result"
f"The Variation Normalizer found {unable_to_find_valid} invalid variants (This is {unable_to_find_valid / len_acc_sub_variants * 100:.2f}% of the total accepted and submitted variants)."

'The Variation Normalizer found 0 invalid variants (This is 0.00% of the total accepted and submitted variants).'

### Due to tokenization 

The Variation Normalizer will tokenize the input query to determine the kind of token. It is limited in the kinds of tokens it accepts, so these tokens are not yet supported in the Variation Normalizer. 

In [31]:
# Failed queries whose warnings contain "Unable to tokenize"
f"The Variation Normalizer was unable to tokenize {unable_to_tokenize} variants ({unable_to_tokenize / len_acc_sub_variants * 100:.2f}% of the total accepted and submitted variants)."

'The Variation Normalizer was unable to tokenize 35 variants (0.91% of the total accepted and submitted variants).'

In [32]:
# Failed queries whose warnings matched neither of the two texts above
f"The Variation Normalizer was unable to normalize {other} variants due to other issues (This is {other / len_acc_sub_variants * 100:.2f}% of the total accepted and submitted variants)."

'The Variation Normalizer was unable to normalize 56 variants due to other issues (This is 1.46% of the total accepted and submitted variants).'

## Variant queries that raised an exception during normalization

In [33]:
# Tallies of queries whose normalization raised an exception (genomic queries only
# land here when vrs-python translate_from could not produce an ID either)
exception_total

{'protein': {'accepted': 0, 'submitted': 0, 'count': 0},
 'genomic': {'accepted': 0, 'submitted': 0, 'count': 0}}

In [34]:
# Protein and genomic exceptions combined
exception_total_sum = sum(
    exception_total[query_type]["count"] for query_type in ("protein", "genomic")
)
exception_total_sum

0

In [35]:
# Share of all accepted + submitted variants whose query raised an exception
f"The Variation Normalizer raised an exception for {exception_total_sum / len_acc_sub_variants * 100:.2f}% of the total accepted and submitted variants"

'The Variation Normalizer raised an exception for 0.00% of the total accepted and submitted variants'

## Variants we were able to normalize

In [36]:
# Tallies of queries that produced a VRS ID, either from the normalizer or from the
# vrs-python translate_from fallback
can_normalize_total

{'protein': {'accepted': 730, 'submitted': 862, 'count': 1592},
 'genomic': {'accepted': 246, 'submitted': 177, 'count': 423}}

In [37]:
# Protein and genomic successes combined
can_normalize_total_sum = sum(
    can_normalize_total[query_type]["count"] for query_type in ("protein", "genomic")
)
can_normalize_total_sum

2015

In [38]:
# Accepted variants among the successfully normalized queries, both query types
can_normalize_total_accepted = sum(
    can_normalize_total[query_type]["accepted"]
    for query_type in ("protein", "genomic")
)
f"{can_normalize_total_accepted / can_normalize_total_sum * 100:.2f}% of these are accepted variants"

'48.44% of these are accepted variants'

In [39]:
# Submitted variants among the successfully normalized queries, both query types
can_normalize_total_not_accepted = sum(
    can_normalize_total[query_type]["submitted"]
    for query_type in ("protein", "genomic")
)
f"{can_normalize_total_not_accepted / can_normalize_total_sum * 100:.2f}% of these are submitted variants"

'51.56% of these are submitted variants'

In [40]:
# Success rate relative to the variants we expected to be able to normalize
f"The Variation Normalizer successfully normalized {can_normalize_total_sum / should_be_able_to_normalize_total_sum * 100:.2f}% of the variants we SHOULD be able to normalize"

'The Variation Normalizer successfully normalized 95.50% of the variants we SHOULD be able to normalize'

In [41]:
# Success rate relative to every accepted + submitted variant
f"The Variation Normalizer successfully normalized {can_normalize_total_sum / len_acc_sub_variants * 100:.2f}% of the total accepted and submitted variants"

'The Variation Normalizer successfully normalized 52.41% of the total accepted and submitted variants'

In [42]:
# Genomic queries rescued by the vrs-python translate_from fallback; they are already
# part of can_normalize_total
f"The Variation Normalizer was not able to normalize {genomic_translate_from_success} genomic variants (count) but vrs-python translate was. These were included in the able to normalize total."

'The Variation Normalizer was not able to normalize 1 genomic variants (count) but vrs-python translate was. These were included in the able to normalize total.'

## Duplicate Queries

These are duplicate queries found in CIViC. The values are the associated variant IDs.

In [43]:
# Keep only the queries that were produced by more than one CIViC variant
{k: v for k, v in queries_found.items() if len(v) > 1}

{'ERBB2 A775_G776insYVMA': [2658, 4483],
 'KRAS G12V': [147, 425],
 'ERBB2 L755P': [1304, 4490],
 'BRAF V600D': [11, 3452],
 'ERBB2 V842I': [45, 4588]}