# Analysis for MOAlmanac data

This notebook contains an analysis of MOAlmanac feature (variant) data.

In [1]:
# Load environment variables. This MUST be the first cell.
# NOTE(review): presumably so the variables are already set when the imports in the
# next cell run — confirm.
from dotenv import load_dotenv

# Relative path, so it is resolved against the notebook's working directory.
load_dotenv("../../../.env.shared")

True

In [2]:
import csv
import json  # noqa: F401
import logging
import os
import re
import sys
import zipfile  # noqa: F401
from typing import Dict, Tuple, Optional

import requests  # noqa: F401
from datetime import datetime  # noqa: F401
from variation.query import QueryHandler

# Put the directory two levels above the working directory on the import path
# (only once), so that the `utils` import that follows can be resolved.
module_path = os.path.abspath("../..")
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from utils import NotSupportedVariantCategory, MoaItemType, load_latest_moa_zip  # noqa: E402


# Send log records to a file in the working directory.
# No `level` is passed to basicConfig, so the root logger's level is not changed here.
log_filename = "moa-feature-analysis.log"
logging.basicConfig(
    filename=log_filename,
    format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s",
)

  import pkg_resources


In [3]:
query_handler = QueryHandler()

***Using Gene Database Endpoint: http://localhost:8000***


In [4]:
# # Uncomment to get latest moa data and comment out the next cell
# for k in ["features", "assertions"]:
#     r = requests.get(f"https://moalmanac.org/api/{k}")
#     variants_resp = r.json()
#     date = datetime.today().strftime("%Y%m%d")
#     fn = f"moa_{k}_{date}.json"
#     with zipfile.ZipFile(f"{fn}.zip", "w") as zf:
#         with open(f"moa_{k}_{date}.json", "w+") as wf:
#             wf.write(json.dumps(variants_resp, indent=4))
#         zf.write(f"moa_{k}_{date}.json")

In [5]:
# Use latest zip that has been pushed to the repo
variants_resp = load_latest_moa_zip(MoaItemType.FEATURE)

Using moa_features_20250717.json for MOA features


In [6]:
# Map MOA feature types to the category recorded for them.
# `get_category` looks a feature type up here whenever it is not "somatic_variant" or
# "germline_variant" (those are categorised from the record's `variant_annotation`
# instead), so a feature type missing from this mapping raises KeyError there.
MOA_FT_TO_VARIANT_CATEGORIES = {
    "rearrangement": NotSupportedVariantCategory.REARRANGEMENT,
    "copy_number": NotSupportedVariantCategory.COPY_NUMBER,
    "microsatellite_stability": NotSupportedVariantCategory.OTHER,
    "mutational_signature": NotSupportedVariantCategory.OTHER,
    "mutational_burden": NotSupportedVariantCategory.OTHER,
    "neoantigen_burden": NotSupportedVariantCategory.OTHER,
    "knockdown": NotSupportedVariantCategory.EXPRESSION,
    "silencing": NotSupportedVariantCategory.EXPRESSION,
    "aneuploidy": NotSupportedVariantCategory.COPY_NUMBER,
}

Create functions to be used later

In [7]:
def get_feature(variant_record: Dict, feature_type: str) -> Tuple[Optional[str], bool]:
    """Get feature label to be used in the variation-normalizer and whether or not it's
    supported by the variation-normalizer

    Fields that are missing or ``None`` are left out of the label rather than being
    rendered as the literal text "None". When no usable field is left, the label is
    ``None`` (or an empty string), so callers can detect "no query available" with a
    plain truthiness check.

    :param variant_record: MOA variant
    :param feature_type: MOA feature type for `variant_record`
    :return: Tuple containing the feature label and whether or not it's supported by the
        variation-normalizer. The label is ``None`` for an unrecognised `feature_type`.
    """

    def _text(key: str) -> str:
        """Return the value stored under `key` as text, or "" when missing or None."""
        value = variant_record.get(key)
        return "" if value is None else str(value)

    supported = False
    feature = None

    if feature_type == "rearrangement":
        gene1 = variant_record["gene1"] if variant_record.get("gene1") else ""
        gene2 = f"--{variant_record['gene2']}" if variant_record.get("gene2") else ""
        locus = f" {variant_record['locus']}" if variant_record.get("locus") else ""
        rearrangement_type = (
            f" {variant_record['rearrangement_type']}"
            if variant_record.get("rearrangement_type")
            else ""
        )
        feature = f"{gene1}{gene2}{locus}{rearrangement_type}"
    elif feature_type in {"somatic_variant", "germline_variant"}:
        gene = variant_record["gene"] if variant_record.get("gene") else ""
        protein_change = (
            f" {variant_record['protein_change']}"
            if variant_record.get("protein_change")
            else ""
        )
        feature = f"{gene}{protein_change}"
        supported = True
    elif feature_type == "copy_number":
        # These are all {gene} Amplification|Deletion
        gene = _text("gene")
        direction = _text("direction")
        feature = " ".join(part for part in (gene, direction) if part) or None
        # Without a gene there is nothing to normalize, whatever the direction says
        supported = bool(gene) and direction == "Amplification"
    elif feature_type == "microsatellite_stability":
        # The only one is MSI-High
        feature = _text("status") or None
    elif feature_type == "mutational_signature":
        cs = _text("cosmic_signature")
        feature = f"COSMIC Signature {cs}" if cs else None
    elif feature_type == "mutational_burden":
        clss = _text("classification")
        min_mut = variant_record.get("minimum_mutations")
        mut_per_mb = variant_record.get("mutations_per_mb")
        muts = (
            f" (>= {min_mut} mutations)"
            if min_mut
            else (f" (>= {mut_per_mb} mutations/Mb)" if mut_per_mb else "")
        )
        # Drop the separating space when there is no classification in front of it
        feature = (f"{clss}{muts}" if clss else muts.lstrip()) or None
    elif feature_type == "neoantigen_burden":
        # Doesn't seem like there are any of these
        feature = _text("classification") or None
    elif feature_type == "knockdown" or feature_type == "silencing":
        gene = _text("gene")
        technique = (
            f" ({variant_record['technique']})"
            if variant_record.get("technique")
            else ""
        )
        feature = (f"{gene}{technique}" if gene else technique.lstrip()) or None
    elif feature_type == "aneuploidy":
        # The only one is Whole genome doubling
        feature = _text("event") or None

    return feature, supported


def get_category(
    feature_type: str,
    variant_record: Dict,
) -> NotSupportedVariantCategory:
    """Determine the category a MOA feature is filed under

    :param feature_type: MOA feature type for `variant_record`
    :param variant_record: MOA variant
    :raises NotImplementedError: If the record's `variant_annotation` has no category
        assigned to it
    :return: Category for MOA feature
    """
    # Everything except somatic/germline variants has one fixed category per type
    if feature_type not in {"somatic_variant", "germline_variant"}:
        return MOA_FT_TO_VARIANT_CATEGORIES[feature_type]

    annotation = variant_record.get("variant_annotation")

    if not annotation:
        return NotSupportedVariantCategory.REGION_DEFINED

    if annotation in {"Nonsense", "Oncogenic Mutations", "Activating mutation"}:
        return NotSupportedVariantCategory.GENE_FUNCTION

    if annotation in {"Frameshift", "Insertion", "Missense"}:
        return NotSupportedVariantCategory.SEQUENCE

    if annotation == "Splice Site":
        return NotSupportedVariantCategory.OTHER

    if annotation == "Deletion":
        # A deletion with an exon on the record is filed as a sequence variant,
        # any other deletion as a gene-function variant
        if variant_record.get("exon"):
            return NotSupportedVariantCategory.SEQUENCE
        return NotSupportedVariantCategory.GENE_FUNCTION

    raise NotImplementedError(f"{annotation} does not map to a variant category")

Create files

In [8]:
# Every output file is opened with `newline=""`, as the csv module requires for files
# handed to `csv.writer`; without it, platforms that translate "\n" on write end up
# with "\r\r\n" row endings. The handles stay open while rows are written and are
# closed explicitly once processing has finished.

# This file contains MOA Variants where there are no queries available.
no_query_wf = open("no_query.tsv", "w", newline="")
no_query_wr = csv.writer(no_query_wf, delimiter="\t")
no_query_wr.writerow(["variant_id", "feature"])

# This file contains all queries we SHOULD be able to normalize
all_queries_wf = open("should_be_able_to_normalize_queries.tsv", "w", newline="")
all_queries_wr = csv.writer(all_queries_wf, delimiter="\t")
all_queries_wr.writerow(["variant_id", "query", "moa_feature_type", "category"])

# This file contains MOAlmanac Variants we do not currently support in Variation Normalizer.
# In these cases, we do not even attempt to try to normalize
not_supported_wf = open("not_supported_variants.tsv", "w", newline="")
not_supported_wr = csv.writer(not_supported_wf, delimiter="\t")
not_supported_wr.writerow(["variant_id", "query", "moa_feature_type", "category"])

# This file contains MOAlmanac Variant queries that were run through the Variation Normalizer,
# but failed to normalize.
unable_to_normalize_wf = open("unable_to_normalize_queries.tsv", "w", newline="")
unable_to_normalize_wr = csv.writer(unable_to_normalize_wf, delimiter="\t")
unable_to_normalize_wr.writerow(
    [
        "variant_id",
        "query",
        "moa_feature_type",
        "category",
        "exception_raised",
        "message",
        "warnings",
    ]
)

# This file contains MOAlmanac Variant queries that were run through the Variation Normalizer,
# and successfully normalized
able_to_normalize_wf = open("able_to_normalize_queries.tsv", "w", newline="")
able_to_normalize_wr = csv.writer(able_to_normalize_wf, delimiter="\t")
able_to_normalize_wr.writerow(
    ["variant_id", "query", "moa_feature_type", "category", "vrs_id"]
)

51

Create variables to store information, such as counts, that will be used later

In [9]:
# Number of unsupported variants found per category, keyed by category name.
# Every category starts at zero.
not_supported_feature_counts = dict.fromkeys(NotSupportedVariantCategory.__members__, 0)

# Running totals, incremented while the MOA features are processed
total_variants = 0  # every feature seen
no_query_total = 0  # features for which no query label could be built
should_be_able_to_normalize_total = 0  # queries handed to the normalizer
can_normalize_total = 0  # ... that normalized
unable_to_normalize_total = 0  # ... that came back without a variation
exception_total = 0  # ... that raised

In [10]:
# Classify every MOA feature, write it to the matching TSV file and, where the feature
# is considered supported, run it through the Variation Normalizer.
# The counters updated here are module-level variables initialised in an earlier cell,
# so re-running this cell without re-running that one adds to the existing totals.
for v in variants_resp:
    total_variants += 1
    variant_id = v["feature_id"]
    variant_record = {"id": variant_id}
    # Only the first entry of `attributes` is used
    variant_record.update(v["attributes"][0])

    feature_type = variant_record["feature_type"]
    feature, supported = get_feature(variant_record, feature_type)

    if not feature:
        # No label could be built: record the raw MOA record instead and move on
        no_query_total += 1
        no_query_wr.writerow([variant_id, v])
        continue

    # Any label that ends in one or more digits is treated as an unsupported SEQUENCE
    # variant. This check runs for every feature type, not only somatic/germline ones.
    # NOTE(review): the first pattern is a subset of the second (a label ending in
    # `fs*<digits>` also ends in digits), and a label consisting only of a symbol that
    # ends in a digit would match as well — confirm that this is intended.
    category_name = None
    if any(
        (
            re.match(r".+fs\*\d+$", feature, re.IGNORECASE),
            re.match(r".+\d+$", feature, re.IGNORECASE),
        )
    ):
        supported = False
        category_name = NotSupportedVariantCategory.SEQUENCE

    if not category_name:
        category_name = get_category(feature_type, variant_record)

    # A label made of a single whitespace-separated token is not attempted either
    if not supported or len(feature.split()) == 1:
        not_supported_feature_counts[category_name.name] += 1
        not_supported_wr.writerow([variant_id, feature, feature_type, category_name])
        continue

    should_be_able_to_normalize_total += 1
    all_queries_wr.writerow([variant_id, feature, feature_type, category_name])

    try:
        # Top-level `await`: this relies on the notebook kernel running the cell inside
        # an event loop (a plain Python script would reject it)
        variation_norm_resp = await query_handler.normalize_handler.normalize(feature)
        if not variation_norm_resp.variation:
            # Normalizer answered but produced no variation; keep its warnings
            unable_to_normalize_wr.writerow(
                [
                    variant_id,
                    feature,
                    feature_type,
                    category_name,
                    False,
                    "unable to normalize",
                    variation_norm_resp.warnings,
                ]
            )
            unable_to_normalize_total += 1
        else:
            can_normalize_total += 1
            vrs_id = variation_norm_resp.variation.id
            able_to_normalize_wr.writerow(
                [variant_id, feature, feature_type, category_name, vrs_id]
            )
    except Exception as e:
        # Exceptions go to the same file as failed normalizations, flagged with
        # exception_raised=True and no warnings, and are counted separately
        unable_to_normalize_wr.writerow(
            [variant_id, feature, feature_type, category_name, True, str(e), None]
        )
        exception_total += 1

In [11]:
# Close files (in the order they were opened) so that everything written so far is
# flushed to disk before any of them is read back
for _tsv_file in (
    no_query_wf,
    all_queries_wf,
    not_supported_wf,
    unable_to_normalize_wf,
    able_to_normalize_wf,
):
    _tsv_file.close()

In [12]:
f"Total number of variants in MOAlmanac: {total_variants}"

'Total number of variants in MOAlmanac: 452'

# Variants that we could not find queries for

In [13]:
no_query_total

0

In [14]:
f"{no_query_total / total_variants * 100:.2f}% of the total features had no queries"

'0.00% of the total features had no queries'

# Variants we do not support

In [15]:
do_not_support_total_sum = sum(not_supported_feature_counts.values())
f"Total number of variants we do not support in the Variation Normalizer: {do_not_support_total_sum}"

'Total number of variants we do not support in the Variation Normalizer: 256'

In [16]:
f"The Variation Normalizer does not support {do_not_support_total_sum / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer does not support 56.64% of the total variants'

Below is the total number of variants that we do not support, broken down by variant category

In [17]:
sorted_not_sup_counts = dict(
    sorted(not_supported_feature_counts.items(), key=lambda x: x[1], reverse=True)
)
sorted_not_sup_counts

{'SEQUENCE': 127,
 'REGION_DEFINED': 40,
 'REARRANGEMENT': 35,
 'COPY_NUMBER': 23,
 'OTHER': 12,
 'EXPRESSION': 11,
 'GENE_FUNCTION': 8,
 'GENOTYPE_AND_HAPLOTYPE': 0,
 'FUSION': 0,
 'EPIGENETIC_MODIFICATION': 0,
 'GENOME_FEATURE': 0,
 'TRANSCRIPT': 0}

Below is the percentage of the total variants that each unsupported variant category accounts for

In [18]:
{k: f"{v / total_variants * 100:.2f}%" for k, v in sorted_not_sup_counts.items()}

{'SEQUENCE': '28.10%',
 'REGION_DEFINED': '8.85%',
 'REARRANGEMENT': '7.74%',
 'COPY_NUMBER': '5.09%',
 'OTHER': '2.65%',
 'EXPRESSION': '2.43%',
 'GENE_FUNCTION': '1.77%',
 'GENOTYPE_AND_HAPLOTYPE': '0.00%',
 'FUSION': '0.00%',
 'EPIGENETIC_MODIFICATION': '0.00%',
 'GENOME_FEATURE': '0.00%',
 'TRANSCRIPT': '0.00%'}

# Variants we should be able to normalize

In [19]:
should_be_able_to_normalize_total

196

In [20]:
f"The Variation Normalizer SHOULD be able to normalize {should_be_able_to_normalize_total / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer SHOULD be able to normalize 43.36% of the total variants'

# Variants we were not able to normalize

Either due to a bug or an unsupported query type in the Variation Normalizer

In [21]:
unable_to_normalize_total

0

In [22]:
f"The Variation Normalizer was unable to normalize {unable_to_normalize_total / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer was unable to normalize 0.00% of the total variants'

## Breakdown of the variants we weren't able to normalize

In this section, we break down the reasons why we weren't able to normalize variants.

In [23]:
# Tally the rows of the "unable to normalize" file by the text found in their last
# column ("warnings").
# NOTE(review): rows written for raised exceptions carry no warnings, so they fall
# into `other` here as well as being counted in `exception_total` — confirm whether
# that overlap is intended.
unable_to_tokenize = 0
unable_to_find_valid = 0
other = 0
# `newline=""` is how the csv module expects files given to `csv.reader` to be opened
with open("unable_to_normalize_queries.tsv", "r", newline="") as f:
    reader = csv.reader(f, delimiter="\t")
    next(reader, None)  # skip the header row; tolerate an empty file
    for row in reader:
        if not row:
            # Blank line (e.g. from doubled line endings); there is no column to inspect
            continue
        if "Unable to find valid result" in row[-1]:
            unable_to_find_valid += 1
        elif "Unable to tokenize" in row[-1]:
            unable_to_tokenize += 1
        else:
            other += 1

### Due to not passing validation checks

The Variation Normalizer performs validation checks on the input query (such as reference sequence). If these validation checks fail, then the input query will fail to normalize.

In [24]:
f"The Variation Normalizer found {unable_to_find_valid} invalid variants (This is {unable_to_find_valid / total_variants * 100:.2f}% of the total variants)."

'The Variation Normalizer found 0 invalid variants (This is 0.00% of the total variants).'

### Due to tokenization 

The Variation Normalizer tokenizes the input query and determines the kind of each token. It only accepts a limited set of token kinds, so queries that it cannot tokenize are not yet supported in the Variation Normalizer.

In [25]:
f"The Variation Normalizer was unable to tokenize {unable_to_tokenize} variants ({unable_to_tokenize / total_variants * 100:.2f}% of the total variants)."

'The Variation Normalizer was unable to tokenize 0 variants (0.00% of the total variants).'

In [26]:
f"The Variation Normalizer was unable to normalize {other} variants due to other issues (This is {other / total_variants * 100:.2f}% of the total variants)."

'The Variation Normalizer was unable to normalize 0 variants due to other issues (This is 0.00% of the total variants).'

## Variant queries that raised an exception during normalization

In [27]:
f"The Variation Normalizer raised an exception for {exception_total / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer raised an exception for 0.00% of the total variants'

# Variants we were able to normalize

In [28]:
can_normalize_total

196

In [29]:
f"The Variation Normalizer successfully normalized {can_normalize_total / should_be_able_to_normalize_total * 100:.2f}% of the variants we SHOULD be able to normalize"

'The Variation Normalizer successfully normalized 100.00% of the variants we SHOULD be able to normalize'

In [30]:
f"The Variation Normalizer successfully normalized {can_normalize_total / total_variants * 100:.2f}% of the total variants"

'The Variation Normalizer successfully normalized 43.36% of the total variants'