# GENIE Pre Variant Analysis

This notebook uses `data_mutations_extended.txt` from Synapse. You will need to create an account to download the data from [here](https://www.synapse.org/Synapse:syn68719440). The notebook expects `data_mutations_extended.txt` to be in the same directory as the notebook.

This notebook is used to run GENIE variant data through the variation-normalizer.

In [None]:
# Load environment variables. This MUST be the first cell.
# The env file is resolved relative to the working directory (three levels up).
# NOTE(review): a later cell calls load_dotenv(".env.shared"), i.e. a file in the
# working directory itself — confirm which of the two locations is intended.
from dotenv import load_dotenv

load_dotenv("../../../.env.shared")

In [1]:
import logging
import csv
from pathlib import Path

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from variation.query import QueryHandler
from variation.schemas.service_schema import ClinVarAssembly
from tqdm import tqdm


# Route log records to a file instead of the notebook output. No level is passed,
# so the root logger keeps its default threshold.
log_filename = "genie-pre-variant-analysis.log"
logging.basicConfig(
    format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s",
    filename=log_filename,
)

  import pkg_resources


In [None]:
# Download cool-seq-tool files. This must be done before initializing QueryHandler,
# which is why the import and call live here rather than in the imports cell.
# `is_docker_env=False` is passed because the notebook is run outside Docker
# (presumably this selects local paths — TODO confirm against the helper module).
from download_cool_seq_tool_files import download_cool_seq_tool_files
download_cool_seq_tool_files(is_docker_env=False)

## Create dataframe

In [2]:
# Make sure the directory that receives this notebook's TSV files exists.
# Re-running the cell is harmless: an already existing directory is accepted.
path = Path("variation_normalizer_output")
path.mkdir(parents=False, exist_ok=True)

In [3]:
# Get GENIE Variant data
# `low_memory=False` makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk. This avoids the mixed-type `DtypeWarning` that chunked
# parsing can raise and keeps the string conversions below consistent across rows.
genie_variants_df = pd.read_csv(
    "data_mutations_extended.txt",
    sep="\t",
    usecols=[
        "Hugo_Symbol",
        "NCBI_Build",
        "Chromosome",
        "Start_Position",
        "End_Position",
        "Tumor_Sample_Barcode",
        "Reference_Allele",
        "Tumor_Seq_Allele2",
        "HGVSp_Short",
        "dbSNP_RS",
    ],
    low_memory=False,
)
# Some positions are '-'; turn them into NaN so the affected rows get no query below.
# The replacement is applied to every column, so a '-' in any other field
# (including the allele columns) is treated as missing as well.
genie_variants_df = genie_variants_df.replace({"-": np.nan})

# Create free text (hgvs like) query for protein representation: f"{gene} {p_change}"
# Rows missing either the gene symbol or the protein change get NaN.
genie_variants_df["free_text_p_short"] = np.where(
    genie_variants_df["Hugo_Symbol"].notna()
    & genie_variants_df["HGVSp_Short"].notna(),
    genie_variants_df["Hugo_Symbol"] + " " + genie_variants_df["HGVSp_Short"],
    np.nan,
)

# Create gnomad vcf like query for genomic representation: f"{chromosome}-{pos}-{ref}-{alt}"
# `End_Position` and `NCBI_Build` are not part of the query string, but a row must
# still have them to be given a genomic query.
genie_variants_df["coordinates"] = np.where(
    genie_variants_df["Chromosome"].notna()
    & genie_variants_df["Start_Position"].notna()
    & genie_variants_df["End_Position"].notna()
    & genie_variants_df["Reference_Allele"].notna()
    & genie_variants_df["Tumor_Seq_Allele2"].notna()
    & genie_variants_df["NCBI_Build"].notna(),
    genie_variants_df["Chromosome"].astype(str)
    + "-"
    + genie_variants_df["Start_Position"].astype(str)
    + "-"
    + genie_variants_df["Reference_Allele"]
    + "-"
    + genie_variants_df["Tumor_Seq_Allele2"],
    np.nan,
)

# Create genie df csv
genie_variants_df.to_csv(
    "variation_normalizer_output/genie_variants_df.tsv", sep="\t", index=False
)

genie_variants_df.shape

  genie_variants_df = pd.read_csv(


(2738934, 12)

### Protein (free text)

In [4]:
# Keep only the rows for which a protein free text query could be built
free_text_df = genie_variants_df.dropna(subset=["free_text_p_short"])
free_text_df.shape

(2686363, 12)

In [5]:
# Keep the first row seen for each distinct protein query
free_text_df = free_text_df.drop_duplicates(
    subset=["free_text_p_short"], keep="first"
)
free_text_df.shape

(954230, 12)

In [6]:
# Persist the de-duplicated protein rows as TSV (only needed for debugging)
free_text_df.to_csv(
    path_or_buf="variation_normalizer_output/free_text_df.tsv",
    sep="\t",
    index=False,
)

In [7]:
# Collect the unique protein free text queries, in dataframe order
free_text_queries = list(free_text_df["free_text_p_short"])

### Genomic (coordinates)

In [8]:
# Keep only the rows for which a genomic (gnomad vcf like) query could be built
coordinates_df = genie_variants_df.dropna(subset=["coordinates"])
coordinates_df.shape

(2397877, 12)

In [9]:
# Keep the first row seen for each distinct genomic query
coordinates_df = coordinates_df.drop_duplicates(
    subset=["coordinates"], keep="first"
)
coordinates_df.shape

(963850, 12)

In [10]:
# Persist the de-duplicated genomic rows as TSV (only needed for debugging)
coordinates_df.to_csv(
    path_or_buf="variation_normalizer_output/coordinates_df.tsv",
    sep="\t",
    index=False,
)

In [11]:
# Collect the unique genomic gnomad vcf queries, in dataframe order
coordinates_queries = list(coordinates_df["coordinates"])

## Try using /normalize

In this section, we will run the queries through the variation normalizer.

In [12]:
# Environment variables are set for gene-normalizer dynamodb instance and
# UTA DB credentials
# NOTE(review): the first cell of the notebook loads "../../../.env.shared" while
# this call reads ".env.shared" from the working directory — confirm whether both
# loads are needed and which path is the intended one.
load_dotenv(".env.shared")

True

In [13]:
# Module-level variation-normalizer handler shared by `translate_from_genomic` and
# `normalize_genie` below. The cool-seq-tool files must already be downloaded and
# the environment variables loaded before this cell runs.
query_handler = QueryHandler()

***Using Gene Database Endpoint: http://localhost:8000***


In [14]:
def translate_from_genomic(genomic_query: str) -> dict:
    """Fallback for genomic queries that failed to normalize.

    Passes the query straight to the vrs-python translator held by the
    module-level ``query_handler``, using the GRCh37 assembly.

    :param genomic_query: Genomic query string to translate
    :return: Dictionary with the keys ``vrs_id`` and ``error``. On success
        ``vrs_id`` holds the ``id`` of the translated object and ``error`` is
        ``None``; if the translator raises, ``vrs_id`` is ``None`` and ``error``
        holds the exception message.
    """
    try:
        translated = query_handler.vrs_python_tlr.translate_from(
            genomic_query, assembly_name="GRCh37"
        )
    except Exception as exc:
        # Best effort: report the failure to the caller instead of raising
        return {"vrs_id": None, "error": str(exc)}

    return {"vrs_id": translated.id, "error": None}

In [15]:
async def normalize_genie(queries: list[str], query_type: str) -> None:
    """Run GENIE queries through the variation normalizer and record the outcome.

    Two TSV files are (re)written in ``variation_normalizer_output``:

    * ``able_to_normalize_{query_type}_queries.tsv`` with the columns ``query``,
      ``vrs_id`` and ``succeeded_endpoint`` (``"normalize"`` or
      ``"translate_from"``)
    * ``unable_to_normalize_{query_type}_queries.tsv`` with the columns ``query``,
      ``exception_raised``, ``message`` and ``warnings``

    When ``query_type`` is ``"genomic"``, a query that could not be normalized is
    retried with ``translate_from_genomic`` before it is reported as a failure.

    Both files are managed by a ``with`` block so they are flushed and closed even
    if the run is interrupted (e.g. the coroutine is cancelled part-way through).

    :param queries: Queries to send to the normalizer, processed in order
    :param query_type: Label used in the output file names. The value
        ``"genomic"`` additionally enables the ``translate_from`` fallback.
    """
    output_dir = "variation_normalizer_output"
    unable_path = f"{output_dir}/unable_to_normalize_{query_type}_queries.tsv"
    able_path = f"{output_dir}/able_to_normalize_{query_type}_queries.tsv"
    retry_with_translate_from = query_type == "genomic"

    # newline="" is what the csv module expects for files handed to csv.writer;
    # without it, platforms that translate newlines end up with extra "\r".
    with open(unable_path, "w", newline="") as unable_to_normalize_wf, open(
        able_path, "w", newline=""
    ) as able_to_normalize_wf:
        # This file contains GENIE Variant queries that we were not able to normalize.
        unable_to_normalize_wr = csv.writer(unable_to_normalize_wf, delimiter="\t")
        unable_to_normalize_wr.writerow(
            ["query", "exception_raised", "message", "warnings"]
        )

        # This file contains GENIE Variant queries that we were able to normalize.
        able_to_normalize_wr = csv.writer(able_to_normalize_wf, delimiter="\t")
        able_to_normalize_wr.writerow(["query", "vrs_id", "succeeded_endpoint"])

        for query in tqdm(queries):
            # Defaults describe the "normalizer returned no variation" failure;
            # they are overwritten below if the normalizer raised instead.
            exception_raised = False
            message = "unable to normalize"
            warnings = None

            try:
                variation_norm_resp = await query_handler.normalize_handler.normalize(
                    query, input_assembly=ClinVarAssembly.GRCH37
                )
            except Exception as e:
                # The exception text goes in the `message` column; the `warnings`
                # column is left empty for this kind of failure.
                exception_raised = True
                message = str(e)
            else:
                if variation_norm_resp.variation:
                    able_to_normalize_wr.writerow(
                        [query, variation_norm_resp.variation.id, "normalize"]
                    )
                    continue
                warnings = sorted(variation_norm_resp.warnings)

            # If genomic, try not lifting over to see if we can normalize
            if retry_with_translate_from:
                genomic_resp = translate_from_genomic(query)

                if genomic_resp["vrs_id"]:
                    able_to_normalize_wr.writerow(
                        [query, genomic_resp["vrs_id"], "translate_from"]
                    )
                    continue

                if warnings is not None:
                    warnings.append(genomic_resp["error"])

            unable_to_normalize_wr.writerow(
                [query, exception_raised, message, warnings]
            )

In [16]:
# Normalize every unique protein free text query. Results are written to
# variation_normalizer_output/{able,unable}_to_normalize_protein_queries.tsv.
# This is a long-running cell: the recorded run was cancelled after a couple of
# minutes, well before completing the full query list.
await normalize_genie(free_text_queries, "protein")

  0%|          | 223/954230 [01:56<138:46:19,  1.91it/s]


CancelledError: 

In [17]:
# Normalize every unique genomic query; queries that fail are retried with
# vrs-python `translate_from`. Results are written to
# variation_normalizer_output/{able,unable}_to_normalize_genomic_queries.tsv.
# This is a long-running cell: the recorded run was cancelled before completing.
await normalize_genie(coordinates_queries, "genomic")

  0%|          | 2063/963850 [01:19<10:15:26, 26.05it/s]


CancelledError: 