# GENIE Pre Variant Analysis

This notebook uses `data_mutations_extended.txt` from Synapse. You will need to create a Synapse account to download the file from [here](https://www.synapse.org/#!Synapse:syn51355986). The notebook expects `data_mutations_extended.txt` to be in the same directory as the notebook itself.

This notebook is used to run GENIE variant data through the variation-normalizer.

In [1]:
# Record which variation-normalizer install (its `pip freeze` line) this notebook was run with
!pip freeze | grep variation

-e git+https://github.com/cancervariants/variation-normalization.git@8b2e582b0a2bb2f6636da2d75ecabffc1388cda6#egg=variation_normalizer


In [2]:

import logging
import csv
from pathlib import Path

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from variation.query import QueryHandler
from variation.schemas.service_schema import ClinVarAssembly
from tqdm import tqdm

# Only show WARNING and above from the root logger.
# getLogger() with no name is the documented way to obtain the root logger;
# getLogger("root") only resolves to it on Python 3.9+ (earlier versions
# create a separate logger that merely happens to be named "root").
logging.getLogger().setLevel(logging.WARNING)



## Create dataframe

In [3]:
# Get GENIE Variant data
genie_variants_df = pd.read_csv(
    "data_mutations_extended.txt",
    sep="\t",
    usecols=[
        "Hugo_Symbol",
        "NCBI_Build",
        "Chromosome",
        "Start_Position",
        "End_Position",
        "Reference_Allele",
        "Tumor_Seq_Allele2",
        "HGVSp_Short",
        "dbSNP_RS",
    ],
    # Infer each column's dtype from the whole file instead of chunk by chunk.
    # NOTE(review): the recorded run of this cell emitted a warning pointing at
    # this read_csv call, presumably pandas' mixed-type DtypeWarning.
    low_memory=False,
)
# Cells that contain only "-" are treated as missing; rows that lack a
# required field are dropped at the end of this cell
genie_variants_df = genie_variants_df.replace({"-": np.nan})

# Free-text protein query "<Hugo_Symbol> <HGVSp_Short>"; NaN unless both parts exist
protein_fields = ["Hugo_Symbol", "HGVSp_Short"]
has_protein_fields = genie_variants_df[protein_fields].notna().all(axis=1)
genie_variants_df["free_text_p_short"] = np.where(
    has_protein_fields,
    genie_variants_df["Hugo_Symbol"] + " " + genie_variants_df["HGVSp_Short"],
    np.nan,
)

# Genomic query "<Chromosome>-<Start_Position>-<Reference_Allele>-<Tumor_Seq_Allele2>".
# End_Position and NCBI_Build must also be present, although neither is part
# of the resulting string.
coordinate_fields = [
    "Chromosome",
    "Start_Position",
    "End_Position",
    "Reference_Allele",
    "Tumor_Seq_Allele2",
    "NCBI_Build",
]
has_coordinate_fields = genie_variants_df[coordinate_fields].notna().all(axis=1)
genie_variants_df["coordinates"] = np.where(
    has_coordinate_fields,
    genie_variants_df["Chromosome"].astype(str)
    + "-" + genie_variants_df["Start_Position"].astype(str)
    + "-" + genie_variants_df["Reference_Allele"]
    + "-" + genie_variants_df["Tumor_Seq_Allele2"],
    np.nan,
)

# Keep only rows for which both query strings could be built
genie_variants_df = genie_variants_df[
    genie_variants_df["free_text_p_short"].notna()
    & genie_variants_df["coordinates"].notna()
]
genie_variants_df.shape

  genie_variants_df = pd.read_csv(


(1219725, 11)

In [4]:
# Create output directory (relative to the working directory); every file this
# notebook writes goes under "variation_normalizer_output/". exist_ok=True makes
# re-running this cell a no-op when the directory already exists.
path = Path("variation_normalizer_output")
path.mkdir(exist_ok=True)

### Protein (free text)

In [5]:
# Independent copy, so the protein-specific steps below leave genie_variants_df untouched
free_text_df = genie_variants_df.copy(deep=True)
free_text_df.shape

(1219725, 11)

In [6]:
# Keep one row (the first occurrence) per distinct protein query string
free_text_df = free_text_df.drop_duplicates(
    subset=["free_text_p_short"],
    keep="first",
)
free_text_df.shape

(588395, 11)

In [7]:
# Persist the de-duplicated protein rows for debugging.
# The file has a .csv extension but is tab-delimited.
free_text_df.to_csv(
    "variation_normalizer_output/free_text_df.csv",
    sep="\t",
    index=False,
)

In [8]:
# Plain Python list of the protein query strings to feed to the normalizer
free_text_queries = free_text_df["free_text_p_short"].tolist()

### Genomic (coordinates)

In [9]:
# Independent copy, so the genomic-specific steps below leave genie_variants_df untouched
coordinates_df = genie_variants_df.copy(deep=True)
coordinates_df.shape

(1219725, 11)

In [10]:
# Keep one row (the first occurrence) per distinct coordinates query string
coordinates_df = coordinates_df.drop_duplicates(
    subset=["coordinates"],
    keep="first",
)
coordinates_df.shape

(639707, 11)

In [11]:
# Persist the de-duplicated genomic rows for debugging.
# The file has a .csv extension but is tab-delimited.
coordinates_df.to_csv(
    "variation_normalizer_output/coordinates_df.csv",
    sep="\t",
    index=False,
)

In [12]:
# Plain Python list of the genomic query strings to feed to the normalizer
coordinates_queries = coordinates_df["coordinates"].tolist()

## Try using /normalize

In this section, we will run the queries through the variation normalizer.

In [13]:
# Load environment variables via python-dotenv before creating the QueryHandler.
# They are expected to configure the gene-normalizer DynamoDB instance and the
# UTA DB credentials.
load_dotenv()

True

In [14]:
# Single shared handler used by the functions below: normalize_genie calls
# query_handler.normalize_handler.normalize and translate_from_genomic calls
# query_handler.vrs_python_tlr.translate_from.
query_handler = QueryHandler()

***Using Gene Database Endpoint: http://localhost:8000***


In [15]:
def translate_from_genomic(genomic_query: str) -> dict:
    """Try vrs-python ``translate_from`` on a genomic query that failed to normalize.

    The query is translated against the GRCh37 assembly.

    :param genomic_query: Genomic query string to translate
    :return: Dictionary with keys ``vrs_id`` and ``error``. On success ``vrs_id``
        holds the identifier read from the translated object and ``error`` is
        ``None``; if ``translate_from`` raises, ``vrs_id`` is ``None`` and
        ``error`` holds the exception message.
    """
    try:
        translated = query_handler.vrs_python_tlr.translate_from(
            genomic_query, assembly_name="GRCh37"
        )
    except Exception as exc:
        # Any translation failure is reported to the caller rather than raised
        return {"vrs_id": None, "error": str(exc)}

    return {"vrs_id": translated._id._value, "error": None}


In [16]:
async def normalize_genie(queries: list[str], query_type: str) -> None:
    """Normalize GENIE queries and record each outcome in one of two files.

    Every query is passed to ``query_handler.normalize_handler.normalize`` with
    GRCh37 as the input assembly. If that call raises, or returns a response
    without a variation descriptor, and ``query_type`` is ``"genomic"``, the
    query is retried with ``translate_from_genomic``.

    Two tab-delimited files are written under ``variation_normalizer_output/``:

    * ``able_to_normalize_{query_type}_queries.csv`` with columns ``query``,
      ``vrs_id`` and ``succeeded_endpoint`` (``"normalize"`` or
      ``"translate_from"``).
    * ``unable_to_normalize_{query_type}_queries.csv`` with columns ``query``,
      ``exception_raised``, ``message`` and ``warnings``.

    :param queries: Query strings to normalize
    :param query_type: Label used in the output file names; the value
        ``"genomic"`` additionally enables the ``translate_from`` fallback
    """
    output_dir = "variation_normalizer_output"
    unable_path = f"{output_dir}/unable_to_normalize_{query_type}_queries.csv"
    able_path = f"{output_dir}/able_to_normalize_{query_type}_queries.csv"

    # Context managers close (and flush) both files even when the loop raises or
    # the run is interrupted part-way. newline="" is what the csv module asks
    # for so that it controls line endings itself.
    with open(unable_path, "w", newline="") as unable_to_normalize_wf, \
            open(able_path, "w", newline="") as able_to_normalize_wf:
        # This file contains GENIE Variant queries that we were not able to normalize.
        unable_to_normalize_wr = csv.writer(unable_to_normalize_wf, delimiter="\t")
        unable_to_normalize_wr.writerow(
            ["query", "exception_raised", "message", "warnings"]
        )

        # This file contains GENIE Variant queries that we were able to normalize.
        able_to_normalize_wr = csv.writer(able_to_normalize_wf, delimiter="\t")
        able_to_normalize_wr.writerow(["query", "vrs_id", "succeeded_endpoint"])

        for query in tqdm(queries):
            exception_message = None
            try:
                variation_norm_resp = await query_handler.normalize_handler.normalize(
                    query, input_assembly=ClinVarAssembly.GRCH37
                )
            except Exception as e:
                exception_message = str(e)
                warnings = [exception_message]
            else:
                descriptor = variation_norm_resp.variation_descriptor
                if descriptor:
                    able_to_normalize_wr.writerow(
                        [query, descriptor.variation.id, "normalize"]
                    )
                    continue
                warnings = sorted(variation_norm_resp.warnings)

            # Shared fallback for both failure modes (exception / no descriptor)
            if query_type == "genomic":
                genomic_resp = translate_from_genomic(query)
                if genomic_resp["vrs_id"]:
                    able_to_normalize_wr.writerow(
                        [query, genomic_resp["vrs_id"], "translate_from"]
                    )
                    continue
                warnings.append(genomic_resp["error"])

            if exception_message is not None:
                # NOTE(review): the warnings column is deliberately left empty
                # here to keep the output identical to earlier runs, which means
                # the translate_from error collected above is not recorded for
                # queries whose normalize call raised. Confirm whether
                # `warnings` should be written instead.
                unable_to_normalize_wr.writerow(
                    [query, True, exception_message, None]
                )
            else:
                unable_to_normalize_wr.writerow(
                    [query, False, "unable to normalize", warnings]
                )

In [17]:
# Normalize every unique protein query. Long-running: the recorded run of this
# cell took roughly 49 hours for 588,395 queries.
await normalize_genie(free_text_queries, "protein")

100%|██████████| 588395/588395 [48:59:57<00:00,  3.34it/s]    


In [18]:
# Normalize every unique genomic query ("genomic" enables the translate_from
# fallback). The recorded run of this cell took roughly 1 hour 52 minutes for
# 639,707 queries.
await normalize_genie(coordinates_queries, "genomic")

100%|██████████| 639707/639707 [1:52:21<00:00, 94.89it/s]   
