# GENIE Analysis

This notebook uses `data_mutations_extended.txt` from Synapse. You will need to create a Synapse account to download the data from [here](https://www.synapse.org/#!Synapse:syn51355986). The notebook expects `data_mutations_extended.txt` to be in the same directory.

In [None]:
# List the installed packages whose `pip freeze` line contains "variation"
!pip freeze | grep variation

In [None]:

import ast
import csv
import logging
from datetime import datetime

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm
from variation.query import QueryHandler
from variation.schemas.service_schema import ClinVarAssembly

# Only let WARNING and above through the logger named "root"
logging.getLogger("root").setLevel(logging.WARNING)

## Create dataframe

In [None]:
# Load only the GENIE mutation columns needed to build the queries
genie_variants_df = pd.read_csv(
    "data_mutations_extended.txt",
    sep="\t",
    usecols=[
        "Hugo_Symbol",
        "NCBI_Build",
        "Chromosome",
        "Start_Position",
        "End_Position",
        "Reference_Allele",
        "Tumor_Seq_Allele2",
        "HGVSp_Short",
        "dbSNP_RS",
    ],
)
# "-" placeholders become NaN so that incomplete rows can be dropped below
genie_variants_df = genie_variants_df.replace({"-": np.nan})

# Protein free-text query "<Hugo_Symbol> <HGVSp_Short>", only when both exist
has_protein_fields = (
    genie_variants_df[["Hugo_Symbol", "HGVSp_Short"]].notna().all(axis=1)
)
protein_text = genie_variants_df["Hugo_Symbol"] + " " + genie_variants_df["HGVSp_Short"]
genie_variants_df["free_text_p_short"] = np.where(
    has_protein_fields, protein_text, np.nan
)

# Genomic query "<chromosome>-<start>-<ref>-<alt>". End_Position and NCBI_Build
# are not part of the string but must still be present for the row to qualify.
required_coordinate_fields = [
    "Chromosome",
    "Start_Position",
    "End_Position",
    "Reference_Allele",
    "Tumor_Seq_Allele2",
    "NCBI_Build",
]
has_coordinate_fields = (
    genie_variants_df[required_coordinate_fields].notna().all(axis=1)
)
coordinate_text = (
    genie_variants_df["Chromosome"].astype(str)
    + "-"
    + genie_variants_df["Start_Position"].astype(str)
    + "-"
    + genie_variants_df["Reference_Allele"]
    + "-"
    + genie_variants_df["Tumor_Seq_Allele2"]
)
genie_variants_df["coordinates"] = np.where(
    has_coordinate_fields, coordinate_text, np.nan
)

# Keep only the rows for which both kinds of query could be built
genie_variants_df = genie_variants_df.dropna(
    subset=["free_text_p_short", "coordinates"]
)
genie_variants_df.shape

In [None]:
# Independent copy of the protein free-text query column
free_text_df = genie_variants_df.loc[:, "free_text_p_short"].copy()
free_text_df.shape

In [None]:
# Entries whose query text repeats an earlier entry in the series
is_repeated_free_text = free_text_df.duplicated()
free_text_dups = free_text_df[is_repeated_free_text]
free_text_dups.shape

In [None]:

# Keep one copy of each free-text query and materialise them as a plain list
free_text_df = free_text_df.drop_duplicates()
free_text_queries = list(free_text_df.values)
len(free_text_queries)


In [None]:
# Independent copy of the genomic coordinate query column
coordinates_df = genie_variants_df.loc[:, "coordinates"].copy()
coordinates_df.shape

In [None]:
# Entries whose coordinate string repeats an earlier entry in the series
is_repeated_coordinate = coordinates_df.duplicated()
coord_dups = coordinates_df[is_repeated_coordinate]
coord_dups.shape

In [None]:

# Keep one copy of each coordinate query and materialise them as a plain list
coordinates_df = coordinates_df.drop_duplicates()
coordinate_queries = list(coordinates_df.values)
len(coordinate_queries)

## Try using /normalize

In this section, we will run the queries through the variation normalizer.

In [None]:
# Load environment variables from a .env file. They are expected to provide the
# settings for the gene-normalizer DynamoDB instance and the UTA DB credentials.
load_dotenv()

In [None]:
# Shared handler; the cells below use it for both normalize and translate_from
query_handler = QueryHandler()

In [None]:
def translate_from_genomic(genomic_query: str) -> dict:
    """Run a genomic query through the vrs-python translator's ``translate_from``.

    Used as a fallback for genomic queries that could not be normalized. The
    query is always translated with ``assembly_name="GRCh37"``.

    :param genomic_query: Genomic query string to translate
    :return: Dictionary with two keys: ``vrs_id`` (identifier of the translated
        object, or ``None`` when translation raised) and ``error`` (string form
        of the raised exception, or ``None`` on success)
    """
    try:
        translated = query_handler.vrs_python_tlr.translate_from(
            genomic_query, assembly_name="GRCh37"
        )
    except Exception as exc:
        return {"vrs_id": None, "error": str(exc)}

    # Read the identifier outside the ``try`` so a failure here is not
    # reported as a translation error.
    return {"vrs_id": translated._id._value, "error": None}


In [None]:
def _translate_from_fallback(query: str, query_type: str, warnings: list):
    """Try ``translate_from_genomic`` for a genomic query that failed to normalize.

    :param query: The query that could not be normalized
    :param query_type: Only ``"genomic"`` queries are retried
    :param warnings: Warnings collected so far; the translate_from error is
        appended in place when the fallback also fails
    :return: The VRS ID from translate_from, or ``None`` if the fallback was not
        attempted or did not produce one
    """
    if query_type != "genomic":
        return None

    genomic_resp = translate_from_genomic(query)
    if genomic_resp["vrs_id"]:
        return genomic_resp["vrs_id"]

    warnings.append(genomic_resp["error"])
    return None


async def normalize_genie(queries: list[str], query_type: str):
    """Normalize each query and record the outcome in two tab-delimited files.

    Every query is sent to the variation normalizer with the GRCh37 input
    assembly. Queries that normalize are written to
    ``able_to_normalize_{query_type}_queries.csv``. When normalization raises or
    returns no variation descriptor, genomic queries are retried with
    ``translate_from_genomic``; anything still unresolved is written to
    ``unable_to_normalize_{query_type}_queries.csv`` together with the warnings
    gathered along the way.

    :param queries: Query strings to normalize
    :param query_type: Label used in the output file names; the value
        ``"genomic"`` additionally enables the translate_from fallback
    """
    unable_path = f"unable_to_normalize_{query_type}_queries.csv"
    able_path = f"able_to_normalize_{query_type}_queries.csv"

    # Context managers guarantee both files are closed even if the loop raises
    # or the cell is interrupted part-way through.
    with open(unable_path, "w", newline="") as unable_to_normalize_wf, \
            open(able_path, "w", newline="") as able_to_normalize_wf:
        # This file contains GENIE Variant queries that we were not able to normalize.
        unable_to_normalize_wr = csv.writer(unable_to_normalize_wf, delimiter="\t")
        unable_to_normalize_wr.writerow(["query", "exception_raised", "message", "warnings"])

        # This file contains GENIE Variant queries that we were able to normalize.
        able_to_normalize_wr = csv.writer(able_to_normalize_wf, delimiter="\t")
        able_to_normalize_wr.writerow(["query", "vrs_id", "succeeded_endpoint"])

        for query in tqdm(queries):
            try:
                variation_norm_resp = await query_handler.normalize_handler.normalize(
                    query, input_assembly=ClinVarAssembly.GRCH37
                )
            except Exception as e:
                exception_raised = True
                message = str(e)
                warnings = [message]
            else:
                if variation_norm_resp.variation_descriptor:
                    vrs_id = variation_norm_resp.variation_descriptor.variation.id
                    able_to_normalize_wr.writerow([query, vrs_id, "normalize"])
                    continue

                exception_raised = False
                message = "unable to normalize"
                warnings = sorted(variation_norm_resp.warnings)

            # Normalization failed one way or the other: try the fallback.
            vrs_id = _translate_from_fallback(query, query_type, warnings)
            if vrs_id:
                able_to_normalize_wr.writerow([query, vrs_id, "translate_from"])
            else:
                # Always write the collected warnings as a list so the column
                # can be parsed back uniformly for every row.
                unable_to_normalize_wr.writerow(
                    [query, exception_raised, message, warnings]
                )

In [None]:
await normalize_genie(coordinates_df.values, "genomic")

In [None]:
# await normalize_genie(free_text_queries, "protein")

## Run cells with existing CSVs

This section contains cells that operate on existing CSVs. It exists because normalization takes a long time, so these cells let you work from the saved results instead of re-running it.

In [None]:
# Backfill the `succeeded_endpoint` column for CSVs that were written without it.
# Files that already have the column are left untouched, so rows recorded as
# "translate_from" are not overwritten and re-running this cell is harmless.
df = pd.read_csv("able_to_normalize_genomic_queries.csv", delimiter="\t")
if "succeeded_endpoint" not in df.columns:
    df["succeeded_endpoint"] = "normalize"
    df.to_csv("able_to_normalize_genomic_queries.csv", sep="\t", index=False)

In [None]:
unable_to_norm_updated_rows = []

# Retry every previously failed genomic query with translate_from. Successes are
# appended to the "able" file; remaining failures are collected in memory.
with open("able_to_normalize_genomic_queries.csv", "a", newline="") as a_wf:
    d = csv.DictWriter(a_wf, fieldnames=["query", "vrs_id", "succeeded_endpoint"], delimiter="\t")
    with open("unable_to_normalize_genomic_queries.csv", "r", newline="") as u_rf:
        unable_to_norm = csv.reader(u_rf, delimiter="\t")
        header = next(unable_to_norm)
        unable_to_norm_updated_rows.append(header)

        for unable_to_norm_row in unable_to_norm:
            if not unable_to_norm_row:
                # Tolerate blank lines in the file
                continue

            query, exception_raised, message, warnings = unable_to_norm_row
            # The warnings cell is the text form of a Python list, or empty.
            # literal_eval parses it without executing arbitrary code from the file.
            warnings = ast.literal_eval(warnings) if warnings else []

            genomic_resp = translate_from_genomic(query)

            if genomic_resp["vrs_id"]:
                d.writerow({"query": query, "vrs_id": genomic_resp["vrs_id"], "succeeded_endpoint": "translate_from"})
            else:
                warnings.append(genomic_resp["error"])
                unable_to_norm_updated_rows.append([query, exception_raised, message, warnings])


# Rewrite the "unable" file with the header plus the rows that still fail.
# Appending would duplicate the header and keep the stale rows.
with open("unable_to_normalize_genomic_queries.csv", "w", newline="") as f:
    wr = csv.writer(f, delimiter="\t")
    wr.writerows(unable_to_norm_updated_rows)