# GENIE Analysis

This notebook uses `data_mutations_extended.txt` from Synapse. You will need to create an account to download the data from [here](https://www.synapse.org/#!Synapse:syn51355986). The notebook expects `data_mutations_extended.txt` to be in the same directory as the notebook.

In [1]:

import logging
import csv
from datetime import datetime

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from variation.query import QueryHandler
from variation.schemas.service_schema import ClinVarAssembly
from tqdm import tqdm

# Silence INFO/DEBUG chatter from imported libraries during the long
# normalization loops. Calling getLogger() with no name is the documented way to
# obtain the root logger; the name "root" only maps to it on Python >= 3.9.
logging.getLogger().setLevel(logging.WARNING)



## Create dataframe

In [2]:
# Get GENIE Variant data
# Only the columns needed to build the two kinds of normalizer query are loaded.
genie_columns = [
    "Hugo_Symbol",
    "NCBI_Build",
    "Chromosome",
    "Start_Position",
    "End_Position",
    "Reference_Allele",
    "Tumor_Seq_Allele2",
    "HGVSp_Short",
    "dbSNP_RS",
]
# Every column is read as text: this skips pandas' per-chunk dtype inference
# (the source of mixed-type warnings on large files) and keeps positions in
# their exact textual form (an integer column containing blanks would otherwise
# be parsed as float and stringify as e.g. "123.0").
genie_variants_df = pd.read_csv(
    "data_mutations_extended.txt",
    sep="\t",
    usecols=genie_columns,
    dtype=str,
)

# Some values are '-'; turn them into NaN so the rows are dropped below
genie_variants_df = genie_variants_df.replace({"-": np.nan})

# Protein free-text query: "<gene symbol> <short protein change>".
# Only built for rows where both parts are present.
has_protein_parts = (
    genie_variants_df[["Hugo_Symbol", "HGVSp_Short"]].notna().all(axis=1)
)
genie_variants_df["free_text_p_short"] = (
    genie_variants_df["Hugo_Symbol"] + " " + genie_variants_df["HGVSp_Short"]
).where(has_protein_parts)

# Genomic query: "<chromosome>-<start>-<ref>-<alt>".
# End_Position and NCBI_Build are required to be present but are not part of
# the query string itself.
required_for_coordinates = [
    "Chromosome",
    "Start_Position",
    "End_Position",
    "Reference_Allele",
    "Tumor_Seq_Allele2",
    "NCBI_Build",
]
has_coordinate_parts = (
    genie_variants_df[required_for_coordinates].notna().all(axis=1)
)
genie_variants_df["coordinates"] = (
    genie_variants_df["Chromosome"]
    + "-"
    + genie_variants_df["Start_Position"]
    + "-"
    + genie_variants_df["Reference_Allele"]
    + "-"
    + genie_variants_df["Tumor_Seq_Allele2"]
).where(has_coordinate_parts)

# Keep only rows for which both query forms could be built
genie_variants_df = genie_variants_df.dropna(
    subset=["free_text_p_short", "coordinates"]
)
genie_variants_df.shape

  genie_variants_df = pd.read_csv(


(1219725, 11)

In [3]:
# Independent copy of the protein free-text query column
free_text_df = genie_variants_df.loc[:, "free_text_p_short"].copy()
free_text_df.shape

(1219725,)

In [4]:
# Repeated queries: every occurrence after the first one
is_repeat_free_text = free_text_df.duplicated()
free_text_dups = free_text_df[is_repeat_free_text]
free_text_dups.shape

(631330,)

In [5]:

# Keep the first occurrence of each query, then materialize as a plain list
free_text_df = free_text_df.drop_duplicates()
free_text_queries = free_text_df.tolist()
len(free_text_queries)


588395

In [6]:
# Independent copy of the genomic coordinate query column
coordinates_df = genie_variants_df.loc[:, "coordinates"].copy()
coordinates_df.shape

(1219725,)

In [7]:
# Repeated queries: every occurrence after the first one
is_repeat_coordinate = coordinates_df.duplicated()
coord_dups = coordinates_df[is_repeat_coordinate]
coord_dups.shape

(580018,)

In [8]:

# Keep the first occurrence of each query, then materialize as a plain list
coordinates_df = coordinates_df.drop_duplicates()
coordinate_queries = coordinates_df.tolist()
len(coordinate_queries)

639707

## Try using /normalize

In this section, we will run the queries through the variation normalizer.

In [9]:
# Load environment variables before the query handler is created.
# They are expected to configure the gene-normalizer DynamoDB instance and the
# UTA DB credentials.
load_dotenv()

True

In [10]:
# Single shared handler; normalize_genie reads this module-level name directly
query_handler = QueryHandler()

***Using Gene Database Endpoint: http://localhost:8000***


In [11]:
async def normalize_genie(queries: list[str], query_type: str) -> None:
    """Normalize each query and record the outcome in one of two files.

    Both files are (re)created in the current working directory and, despite
    the ``.csv`` extension, are tab-delimited:

    * ``able_to_normalize_{query_type}_queries.csv`` with columns ``query``
      and ``vrs_id`` - queries that produced a variation descriptor.
    * ``unable_to_normalize_{query_type}_queries.csv`` with columns ``query``,
      ``exception_raised``, ``message`` and ``warnings`` - queries that raised
      an exception or came back without a variation descriptor.

    Uses the module-level ``query_handler``.

    :param queries: Query strings to send to the normalizer
    :param query_type: Label embedded in the two output file names
    """
    unable_path = f"unable_to_normalize_{query_type}_queries.csv"
    able_path = f"able_to_normalize_{query_type}_queries.csv"

    # The context manager closes (and flushes) both files even when the run is
    # interrupted or cancelled part-way through. newline="" is what the csv
    # module expects so that it controls line endings itself.
    with open(unable_path, "w", newline="") as unable_wf, \
            open(able_path, "w", newline="") as able_wf:
        # GENIE Variant queries that we were not able to normalize
        unable_wr = csv.writer(unable_wf, delimiter="\t")
        unable_wr.writerow(["query", "exception_raised", "message", "warnings"])

        # GENIE Variant queries that we were able to normalize
        able_wr = csv.writer(able_wf, delimiter="\t")
        able_wr.writerow(["query", "vrs_id"])

        for query in tqdm(queries):
            try:
                # NOTE(review): every query is normalized as GRCh37 - confirm
                # that this holds for all inputs passed in.
                variation_norm_resp = await query_handler.normalize_handler.normalize(
                    query, input_assembly=ClinVarAssembly.GRCH37
                )
            except Exception as e:
                # Deliberately broad: one failing query must not abort the
                # whole run, so the error is recorded and the loop continues.
                unable_wr.writerow([query, True, str(e), None])
                continue

            descriptor = variation_norm_resp.variation_descriptor
            if descriptor:
                able_wr.writerow([query, descriptor.variation.id])
            else:
                unable_wr.writerow(
                    [query, False, "unable to normalize", variation_norm_resp.warnings]
                )

In [12]:
await normalize_genie(coordinates_df.values, "genomic")

100%|██████████| 639707/639707 [1:46:37<00:00, 100.00it/s] 


In [13]:
# await normalize_genie(free_text_queries, "protein")

In [14]:
# async def normalize_genie_existing(queries: list[str], query_type: str):
#     """If not finished normalizing...."""
#     date = datetime.today().strftime("%Y%m%d")
    
#     # This file contains GENIE Variant queries that we were not able to normalize.
#     unable_to_normalize_wf = open(f"unable_to_normalize_{query_type}_queries_{date}.csv", "w")
#     unable_to_normalize_wr = csv.writer(unable_to_normalize_wf, delimiter="\t")
#     unable_to_normalize_wr.writerow(["query", "exception_raised", "message", "warnings"])

#     # This file contains GENIE Variant queries that we were able to normalize.
#     able_to_normalize_wf = open(f"able_to_normalize_{query_type}_queries_{date}.csv", "w")
#     able_to_normalize_wr = csv.writer(able_to_normalize_wf, delimiter="\t")
#     able_to_normalize_wr.writerow(["query", "vrs_id"])

#     able_to_normalize_rf = open(f"existing/able_to_normalize_protein_queries.csv", "r")
#     unable_to_normalize_rf = open(f"existing/unable_to_normalize_protein_queries.csv", "r")

#     for query in tqdm(queries):
        
#         try:
#             variation_norm_resp = await query_handler.normalize_handler.normalize(query)
#         except Exception as e:
#             unable_to_normalize_wr.writerow([query, True, str(e), None])
#         else:
#             if variation_norm_resp.variation_descriptor:
#                 vrs_id = variation_norm_resp.variation_descriptor.variation.id
#                 able_to_normalize_wr.writerow([query, vrs_id])
#             else:
#                 unable_to_normalize_wr.writerow(
#                     [query, False, "unable to normalize", variation_norm_resp.warnings]
#                 )

#     # Close all files
#     unable_to_normalize_wf.close()
#     able_to_normalize_wf.close()
#     able_to_normalize_rf.close()
#     unable_to_normalize_rf.close()
    