# GENIE Search Analysis

This notebook performs set intersections to determine matches between normalized variants from GENIE and normalized variants from CIViC, Molecular Oncology Almanac, and ClinVar.

## Prerequisites
The following notebook must be run before running this analysis: 

- `genie/pre_variant_analysis/genie_pre_variant_analysis.ipynb`

This notebook uses `data_mutations_extended.txt` from Synapse. You will need to create a Synapse account to download the data from [this Synapse project](https://www.synapse.org/#!Synapse:syn51355986). The notebook expects `data_mutations_extended.txt` to be in the same directory as this notebook.

In [1]:
# Import relevant libraries
import gzip
from collections.abc import Iterable
from pathlib import Path

import boto3
import ndjson
import numpy as np
import pandas as pd
from botocore.config import Config
from dotenv import load_dotenv

In [2]:
# Load environment variables via python-dotenv; the saved output `True` is this call's return value.
# NOTE(review): presumably this supplies the AWS settings used by the boto3 call further down -- confirm.
load_dotenv()

True

## Load Data (CIViC, Molecular Oncology Almanac, ClinVar, GENIE)

In [3]:
# Read the normalized CIViC variants (the file is tab-separated despite its .csv extension)
civic_variants = pd.read_csv(
    "../../civic/variation_analysis/able_to_normalize_queries.csv",
    sep="\t",
)

In [4]:
# Read the normalized MOA variants (the file is tab-separated despite its .csv extension)
moa_variants = pd.read_csv(
    "../../moa/feature_analysis/able_to_normalize_queries.csv",
    sep="\t",
)

In [5]:
# Load normalized variants for ClinVar. Please sign in through `aws sso login` before running the cell.
# Create an S3 resource handle configured for the us-east-2 region.
s3 = boto3.resource("s3", config=Config(region_name="us-east-2"))
# NOTE: despite its name, `bucket` holds the filtered collection of objects matching the prefix,
# not the bucket itself.
bucket = s3.Bucket("nch-igm-wagner-lab-public").objects.filter(Prefix="variation-normalizer-manuscript/output-variation_identity-vrs-1.3.ndjson.gz")

# Download every matching object into the current working directory, named after the
# last path segment of its key. This runs on every execution of the cell.
for file in bucket:
    fn = file.key.split("/")[-1]
    with open(fn, "wb") as f:
        file.Object().download_fileobj(f)

# Parse the gzipped newline-delimited JSON file into `records`.
# NOTE(review): this path is hard-coded rather than derived from `fn`; it relies on the
# download above having produced a file with exactly this name.
with gzip.open('output-variation_identity-vrs-1.3.ndjson.gz', 'rb') as f:
    records = ndjson.load(f)

# Flatten the records into a table; later cells read the "out._id" column from it.
df0 = pd.json_normalize(records)
# NOTE(review): `df0` is not referenced again in this notebook, so this copy keeps two
# full frames in memory -- confirm whether the copy is needed.
clinvar_variants = df0.copy()

In [6]:
# Get GENIE Variant data
genie_variants_df = pd.read_csv(
    "data_mutations_extended.txt",
    sep="\t",
    usecols=[
        "Hugo_Symbol",
        "NCBI_Build",
        "Chromosome",
        "Start_Position",
        "End_Position",
        "Tumor_Sample_Barcode",
        "Reference_Allele",
        "Tumor_Seq_Allele2",
        "HGVSp_Short",
        "dbSNP_RS",
    ],
    # Infer each column's dtype from the whole file rather than chunk by chunk.
    # NOTE(review): the saved output of this cell shows a pandas warning pointing at this
    # call, presumably the mixed-type DtypeWarning that chunked inference produces.
    low_memory=False,
)
# Some fields use "-" as a placeholder; turn those into NaN so the null checks
# below treat them as missing values.
genie_variants_df = genie_variants_df.replace({"-": np.nan})

# Protein query: "<Hugo_Symbol> <HGVSp_Short>", only when both parts are present
has_protein_query = (
    genie_variants_df["Hugo_Symbol"].notna()
    & genie_variants_df["HGVSp_Short"].notna()
)
genie_variants_df["free_text_p_short"] = np.where(
    has_protein_query,
    genie_variants_df["Hugo_Symbol"] + " " + genie_variants_df["HGVSp_Short"],
    np.nan,
)

# Genomic query: "<Chromosome>-<Start_Position>-<Reference_Allele>-<Tumor_Seq_Allele2>".
# End_Position and NCBI_Build must also be present, although neither is part of the string.
coordinate_fields = [
    "Chromosome",
    "Start_Position",
    "End_Position",
    "Reference_Allele",
    "Tumor_Seq_Allele2",
    "NCBI_Build",
]
has_coordinates = genie_variants_df[coordinate_fields].notna().all(axis=1)
genie_variants_df["coordinates"] = np.where(
    has_coordinates,
    genie_variants_df["Chromosome"].astype(str)
    + "-"
    + genie_variants_df["Start_Position"].astype(str)
    + "-"
    + genie_variants_df["Reference_Allele"]
    + "-"
    + genie_variants_df["Tumor_Seq_Allele2"],
    np.nan,
)

genie_variants_df.shape

  genie_variants_df = pd.read_csv(


(1432052, 12)

### Normalized Genomic and Protein GENIE variants 
Due to GENIE licenses, please run the analysis notebook in `genie/pre_variant_analysis` to generate the datasets below.

In [7]:
# Normalized GENIE genomic queries (tab-separated despite the .csv extension)
genie_genomic_normalized = pd.read_csv(
    "able_to_normalize_genomic_queries.csv",
    sep="\t",
)

In [8]:
# Normalized GENIE protein queries (tab-separated despite the .csv extension)
genie_protein_normalized = pd.read_csv(
    "able_to_normalize_protein_queries.csv",
    sep="\t",
)

## Create Output Directory

In [9]:
# Directory that receives every output file written below; creating it is a no-op if it exists
path = Path("variant_analysis_output")
path.mkdir(exist_ok=True)

## Variant-Level Analysis
In this section we perform set intersections to determine how many normalized variants from GENIE match to MOA, CIViC, and ClinVar.

In [11]:
# Helper functions
def write_to_file(filename: str, data: Iterable[str]) -> None:
    """Write one item per line to ``filename``, overwriting any existing file.

    :param filename: Path of the text file to create or overwrite
    :param data: Strings to write. Sets and frozensets are written in sorted
        order, because their iteration order is not stable across interpreter
        runs; every other iterable is written in its own order.
    """
    # Sorting unordered collections keeps the output file reproducible between runs.
    lines = sorted(data) if isinstance(data, (set, frozenset)) else data
    with open(filename, "w") as f:
        for line in lines:
            f.write(line + "\n")

def check_kb(var, kb):
    """Report whether ``var`` is a member of ``kb``.

    :param var: Value to look up
    :param kb: Container supporting the ``in`` operator
    :return: ``True`` if ``var`` is in ``kb``, otherwise ``False``
    """
    return var in kb

## MOA

In [12]:
# VRS IDs shared by the normalized GENIE protein variants and the normalized MOA variants
genie_moa_match = set(genie_protein_normalized["vrs_id"]) & set(moa_variants["vrs_id"])
write_to_file("variant_analysis_output/genie_moa_match.txt", list(genie_moa_match))
f"The number of matched protein variants in MOA is: {len(genie_moa_match)}"

'The number of matched protein variants in MOA is: 122'

## CIViC

In [13]:
# VRS IDs shared by the normalized GENIE genomic variants and the normalized CIViC variants
genie_civic_match_genomic = set(genie_genomic_normalized["vrs_id"]) & set(civic_variants["vrs_id"])
write_to_file("variant_analysis_output/genie_civic_match_genomic.txt", genie_civic_match_genomic)
f"The number of matched genomic variants in CIViC is: {len(genie_civic_match_genomic)}"

'The number of matched genomic variants in CIViC is: 203'

In [14]:
# VRS IDs shared by the normalized GENIE protein variants and the normalized CIViC variants
genie_civic_match_protein = set(genie_protein_normalized["vrs_id"]) & set(civic_variants["vrs_id"])
write_to_file("variant_analysis_output/genie_civic_match_protein.txt", genie_civic_match_protein)
f"The number of matched protein variants in CIViC is: {len(genie_civic_match_protein)}"

'The number of matched protein variants in CIViC is: 944'

In [15]:
# Combine the genomic and protein CIViC matches into a single set of VRS IDs
civic_genie_union = genie_civic_match_genomic | genie_civic_match_protein
write_to_file("variant_analysis_output/genie_civic_match_union.txt", civic_genie_union)
f"The number of matched GENIE variants in CIViC is: {len(civic_genie_union)}"

'The number of matched GENIE variants in CIViC is: 1147'

## ClinVar

In [16]:
# VRS IDs shared by the normalized GENIE genomic variants and the ClinVar "out._id" column
genie_clinvar_match_genomic = set(genie_genomic_normalized["vrs_id"]) & set(clinvar_variants["out._id"])
write_to_file("variant_analysis_output/genie_clinvar_match_genomic.txt", genie_clinvar_match_genomic)
f"The number of matched GENIE genomic variants in ClinVar is: {len(genie_clinvar_match_genomic)}"

'The number of matched GENIE genomic variants in ClinVar is: 94028'

In [17]:
# VRS IDs shared by the normalized GENIE protein variants and the ClinVar "out._id" column
# (this result is only counted, not written to a file)
genie_clinvar_match_protein = set(genie_protein_normalized["vrs_id"]) & set(clinvar_variants["out._id"])
f"The number of matched GENIE protein variants in ClinVar is: {len(genie_clinvar_match_protein)}"

'The number of matched GENIE protein variants in ClinVar is: 0'

## Patient-Level Analysis
In this section we summarize variant matching at the patient level, with >= 1 shared variant counting as a match.

### Filter to include normalized genomic and protein variants

In [18]:
# Map each genomic query string to its VRS ID; when a query repeats, the first row wins
genomic_variants_dict = {}
for query, vrs_id in zip(genie_genomic_normalized["query"], genie_genomic_normalized["vrs_id"]):
    genomic_variants_dict.setdefault(query, vrs_id)

In [19]:
# Add genomic vrs_ids to genie_variants_df (NaN where the coordinates have no normalized match)
genie_variants_df["vrs_id_genomic"] = [
    genomic_variants_dict.get(coords, np.nan)
    for coords in genie_variants_df["coordinates"].to_list()
]
# Keep only the rows that received a genomic VRS ID
has_genomic_id = genie_variants_df["vrs_id_genomic"].notna()
genie_variants_genomic = genie_variants_df[has_genomic_id]

In [20]:
# Map each protein query string to its VRS ID; when a query repeats, the first row wins
protein_variants_dict = {}
for query, vrs_id in zip(genie_protein_normalized["query"], genie_protein_normalized["vrs_id"]):
    protein_variants_dict.setdefault(query, vrs_id)

In [21]:
# Add protein vrs_ids to genie_variants_df (NaN where the free-text query has no normalized match)
genie_variants_df["vrs_id_protein"] = [
    protein_variants_dict.get(free_text, np.nan)
    for free_text in genie_variants_df["free_text_p_short"].to_list()
]
# Keep only the rows that received a protein VRS ID
has_protein_id = genie_variants_df["vrs_id_protein"].notna()
genie_variants_protein = genie_variants_df[has_protein_id]

In [22]:
# Flag each row with the knowledgebases its VRS IDs were matched in
# (rows without a VRS ID are flagged False)
genie_variants_df['in_civic_genomic'] = genie_variants_df['vrs_id_genomic'].isin(genie_civic_match_genomic)
genie_variants_df['in_clinvar'] = genie_variants_df['vrs_id_genomic'].isin(genie_clinvar_match_genomic)
genie_variants_df['in_civic_protein'] = genie_variants_df['vrs_id_protein'].isin(genie_civic_match_protein)
genie_variants_df['in_moa'] = genie_variants_df['vrs_id_protein'].isin(genie_moa_match)

In [23]:
# Group genomic VRS IDs by Tumor_Sample_Barcode: {barcode: [vrs_id, ...]}
pvd_genomic = {}
for sample_id, vrs_id in zip(
    genie_variants_genomic["Tumor_Sample_Barcode"],
    genie_variants_genomic["vrs_id_genomic"],
):
    # Rows without a VRS ID never create or extend an entry
    if pd.isna(vrs_id):
        continue
    pvd_genomic.setdefault(sample_id, []).append(vrs_id)

In [24]:
# Group protein VRS IDs by Tumor_Sample_Barcode: {barcode: [vrs_id, ...]}
pvd_protein = {}
for sample_id, vrs_id in zip(
    genie_variants_protein["Tumor_Sample_Barcode"],
    genie_variants_protein["vrs_id_protein"],
):
    # Rows without a VRS ID never create or extend an entry
    if pd.isna(vrs_id):
        continue
    pvd_protein.setdefault(sample_id, []).append(vrs_id)

### MOA

In [25]:
# Barcodes with at least one protein variant that matched MOA
patients = [
    sample_id
    for sample_id, vrs_ids in pvd_protein.items()
    if not genie_moa_match.isdisjoint(vrs_ids)
]
count = len(patients)
write_to_file("variant_analysis_output/moa_patient_match.txt", patients)
f"There are {count} patients with matching protein variant data in MOA"

'There are 41828 patients with matching protein variant data in MOA'

### CIViC

In [26]:
# Barcodes with at least one protein variant that matched CIViC
patients_protein = [
    sample_id
    for sample_id, vrs_ids in pvd_protein.items()
    if not genie_civic_match_protein.isdisjoint(vrs_ids)
]
count = len(patients_protein)
write_to_file("variant_analysis_output/civic_protein_patient_match.txt", patients_protein)
f"There are {count} patients with matching protein variant data in CIViC"

'There are 84673 patients with matching protein variant data in CIViC'

In [27]:
# Barcodes with at least one genomic variant that matched CIViC
patients_genomic = [
    sample_id
    for sample_id, vrs_ids in pvd_genomic.items()
    if not genie_civic_match_genomic.isdisjoint(vrs_ids)
]
count = len(patients_genomic)
write_to_file("variant_analysis_output/civic_genomic_patient_match.txt", patients_genomic)
f"There are {count} patients with matching genomic variant data in CIViC"

'There are 853 patients with matching genomic variant data in CIViC'

In [28]:
# Number of barcodes that appear in both the genomic and the protein CIViC match lists
len(set(patients_genomic) & set(patients_protein))

272

### ClinVar

In [29]:
# Genomic variants patient count (ClinVar)
# A ClinVar-specific name is used so the CIViC `patients_genomic` list built in the
# CIViC section is not overwritten; otherwise re-running the CIViC genomic/protein
# overlap cell after this one would silently use ClinVar patients.
patients_clinvar_genomic = [
    sample_id
    for sample_id, vrs_ids in pvd_genomic.items()
    if not genie_clinvar_match_genomic.isdisjoint(vrs_ids)
]
count = len(patients_clinvar_genomic)
write_to_file("variant_analysis_output/clinvar_patient_match.txt", patients_clinvar_genomic)
f"There are {count} patients with matching genomic variant data in ClinVar"

'There are 126235 patients with matching genomic variant data in ClinVar'

## Variant Counts per Patient
Compute the average number of genomic and protein variants per patient.

In [30]:
# Mean number of normalized genomic variants per barcode
counts = [len(vrs_ids) for vrs_ids in pvd_genomic.values()]
sum(counts) / len(counts)

8.565950208752126

In [31]:
# Mean number of normalized protein variants per barcode
counts = [len(vrs_ids) for vrs_ids in pvd_protein.values()]
sum(counts) / len(counts)

7.838487315182816

In [32]:
# Count, per barcode, the rows that have a genomic and/or a protein VRS ID
patient_vars = {}
for sample_id, var_g, var_p in zip(
    genie_variants_df["Tumor_Sample_Barcode"],
    genie_variants_df["vrs_id_genomic"],
    genie_variants_df["vrs_id_protein"],
):
    # Rows with neither ID do not count and never create an entry
    if pd.isna(var_g) and pd.isna(var_p):
        continue
    patient_vars[sample_id] = patient_vars.get(sample_id, 0) + 1

In [33]:
# Average number of normalized variants per patient
counts = list(patient_vars.values())
sum(counts) / len(counts)

8.581963247821301

In [41]:
# Average number of total variants (rows) per barcode, normalized or not
n_variants = genie_variants_df.groupby('Tumor_Sample_Barcode').size().tolist()
sum(n_variants) / len(n_variants)

9.75133633398476

In [45]:
def kb_match(df):
    """Return the mean number of rows per ``Tumor_Sample_Barcode`` in ``df``.

    :param df: DataFrame with a ``Tumor_Sample_Barcode`` column
    :return: Total row count divided by the number of distinct barcodes
    :raises ZeroDivisionError: If ``df`` contains no barcode groups
    """
    rows_per_sample = df.groupby('Tumor_Sample_Barcode').size().tolist()
    return sum(rows_per_sample) / len(rows_per_sample)

In [46]:
# Mean CIViC-matched genomic variants per patient, over patients with at least one such match
genie_df_civic_genomic = genie_variants_df[genie_variants_df['in_civic_genomic']]
kb_match(genie_df_civic_genomic)

1.010550996483001

In [47]:
# Mean CIViC-matched protein variants per patient, over patients with at least one such match
genie_df_civic_protein = genie_variants_df[genie_variants_df['in_civic_protein']]
kb_match(genie_df_civic_protein)

1.379766867832721

In [48]:
# Mean MOA-matched protein variants per patient, over patients with at least one such match
genie_df_moa = genie_variants_df[genie_variants_df['in_moa']]
kb_match(genie_df_moa)

1.0997417997513628

In [49]:
# Mean ClinVar-matched genomic variants per patient, over patients with at least one such match
genie_df_clinvar = genie_variants_df[genie_variants_df['in_clinvar']]
kb_match(genie_df_clinvar)

3.1800609973462195

In [63]:
# Normalized matching across knowledgebases, genomic (matched in CIViC or ClinVar)
in_any_genomic_kb = genie_variants_df[['in_civic_genomic', 'in_clinvar']].any(axis=1)
genie_df_genomic_combined = genie_variants_df[in_any_genomic_kb]
kb_match(genie_df_genomic_combined)

3.179842726703992

In [64]:
# Normalized matching across knowledgebases, protein (matched in CIViC or MOA)
in_any_protein_kb = genie_variants_df[['in_civic_protein', 'in_moa']].any(axis=1)
genie_df_protein_combined = genie_variants_df[in_any_protein_kb]
kb_match(genie_df_protein_combined)

1.3912535079513564

In [65]:
# Normalized matching across all knowledgebases (any genomic or protein match)
kb_flag_columns = ['in_civic_protein', 'in_moa', 'in_civic_genomic', 'in_clinvar']
in_any_kb = genie_variants_df[kb_flag_columns].any(axis=1)
genie_df_filtered = genie_variants_df[in_any_kb]
kb_match(genie_df_filtered)

3.203230538710766