# GENIE Search Analysis
This notebook performs set intersections to determine matches between normalized variants from GENIE and normalized variants from CIViC, Molecular Oncology Almanac, and ClinVar.

In [1]:
# Import relevant libraries
import logging
import csv
from datetime import datetime
import ndjson
import numpy as np
import pandas as pd
from pathlib import Path
from boto3.exceptions import ResourceLoadException
from botocore.config import Config
import boto3
import gzip
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the AWS settings used by boto3 below — confirm.
load_dotenv()

True

## Load Data (CIViC, Molecular Oncology Almanac, ClinVar)

In [3]:
# Normalized CIViC and MOA variants are read from tab-separated files written by
# the sibling analysis directories.
civic_variants = pd.read_csv(
    "../../civic/variation_analysis/able_to_normalize_queries.csv",
    sep="\t",
)
moa_variants = pd.read_csv(
    "../../moa/feature_analysis/able_to_normalize_queries.csv",
    sep="\t",
)

# ClinVar results are fetched from S3.
# To refresh the SSO session, run `aws sso login`.
s3 = boto3.resource("s3", config=Config(region_name="us-east-2"))
clinvar_objects = s3.Bucket("nch-igm-wagner-lab-public").objects.filter(
    Prefix="variation-normalizer-manuscript/output-variation_identity-vrs-1.3.ndjson.gz"
)

# Save every object under the prefix into the working directory, named after the
# last path component of its key.
for obj_summary in clinvar_objects:
    local_name = obj_summary.key.rsplit("/", 1)[-1]
    with open(local_name, "wb") as out_file:
        obj_summary.Object().download_fileobj(out_file)

# Parse the gzipped newline-delimited JSON and flatten nested fields into columns.
with gzip.open("output-variation_identity-vrs-1.3.ndjson.gz", "rb") as gz_file:
    records = ndjson.load(gz_file)

df0 = pd.json_normalize(records)
clinvar_variants = df0.copy()

In [4]:
# Get GENIE variant data (tab-separated), restricted to the columns used below.
#
# low_memory=False makes pandas infer each column's type from the whole file
# instead of chunk by chunk.
# NOTE(review): the saved output of this cell shows a warning raised on the
# read_csv call, presumably a mixed-type DtypeWarning — confirm. With chunked
# inference the same column can be typed differently from chunk to chunk, and
# that can leak into the string-built `coordinates` column (e.g. "12" vs "12.0").
genie_variants_df = pd.read_csv(
    "data_mutations_extended.txt",
    sep="\t",
    usecols=[
        "Hugo_Symbol",
        "NCBI_Build",
        "Chromosome",
        "Start_Position",
        "End_Position",
        "Tumor_Sample_Barcode",
        "Reference_Allele",
        "Tumor_Seq_Allele2",
        "HGVSp_Short",
        "dbSNP_RS",
    ],
    low_memory=False,
)
# Some fields are '-'; turn them into NaN so that the affected rows are dropped below.
genie_variants_df = genie_variants_df.replace({"-": np.nan})

# Protein query: "<Hugo_Symbol> <HGVSp_Short>", NaN when either part is missing.
has_protein_fields = (
    genie_variants_df["Hugo_Symbol"].notna()
    & genie_variants_df["HGVSp_Short"].notna()
)
genie_variants_df["free_text_p_short"] = np.where(
    has_protein_fields,
    genie_variants_df["Hugo_Symbol"] + " " + genie_variants_df["HGVSp_Short"],
    np.nan,
)

# Genomic query: "<Chromosome>-<Start_Position>-<Reference_Allele>-<Tumor_Seq_Allele2>".
# End_Position and NCBI_Build must be present but are not part of the string.
genomic_fields = [
    "Chromosome",
    "Start_Position",
    "End_Position",
    "Reference_Allele",
    "Tumor_Seq_Allele2",
    "NCBI_Build",
]
has_genomic_fields = genie_variants_df[genomic_fields].notna().all(axis=1)
genie_variants_df["coordinates"] = np.where(
    has_genomic_fields,
    genie_variants_df["Chromosome"].astype(str)
    + "-"
    + genie_variants_df["Start_Position"].astype(str)
    + "-"
    + genie_variants_df["Reference_Allele"]
    + "-"
    + genie_variants_df["Tumor_Seq_Allele2"],
    np.nan,
)

# Keep only rows for which both query strings could be built.
genie_variants_df = genie_variants_df.dropna(subset=["free_text_p_short", "coordinates"])
genie_variants_df.shape

  genie_variants_df = pd.read_csv(


(1219725, 12)

In [5]:
# Preview the first rows, including the derived free_text_p_short and coordinates columns.
genie_variants_df.head()

Unnamed: 0,Hugo_Symbol,NCBI_Build,Chromosome,Start_Position,End_Position,Reference_Allele,Tumor_Seq_Allele2,dbSNP_RS,Tumor_Sample_Barcode,HGVSp_Short,free_text_p_short,coordinates
0,KRAS,GRCh37,12,25398285,25398285,C,A,rs121913530,GENIE-JHU-00006-00185,p.G12C,KRAS p.G12C,12-25398285-C-A
1,BRAF,GRCh37,7,140453136,140453136,A,T,rs113488022,GENIE-JHU-00006-00185,p.V600E,BRAF p.V600E,7-140453136-A-T
2,EGFR,GRCh37,7,55249071,55249071,C,T,rs121434569,GENIE-JHU-00006-00185,p.T790M,EGFR p.T790M,7-55249071-C-T
3,TP53,GRCh37,17,7577120,7577120,C,T,rs28934576,GENIE-JHU-00006-00185,p.R273H,TP53 p.R273H,17-7577120-C-T
4,NRAS,GRCh37,1,115256529,115256529,T,C,rs11554290,GENIE-JHU-00006-00185,p.Q61R,NRAS p.Q61R,1-115256529-T-C


### Normalized Genomic and Protein GENIE variants. 
Due to GENIE licenses, please run the analysis notebook in genie/pre_variant_analysis to generate the datasets below.

In [6]:
# Normalization results for the GENIE genomic (coordinate) queries: a tab-separated
# file with query, vrs_id and succeeded_endpoint columns (see the preview below).
able_to_normalize_genomic = pd.read_csv("able_to_normalize_genomic_queries.csv",sep="\t")
able_to_normalize_genomic.head()

Unnamed: 0,query,vrs_id,succeeded_endpoint
0,12-25398285-C-A,ga4gh:VA.29aaPq9c0e1lwJWtXIkgtLzN8cv1xxAe,normalize
1,7-140453136-A-T,ga4gh:VA.fZiBjQEolbkL0AxjoTZf4SOkFy9J0ebU,normalize
2,7-55249071-C-T,ga4gh:VA.1ewlywoD423K7YH_K4YefZg6J_87pQTp,normalize
3,17-7577120-C-T,ga4gh:VA.AB-I5SlEJvtzl4BbOetee0MMtItNsmFC,normalize
4,1-115256529-T-C,ga4gh:VA.6uUGtcvxhZGl7lisrIOPRqkpD5jOcfXl,normalize


In [7]:
# Normalization results for the GENIE protein ("<gene> <p. change>") queries: a
# tab-separated file with query, vrs_id and succeeded_endpoint columns (see the preview below).
able_to_normalize_protein = pd.read_csv("able_to_normalize_protein_queries.csv",sep="\t")
able_to_normalize_protein.head()

Unnamed: 0,query,vrs_id,succeeded_endpoint
0,KRAS p.G12C,ga4gh:VA.GtaY-fkmnMXM-bRKyu5qvya6Zd47AM_X,normalize
1,BRAF p.V600E,ga4gh:VA.ZDdoQdURgO2Daj2NxLj4pcDnjiiAsfbO,normalize
2,EGFR p.T790M,ga4gh:VA.BldHTcxmxpKf1exsSbeuki1jhdek1GaJ,normalize
3,TP53 p.R273H,ga4gh:VA.kcPAoam8e66opWWbjcHmCNQ7DbgOV1Uq,normalize
4,NRAS p.Q61R,ga4gh:VA.UtLI1rrsdnYpKO9B6xHJBsUHmJ6tRrYn,normalize


## Create Output Directory

In [8]:
# Directory that receives the match lists written by the cells below.
# exist_ok=True makes re-running this cell a no-op when the directory already exists.
path = Path("variant_analysis_output")
path.mkdir(exist_ok = True)

## Variant-Level Analysis
In this section we perform set intersections to determine how many normalized GENIE variants match normalized variants in MOA, CIViC, and ClinVar.

## MOA

In [9]:
# Intersect normalized GENIE protein variants with normalized MOA protein variants
genie_moa_match = set(able_to_normalize_protein["vrs_id"].to_list()).intersection(
    set(moa_variants["vrs_id"].to_list())
)
print(f"The number of matched protein variants in MOA is: {len(genie_moa_match)}")

# Write one VRS ID per line. The IDs are sorted first because set iteration order
# is not stable from run to run, which would make the output file differ between
# otherwise identical runs. As before, the name is rebound to a list; the
# patient-level MOA cell only passes it to set.intersection(), which accepts a list.
genie_moa_match = sorted(genie_moa_match)
with open("variant_analysis_output/genie_moa_match.txt", "w") as f:
    f.writelines(vrs_id + "\n" for vrs_id in genie_moa_match)

The number of matched protein variants in MOA is: 122


## CIViC

In [10]:
# Intersect normalized GENIE genomic and protein variants with normalized CIViC variants.
# Each match set is written one VRS ID per line, sorted so that the files are
# reproducible (set iteration order is not stable from run to run).
civic_vrs_ids = set(civic_variants["vrs_id"].to_list())  # built once, reused for both intersections

# Genomic matches (the set itself is reused by the patient-level analysis).
genie_civic_match_genomic = set(able_to_normalize_genomic["vrs_id"].to_list()).intersection(civic_vrs_ids)
print(f"The number of matched genomic variants in CIViC is: {len(genie_civic_match_genomic)}")
genie_civic_match_g = sorted(genie_civic_match_genomic)
with open("variant_analysis_output/genie_civic_match_genomic.txt", "w") as f:
    f.writelines(vrs_id + "\n" for vrs_id in genie_civic_match_g)

# Protein matches (the set itself is reused by the patient-level analysis).
genie_civic_match_protein = set(able_to_normalize_protein["vrs_id"].to_list()).intersection(civic_vrs_ids)
print(f"The number of matched protein variants in CIViC is: {len(genie_civic_match_protein)}")
genie_civic_match_p = sorted(genie_civic_match_protein)
with open("variant_analysis_output/genie_civic_match_protein.txt", "w") as f:
    f.writelines(vrs_id + "\n" for vrs_id in genie_civic_match_p)

# Union of the genomic and protein match IDs.
civic_genie_union = genie_civic_match_genomic.union(genie_civic_match_protein)
print(f"The number of matched GENIE variants in CIViC is: {len(civic_genie_union)}")
genie_civic_match_union = sorted(civic_genie_union)
with open("variant_analysis_output/genie_civic_match_union.txt", "w") as f:
    f.writelines(vrs_id + "\n" for vrs_id in genie_civic_match_union)

The number of matched genomic variants in CIViC is: 203
The number of matched protein variants in CIViC is: 944
The number of matched GENIE variants in CIViC is: 1147


## ClinVar

In [11]:
# Intersect normalized GENIE genomic variants with normalized ClinVar variants
# (ClinVar IDs are taken from the flattened "out._id" column).
genie_clinvar_match_genomic = set(able_to_normalize_genomic["vrs_id"].to_list()).intersection(
    set(clinvar_variants["out._id"].to_list())
)
print(f"The number of matched GENIE genomic variants in ClinVar is: {len(genie_clinvar_match_genomic)}")

# Write one VRS ID per line, sorted so the file is reproducible (set iteration
# order is not stable from run to run). As before, the name is rebound to a list.
genie_clinvar_match_genomic = sorted(genie_clinvar_match_genomic)
with open("variant_analysis_output/genie_clinvar_match_genomic.txt", "w") as f:
    f.writelines(vrs_id + "\n" for vrs_id in genie_clinvar_match_genomic)

The number of matched GENIE genomic variants in ClinVar is: 94028


In [12]:
# Overlap between the VRS IDs of normalized GENIE protein variants and the
# ClinVar IDs in the flattened "out._id" column.
genie_protein_vrs_ids = set(able_to_normalize_protein["vrs_id"].to_list())
clinvar_vrs_ids = set(clinvar_variants["out._id"].to_list())
genie_clinvar_match_protein = genie_protein_vrs_ids & clinvar_vrs_ids
print(f"The number of matched GENIE protein variants in ClinVar is: {len(genie_clinvar_match_protein)}")

The number of matched GENIE protein variants in ClinVar is: 0


## Patient-Level Analysis
In this section we summarize variant matching at the patient level, with >= 1 shared variant counting as a match.

### Filter to include normalized genomic and protein variants

In [15]:
normalized_queries_genomic = able_to_normalize_genomic["query"].to_list()
queries = genie_variants_df['coordinates'].to_list()

# Position of the FIRST row carrying each coordinate string. This gives the same
# positions as calling queries.index(x) for every query, but in a single pass
# instead of rescanning the full list once per query.
first_row_by_query = {}
for position, query in enumerate(queries):
    first_row_by_query.setdefault(query, position)

# A query missing from the GENIE frame raises KeyError here (list.index raised ValueError).
# NOTE(review): only the first GENIE row per query is kept, so each variant ends up
# attributed to a single Tumor_Sample_Barcode. The patient-level counts built from
# this selection are therefore probably undercounted — confirm whether every
# matching row should be kept instead. Doing so also requires the patient
# dictionaries to look vrs_id up by query rather than by row position.
rows_to_keep_genomic = [first_row_by_query[x] for x in normalized_queries_genomic]

In [16]:
# Select the GENIE rows at the positions computed above: one row per normalized
# genomic query, in the same order as able_to_normalize_genomic.
genie_variants_genomic = genie_variants_df.iloc[rows_to_keep_genomic]

In [17]:
# Persist the selection as a tab-separated file (no extension, index omitted);
# a later cell reloads it from this path.
genie_variants_genomic.to_csv("genie_variants_genomic_normalized", sep="\t", index=False)

In [18]:
normalized_queries_protein = able_to_normalize_protein["query"].to_list()
queries = genie_variants_df["free_text_p_short"].to_list()

# Position of the FIRST row carrying each protein query string. This gives the
# same positions as calling queries.index(x) for every query, but in a single
# pass instead of rescanning the full list once per query.
first_row_by_query = {}
for position, query in enumerate(queries):
    first_row_by_query.setdefault(query, position)

# A query missing from the GENIE frame raises KeyError here (list.index raised ValueError).
# NOTE(review): only the first GENIE row per query is kept, so each variant ends up
# attributed to a single Tumor_Sample_Barcode. The patient-level counts built from
# this selection are therefore probably undercounted — confirm whether every
# matching row should be kept instead. Doing so also requires the patient
# dictionaries to look vrs_id up by query rather than by row position.
rows_to_keep_protein = [first_row_by_query[x] for x in normalized_queries_protein]

In [19]:
# Select the GENIE rows at the positions computed above: one row per normalized
# protein query, in the same order as able_to_normalize_protein.
genie_variants_protein = genie_variants_df.iloc[rows_to_keep_protein]

In [20]:
# Persist the selection as a tab-separated file (no extension, index omitted);
# a later cell reloads it from this path.
genie_variants_protein.to_csv("genie_variants_protein_normalized", sep="\t", index=False)

In [13]:
# Read in the dataframes written by the filtering cells above (tab-separated).
# Reloading from disk lets the patient-level analysis run without repeating the
# row-selection cells, as long as these two files already exist.
genie_variants_genomic = pd.read_csv("genie_variants_genomic_normalized", sep="\t")
genie_variants_protein = pd.read_csv("genie_variants_protein_normalized", sep="\t")

In [14]:
# Preview the reloaded GENIE rows selected for the normalized genomic queries.
genie_variants_genomic.head()

Unnamed: 0,Hugo_Symbol,NCBI_Build,Chromosome,Start_Position,End_Position,Reference_Allele,Tumor_Seq_Allele2,dbSNP_RS,Tumor_Sample_Barcode,HGVSp_Short,free_text_p_short,coordinates
0,KRAS,GRCh37,12,25398285,25398285,C,A,rs121913530,GENIE-JHU-00006-00185,p.G12C,KRAS p.G12C,12-25398285-C-A
1,BRAF,GRCh37,7,140453136,140453136,A,T,rs113488022,GENIE-JHU-00006-00185,p.V600E,BRAF p.V600E,7-140453136-A-T
2,EGFR,GRCh37,7,55249071,55249071,C,T,rs121434569,GENIE-JHU-00006-00185,p.T790M,EGFR p.T790M,7-55249071-C-T
3,TP53,GRCh37,17,7577120,7577120,C,T,rs28934576,GENIE-JHU-00006-00185,p.R273H,TP53 p.R273H,17-7577120-C-T
4,NRAS,GRCh37,1,115256529,115256529,T,C,rs11554290,GENIE-JHU-00006-00185,p.Q61R,NRAS p.Q61R,1-115256529-T-C


In [15]:
# Preview the reloaded GENIE rows selected for the normalized protein queries.
genie_variants_protein.head()

Unnamed: 0,Hugo_Symbol,NCBI_Build,Chromosome,Start_Position,End_Position,Reference_Allele,Tumor_Seq_Allele2,dbSNP_RS,Tumor_Sample_Barcode,HGVSp_Short,free_text_p_short,coordinates
0,KRAS,GRCh37,12,25398285,25398285,C,A,rs121913530,GENIE-JHU-00006-00185,p.G12C,KRAS p.G12C,12-25398285-C-A
1,BRAF,GRCh37,7,140453136,140453136,A,T,rs113488022,GENIE-JHU-00006-00185,p.V600E,BRAF p.V600E,7-140453136-A-T
2,EGFR,GRCh37,7,55249071,55249071,C,T,rs121434569,GENIE-JHU-00006-00185,p.T790M,EGFR p.T790M,7-55249071-C-T
3,TP53,GRCh37,17,7577120,7577120,C,T,rs28934576,GENIE-JHU-00006-00185,p.R273H,TP53 p.R273H,17-7577120-C-T
4,NRAS,GRCh37,1,115256529,115256529,T,C,rs11554290,GENIE-JHU-00006-00185,p.Q61R,NRAS p.Q61R,1-115256529-T-C


In [16]:
# Create genomic variants patient dictionary:
#   Tumor_Sample_Barcode -> list of VRS IDs of that sample's normalized genomic variants.
#
# Each row's VRS ID is looked up by the row's own query string (`coordinates`)
# rather than by row position in able_to_normalize_genomic. The result therefore
# does not depend on the two frames having identical row order and a default
# 0..n-1 index. A coordinate string with no normalization result raises KeyError
# instead of being silently paired with another row's ID.
# Assumes each query maps to a single vrs_id — TODO confirm.
vrs_id_by_query = dict(
    zip(able_to_normalize_genomic["query"], able_to_normalize_genomic["vrs_id"])
)

pv = genie_variants_genomic["Tumor_Sample_Barcode"].to_list()
row_queries = genie_variants_genomic["coordinates"].to_list()

pvd_genomic = dict()
for key, query in zip(pv, row_queries):
    # setdefault creates the list on the first variant seen for this sample.
    pvd_genomic.setdefault(key, []).append(vrs_id_by_query[query])

In [17]:
# Create protein variants patient dictionary:
#   Tumor_Sample_Barcode -> list of VRS IDs of that sample's normalized protein variants.
#
# Each row's VRS ID is looked up by the row's own query string
# (`free_text_p_short`) rather than by row position in able_to_normalize_protein.
# The result therefore does not depend on the two frames having identical row
# order and a default 0..n-1 index. A query string with no normalization result
# raises KeyError instead of being silently paired with another row's ID.
# Assumes each query maps to a single vrs_id — TODO confirm.
vrs_id_by_query = dict(
    zip(able_to_normalize_protein["query"], able_to_normalize_protein["vrs_id"])
)

pv = genie_variants_protein["Tumor_Sample_Barcode"].to_list()
row_queries = genie_variants_protein["free_text_p_short"].to_list()

pvd_protein = dict()
for key, query in zip(pv, row_queries):
    # setdefault creates the list on the first variant seen for this sample.
    pvd_protein.setdefault(key, []).append(vrs_id_by_query[query])

### MOA

In [18]:
# Patients (keys of pvd_protein) that carry at least one VRS ID matched in MOA.
# genie_moa_match has been rebound to a list by the variant-level cell, so it is
# turned into a set once here rather than being rescanned for every patient.
moa_match_ids = set(genie_moa_match)
patients = [
    key
    for key in pvd_protein
    if not moa_match_ids.isdisjoint(pvd_protein[key])  # >= 1 shared VRS ID
]
count = len(patients)
print(f"There are {count} patients with matching protein variant data in MOA")

# One patient identifier per line, in pvd_protein order.
with open("variant_analysis_output/moa_patient_match.txt", "w") as f:
    f.writelines(patient + "\n" for patient in patients)

There are 117 patients with matching protein variant data in MOA


### CIViC

In [19]:
# Protein variants patient count: keys of pvd_protein whose VRS IDs share at
# least one element with the GENIE/CIViC protein match set.
patients_protein = [
    barcode
    for barcode, vrs_ids in pvd_protein.items()
    if set(vrs_ids).intersection(genie_civic_match_protein)
]
count = len(patients_protein)
print(f"There are {count} patients with matching protein variant data in CIViC")

# One patient identifier per line, in pvd_protein order.
with open("variant_analysis_output/civic_protein_patient_match.txt", "w") as f:
    for barcode in patients_protein:
        f.write(barcode + "\n")

There are 904 patients with matching protein variant data in CIViC


In [20]:
# Genomic variants patient count: keys of pvd_genomic whose VRS IDs share at
# least one element with the GENIE/CIViC genomic match set.
patients_genomic = [
    barcode
    for barcode, vrs_ids in pvd_genomic.items()
    if set(vrs_ids).intersection(genie_civic_match_genomic)
]
count = len(patients_genomic)
print(f"There are {count} patients with matching genomic variant data in CIViC")

# One patient identifier per line, in pvd_genomic order.
with open("variant_analysis_output/civic_genomic_patient_match.txt", "w") as f:
    for barcode in patients_genomic:
        f.write(barcode + "\n")

There are 201 patients with matching genomic variant data in CIViC


In [21]:
# Compute number of patients with both genomic and protein variant overlap
len(set(patients_genomic).intersection(set(patients_protein)))

8

### ClinVar

In [22]:
# Genomic variants patient count: keys of pvd_genomic that carry at least one
# VRS ID matched in ClinVar.
#
# genie_clinvar_match_genomic has been rebound to a list by the variant-level
# cell, so it is turned into a set once here rather than being rescanned for
# every patient.
clinvar_match_ids = set(genie_clinvar_match_genomic)

# Stored under its own name so that the CIViC list in `patients_genomic` is not
# overwritten; the CIViC overlap cell stays correct if it is re-run after this one.
patients_clinvar_genomic = [
    key
    for key in pvd_genomic
    if not clinvar_match_ids.isdisjoint(pvd_genomic[key])  # >= 1 shared VRS ID
]
count = len(patients_clinvar_genomic)
print(f"There are {count} patients with matching genomic variant data in ClinVar")

# One patient identifier per line, in pvd_genomic order.
with open("variant_analysis_output/clinvar_patient_match.txt", "w") as f:
    f.writelines(patient + "\n" for patient in patients_clinvar_genomic)

There are 45725 patients with matching genomic variant data in ClinVar


## Variant Counts per Patient
Compute the average number of genomic and protein variants per patient.

In [23]:
# Mean number of genomic VRS IDs recorded per patient key in pvd_genomic.
counts = [len(vrs_ids) for vrs_ids in pvd_genomic.values()]
sum(counts) / len(counts)

6.263040815326572

In [24]:
# Mean number of protein VRS IDs recorded per patient key in pvd_protein.
counts = [len(vrs_ids) for vrs_ids in pvd_protein.values()]
sum(counts) / len(counts)

5.682793441414131