# GENIE Search Analysis

This notebook performs set operations to determine matches between normalized variants from GENIE and normalized variants from CIViC, Molecular Oncology Almanac, and ClinVar.

## Prerequisites
The following notebook must be run before running this analysis: 

- `genie/pre_variant_analysis/genie_pre_variant_analysis.ipynb`

This notebook uses `data_mutations_extended.txt` from Synapse. You will need to create a Synapse account to download the data from [here](https://www.synapse.org/#!Synapse:syn51355986). The file is expected to be in the `pre_variant_analysis` directory.

In [1]:
# Import relevant libraries
import ndjson
import numpy as np
import pandas as pd
from pathlib import Path
from botocore.config import Config
import boto3
import gzip
from dotenv import load_dotenv

In [2]:
# Load environment variables from a local .env file, if one is present.
# NOTE(review): presumably this supplies the AWS settings/credentials used by the boto3 S3 download below — confirm.
load_dotenv()

True

## Load Data (CIViC, Molecular Oncology Almanac, ClinVar, GENIE)

In [3]:
# Load normalized variants for CIViC
# (the file carries a .csv extension but is read as tab-separated)
civic_variants = pd.read_csv("../../civic/variation_analysis/able_to_normalize_queries.csv", sep="\t")

In [4]:
# Load normalized variants for MOA
# (the file carries a .csv extension but is read as tab-separated)
moa_variants = pd.read_csv("../../moa/feature_analysis/able_to_normalize_queries.csv", sep="\t")

In [5]:
# Load normalized variants for ClinVar
# The gzipped NDJSON file is fetched from S3 into the working directory. The
# download is skipped when the file is already present, so re-running the
# notebook does not transfer it again.
clinvar_prefix = "variation-normalizer-manuscript/output-variation_identity-vrs-1.3.ndjson.gz"
clinvar_file = Path(clinvar_prefix.split("/")[-1])

if not clinvar_file.exists():
    s3 = boto3.resource("s3", config=Config(region_name="us-east-2"))
    bucket = s3.Bucket("nch-igm-wagner-lab-public").objects.filter(Prefix=clinvar_prefix)

    for file in bucket:
        fn = file.key.split("/")[-1]
        # Download under a temporary name and rename afterwards, so that an
        # interrupted transfer never leaves a truncated file that a later run
        # would mistake for a complete download.
        partial = Path(fn + ".part")
        with open(partial, "wb") as f:
            file.Object().download_fileobj(f)
        partial.replace(fn)

with gzip.open(clinvar_file, 'rb') as f:
    records = ndjson.load(f)

# Flatten the nested JSON records into columns (e.g. "out._id", used below)
clinvar_variants = pd.json_normalize(records)

In [6]:
# Get GENIE variant data (tab-separated despite the .csv extension).
# Later cells add columns to this frame in place (see get_vrs_ids and the in_* flag columns).
genie_variants_df = pd.read_csv("../pre_variant_analysis/variation_normalizer_output/genie_variants_df.csv", sep="\t")
genie_variants_df.shape

(1432052, 12)

### Normalized Genomic and Protein GENIE variants 
Due to GENIE licensing restrictions, the datasets below must be generated locally by running the analysis notebook in `genie/pre_variant_analysis`.

In [7]:
# Genomic GENIE queries that could be normalized (tab-separated); the `query` and
# `vrs_id` columns of this table are used by the cells below.
genie_genomic_normalized = pd.read_csv("../pre_variant_analysis/variation_normalizer_output/able_to_normalize_genomic_queries.csv",sep="\t")

In [8]:
# Protein GENIE queries that could be normalized (tab-separated); the `query` and
# `vrs_id` columns of this table are used by the cells below.
genie_protein_normalized = pd.read_csv("../pre_variant_analysis/variation_normalizer_output/able_to_normalize_protein_queries.csv",sep="\t")

## Create Output Directory

In [9]:
# Directory that receives the match lists written by the cells below;
# exist_ok keeps the cell re-runnable when the directory already exists.
path = Path("variant_analysis_output")
path.mkdir(exist_ok = True)

## Variant-Level Analysis
In this section we perform set intersections to determine how many normalized variants from GENIE match variants in MOA, CIViC, and ClinVar.

In [10]:
# Helper functions
def write_to_file(filename: str, data: list) -> None:
    """Write each entry of ``data`` to ``filename``, one entry per line.

    The file is opened in write mode, so any existing content is replaced.

    :param str filename: The file to write to
    :param list data: Variant or patient identifiers (strings) to write
    """
    with open(filename, "w") as out_file:
        out_file.writelines(entry + "\n" for entry in data)

def set_intersection(genie: list, kb: list) -> set:
    """Return the variants that occur in both GENIE and a knowledgebase.

    :param list genie: GENIE variants
    :param list kb: Variants from a knowledgebase (any iterable is accepted)
    :return: set of the variants present in both inputs
    """
    shared = set(genie)
    shared.intersection_update(kb)
    return shared

def var_analysis(genie: list, kb: list, filename: str) -> tuple:
    """Intersect GENIE variants with a knowledgebase and write the matches to a file

    :param list genie: A list of GENIE variants
    :param list kb: Variants from a knowledgebase
    :param str filename: The file to write the shared variants to, one per line
    :return: tuple of (set of shared variants, number of shared variants)
    """
    match = set_intersection(genie, kb)
    # Sets iterate in an arbitrary order (string hashing is randomized per
    # process), so sort before writing to make the output file reproducible.
    write_to_file(filename, sorted(match))
    return match, len(match)

def check_kb(var: str, kb: set) -> bool:
    """Report whether a variant belongs to a set of knowledgebase variants.

    :param str var: The variant to look up
    :param set kb: The variants to search in
    :return: True when ``var`` is a member of ``kb``, otherwise False
    """
    is_member = var in kb
    return is_member

## MOA

In [11]:
# Intersect normalized GENIE protein variants with normalized MOA protein variants
genie_moa_match = var_analysis(
    genie=genie_protein_normalized["vrs_id"].to_list(),
    kb=set(moa_variants["vrs_id"].to_list()),
    filename="variant_analysis_output/genie_moa_match.txt",
)
f"The number of matched protein variants in MOA is: {genie_moa_match[1]}"

'The number of matched protein variants in MOA is: 122'

## CIViC

In [12]:
# Intersect normalized GENIE genomic variants with normalized CIViC variants
genie_civic_match_genomic = var_analysis(
    genie=genie_genomic_normalized["vrs_id"].to_list(),
    kb=set(civic_variants["vrs_id"].to_list()),
    filename="variant_analysis_output/genie_civic_match_genomic.txt",
)
f"The number of matched genomic variants in CIViC is: {genie_civic_match_genomic[1]}"

'The number of matched genomic variants in CIViC is: 203'

In [13]:
# Intersect normalized GENIE protein variants with normalized CIViC variants
genie_civic_match_protein = var_analysis(
    genie=genie_protein_normalized["vrs_id"].to_list(),
    kb=set(civic_variants["vrs_id"].to_list()),
    filename="variant_analysis_output/genie_civic_match_protein.txt",
)
f"The number of matched protein variants in CIViC is: {genie_civic_match_protein[1]}"

'The number of matched protein variants in CIViC is: 967'

In [14]:
# Union of the GENIE genomic and protein variants that matched CIViC
civic_genie_union = genie_civic_match_genomic[0].union(genie_civic_match_protein[0])
# Sort before writing: set iteration order is arbitrary, so writing the set
# directly would produce a differently ordered file from run to run.
write_to_file("variant_analysis_output/genie_civic_match_union.txt", sorted(civic_genie_union))
f"The number of matched variants in CIViC is: {len(civic_genie_union)}"

'The number of matched variants in CIViC is: 1170'

## ClinVar

In [15]:
# Intersect normalized GENIE genomic variants with normalized ClinVar variants
genie_clinvar_match_genomic = var_analysis(
    genie=genie_genomic_normalized["vrs_id"].to_list(),
    kb=set(clinvar_variants["out._id"].to_list()),
    filename="variant_analysis_output/genie_clinvar_match_genomic.txt",
)
f"The number of matched genomic variants in ClinVar is: {genie_clinvar_match_genomic[1]}"

'The number of matched genomic variants in ClinVar is: 94300'

In [16]:
# Intersect normalized GENIE protein variants with normalized ClinVar variants
# (only the intersection is computed here; no output file is written)
genie_clinvar_match_protein = set_intersection(
    genie=genie_protein_normalized["vrs_id"].to_list(),
    kb=set(clinvar_variants["out._id"].to_list()),
)
f"The number of matched protein variants in ClinVar is: {len(genie_clinvar_match_protein)}"

'The number of matched protein variants in ClinVar is: 0'

## Patient-Level Analysis
In this section we summarize variant matching at the patient level, with >= 1 shared variant counting as a match.

### Filter to include normalized genomic and protein variants

In [17]:
# Helper function - create dataframes with vrs_ids for genomic and protein variants
def get_vrs_ids(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Attach VRS IDs to the GENIE variants and keep the rows that have one.

    Each value of ``col_name`` in the module-level ``genie_variants_df`` is looked
    up among the ``query`` values of ``df``, and the corresponding ``vrs_id`` is
    stored in a new column of ``genie_variants_df``: ``vrs_id_genomic`` when
    ``col_name`` is ``"coordinates"``, ``vrs_id_protein`` for any other value.
    Queries without a match get NaN. Note that ``genie_variants_df`` itself is
    modified in place.

    :param pd.DataFrame df: A dataframe with ``query`` and ``vrs_id`` columns
    :param str col_name: Column of ``genie_variants_df`` that holds the queries
    :return: pd.DataFrame The rows of ``genie_variants_df`` whose new VRS ID column is not null
    """
    # When a query occurs more than once, the first VRS ID seen for it is kept
    query_to_vrs_id = {}
    for record in df.itertuples(index=False):
        query_to_vrs_id.setdefault(record.query, record.vrs_id)

    vrs_ids = [
        query_to_vrs_id.get(query, np.nan)
        for query in genie_variants_df[col_name].to_list()
    ]

    target_col = "vrs_id_genomic" if col_name == "coordinates" else "vrs_id_protein"
    genie_variants_df[target_col] = vrs_ids
    return genie_variants_df[genie_variants_df[target_col].notna()]

In [18]:
# Side effect: adds the `vrs_id_genomic` column to genie_variants_df.
# The returned frame holds only the rows with a non-null genomic VRS ID.
genie_variants_genomic = get_vrs_ids(genie_genomic_normalized, "coordinates")

In [19]:
# Side effect: adds the `vrs_id_protein` column to genie_variants_df.
# The returned frame holds only the rows with a non-null protein VRS ID.
genie_variants_protein = get_vrs_ids(genie_protein_normalized, "free_text_p_short")

In [20]:
# Add boolean columns indicating which variants are in which knowledgebase.
# Series.isin does the membership test in one vectorized call instead of a
# Python-level lambda per row; rows without a VRS ID (NaN) come out as False.
genie_variants_df['in_civic_genomic'] = genie_variants_df['vrs_id_genomic'].isin(genie_civic_match_genomic[0])
genie_variants_df['in_clinvar'] = genie_variants_df['vrs_id_genomic'].isin(genie_clinvar_match_genomic[0])
genie_variants_df['in_civic_protein'] = genie_variants_df['vrs_id_protein'].isin(genie_civic_match_protein[0])
genie_variants_df['in_moa'] = genie_variants_df['vrs_id_protein'].isin(genie_moa_match[0])

In [21]:
# Helper function - create patient dictionary
def create_dict(df: pd.DataFrame, var_type: str) -> dict:
    """Group VRS IDs by ``Tumor_Sample_Barcode``.

    Rows whose VRS ID is missing are ignored, so a barcode only appears in the
    result when it has at least one non-null VRS ID.

    NOTE(review): the grouping key is ``Tumor_Sample_Barcode``; if one patient can
    have several barcodes, each would be counted separately — confirm.

    :param pd.DataFrame df: A dataframe with GENIE genomic or protein variants
    :param str var_type: "genomic" reads ``vrs_id_genomic``; any other value reads ``vrs_id_protein``
    :return: A dictionary mapping each barcode to the list of its VRS IDs
    """
    id_field = "vrs_id_genomic" if var_type == "genomic" else "vrs_id_protein"
    variants_by_patient = {}
    for record in df.itertuples(index=False):
        vrs_id = getattr(record, id_field)
        if pd.isna(vrs_id):
            continue
        variants_by_patient.setdefault(record.Tumor_Sample_Barcode, []).append(vrs_id)
    return variants_by_patient

In [22]:
# Create genomic variants patient dictionary
# (maps each Tumor_Sample_Barcode to the list of its genomic VRS IDs)
pvd_genomic = create_dict(genie_variants_genomic, "genomic")

In [23]:
# Create protein variants patient dictionary
# (maps each Tumor_Sample_Barcode to the list of its protein VRS IDs)
pvd_protein = create_dict(genie_variants_protein, "protein")

In [24]:
# Patient Analysis Helper Function
def patient_matching(patient_dict: dict, var_set: set, filename: str) -> tuple:
    """Find the patients that carry at least one variant from ``var_set``

    The matching patients are also written to ``filename``, one per line.

    :param dict patient_dict: A dictionary mapping each patient to a list of variants
    :param set var_set: The set of variants to match against
    :param str filename: The file to write the matching patients to
    :return: tuple ``(count, patients)``: the number of matching patients and the
        list of those patients, in the iteration order of ``patient_dict``
    """
    patients = [
        key
        for key, variants in patient_dict.items()
        # A patient matches as soon as one of its variants is in var_set
        if not set(variants).isdisjoint(var_set)
    ]
    write_to_file(filename, patients)
    return len(patients), patients

### MOA

In [25]:
moa_match = patient_matching(
    patient_dict=pvd_protein,
    var_set=genie_moa_match[0],
    filename="variant_analysis_output/moa_patient_match.txt",
)
f"There are {moa_match[0]} patients with matching protein variant data in MOA"

'There are 41828 patients with matching protein variant data in MOA'

### CIViC

In [26]:
civic_protein_match = patient_matching(
    patient_dict=pvd_protein,
    var_set=genie_civic_match_protein[0],
    filename="variant_analysis_output/civic_protein_patient_match.txt",
)
f"There are {civic_protein_match[0]} patients with matching protein variant data in CIViC"

'There are 84861 patients with matching protein variant data in CIViC'

In [27]:
civic_genomic_match = patient_matching(
    patient_dict=pvd_genomic,
    var_set=genie_civic_match_genomic[0],
    filename="variant_analysis_output/civic_genomic_patient_match.txt",
)
f"There are {civic_genomic_match[0]} patients with matching genomic variant data in CIViC"

'There are 853 patients with matching genomic variant data in CIViC'

In [28]:
# Number of patients that match CIViC on both a genomic and a protein variant
len(set(civic_genomic_match[1]) & set(civic_protein_match[1]))

272

### ClinVar

In [29]:
genie_clinvar_patient_match = patient_matching(
    patient_dict=pvd_genomic,
    var_set=genie_clinvar_match_genomic[0],
    filename="variant_analysis_output/clinvar_patient_match.txt",
)
f"There are {genie_clinvar_patient_match[0]} patients with matching genomic variant data in ClinVar"

'There are 127103 patients with matching genomic variant data in ClinVar'

## Variant Counts per Patient
Compute the average number of genomic and protein variants per patient.

In [30]:
def avg(var_dict: dict) -> float:
    """Compute the mean length of the variant lists in ``var_dict``.

    :param dict var_dict: A dictionary mapping each patient to a list of variants
    :return: float The total number of listed variants divided by the number of patients
    """
    total_variants = sum(len(variants) for variants in var_dict.values())
    return total_variants / len(var_dict)

In [31]:
# Normalized average, genomic
# (taken over the patients that have at least one normalized genomic variant)
avg(pvd_genomic)

8.703645296719163

In [32]:
# Normalized average, protein
# (taken over the patients that have at least one normalized protein variant)
avg(pvd_protein)

7.9632307941219915

In [33]:
# Count, per patient, the rows that normalized as a genomic and/or protein variant
patient_vars = dict()
for row in genie_variants_df.itertuples(index=False):
    # Skip rows that have neither a genomic nor a protein VRS ID
    if pd.isna(row.vrs_id_genomic) and pd.isna(row.vrs_id_protein):
        continue
    key = row.Tumor_Sample_Barcode
    patient_vars[key] = patient_vars.get(key, 0) + 1

In [34]:
# Average number of normalized variants per patient
counts = list(patient_vars.values())
sum(counts)/len(counts)

8.860833124843634

In [35]:
def kb_match(df: pd.DataFrame) -> float:
    """Compute the average number of rows per ``Tumor_Sample_Barcode``.

    :param pd.DataFrame df: The (optionally filtered) dataframe of patient variants
    :return: float The number of rows divided by the number of distinct barcodes
    """
    rows_per_patient = df.groupby('Tumor_Sample_Barcode').size().tolist()
    return sum(rows_per_patient) / len(rows_per_patient)

In [36]:
# Average number of total variants per patient
# (all rows of the GENIE table are counted, whether or not they normalized)
kb_match(genie_variants_df)

9.75133633398476

In [37]:
# Normalized matched genomic variants per patient in CIViC
# (in_civic_genomic is a boolean flag column, so it can be used directly as a mask)
genie_df_civic_genomic = genie_variants_df[genie_variants_df['in_civic_genomic']]
kb_match(genie_df_civic_genomic)

1.010550996483001

In [38]:
# Normalized matched protein variants per patient in CIViC
# (in_civic_protein is a boolean flag column, so it can be used directly as a mask)
genie_df_civic_protein = genie_variants_df[genie_variants_df['in_civic_protein']]
kb_match(genie_df_civic_protein)

1.3796207916475176

In [39]:
# Normalized matched protein variants per patient in MOA
# (in_moa is a boolean flag column, so it can be used directly as a mask)
genie_df_moa = genie_variants_df[genie_variants_df['in_moa']]
kb_match(genie_df_moa)

1.0997417997513628

In [40]:
# Normalized matched genomic variants per patient in ClinVar
# (in_clinvar is a boolean flag column, so it can be used directly as a mask)
genie_df_clinvar = genie_variants_df[genie_variants_df['in_clinvar']]
kb_match(genie_df_clinvar)

3.22410958041903

In [41]:
# Normalized matching across knowledgebases, genomic
# (rows flagged as matching CIViC or ClinVar on the genomic VRS ID)
genie_df_genomic_combined = genie_variants_df[
    genie_variants_df['in_civic_genomic'] | genie_variants_df['in_clinvar']
]
kb_match(genie_df_genomic_combined)

3.2239036053608507

In [42]:
# Normalized matching across knowledgebases, protein
# (rows flagged as matching CIViC or MOA on the protein VRS ID)
genie_df_protein_combined = genie_variants_df[
    genie_variants_df['in_civic_protein'] | genie_variants_df['in_moa']
]
kb_match(genie_df_protein_combined)

1.391213535589265

In [43]:
# Normalized matching across all knowledgebases
# (rows flagged in at least one of the four in_* columns)
genie_df_filtered = genie_variants_df[
    genie_variants_df['in_civic_protein']
    | genie_variants_df['in_moa']
    | genie_variants_df['in_civic_genomic']
    | genie_variants_df['in_clinvar']
]
kb_match(genie_df_filtered)

3.247233190806639