In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Input locations, relative to the notebook's working directory.
# Directory holding one "<cancer_type>.tsv" of annotated mutations per cancer type.
ANNOTATED_MUTATIONS = "../data/annotated_snv_mv_indels_by_cancer_subtype"
# Directory holding one "<cancer_type>.tsv" driver-gene table per cancer type.
DRIVER_GENES = "../data/driver_genes"
# CSV whose "tumour_specimen_aliquot_id" column lists the samples to keep.
WHITELISTED_SAMPLES = "../data/datasets/PCAWG/supplementary Tables/Supplementary Table 1.csv"

# Output directory for the per-cancer-type TSV written by this notebook.
CADD_SCORES_GENIC_REGIONS = "../plot_data/cadd_scores_genic_regions"

# exist_ok=True makes this safe to re-run and avoids the check-then-create race
# of testing os.path.exists() first.
os.makedirs(CADD_SCORES_GENIC_REGIONS, exist_ok=True)

In [3]:
# Cancer type processed by this notebook. The value is used as the file stem
# ("<cancer_type>.tsv") for the mutation input, the driver-gene input and the output TSV.
cancer_type = "Liver-HCC"

In [4]:
# Read the whitelist table and collect its distinct tumour aliquot IDs,
# keeping them in order of first appearance.
whitelisted_data = pd.read_csv(WHITELISTED_SAMPLES, sep=",", header=0)
aliquot_ids = whitelisted_data["tumour_specimen_aliquot_id"]
whitelisted_samples = aliquot_ids.drop_duplicates().tolist()
print(f"Number of unique samples in the whitelist: {len(whitelisted_samples)}")

Number of unique samples in the whitelist: 2583


In [5]:
# Load the annotated mutations and the driver-gene table for this cancer type.
mutations_path = os.path.join(ANNOTATED_MUTATIONS, f"{cancer_type}.tsv")
driver_genes_path = os.path.join(DRIVER_GENES, f"{cancer_type}.tsv")
mut_df = pd.read_csv(mutations_path, sep="\t")
driver_genes = pd.read_csv(driver_genes_path, sep="\t")
driver_genes_list = driver_genes["gene"].tolist()

# Keep only mutations that fall in a driver gene, then attach each gene's length.
in_driver_gene = mut_df["gene"].isin(driver_genes_list)
mut_df = mut_df[in_driver_gene].merge(
    driver_genes[["gene", "gene_length"]],
    on="gene",
    how="left",
)
print(mut_df.shape)

# Drop mutations whose sample barcode is not on the whitelist.
is_whitelisted = mut_df["Tumor_Sample_Barcode"].isin(whitelisted_samples)
mut_df = mut_df[is_whitelisted]
print(mut_df.shape)

(6275, 14)
(6043, 14)


In [6]:
# Keep only mutations that have a raw CADD score (the original note says the
# rows without one are indels and MNVs).
print(mut_df.shape)
mut_df = mut_df[mut_df["CADD_score_raw"].notnull()]
print(mut_df.shape)
# reset_index() without drop=True stores the previous row labels in a new
# "index" column; that is why the column count grows by one here. The column
# is kept on purpose so the frame has the same columns as before.
mut_df = mut_df.reset_index()
print(mut_df.shape)


def _min_max_scale(scores):
    """Min-max scale one gene's raw CADD scores.

    Returns (scores - min) / (max - min) for the group. When all scores in the
    group are equal the denominator would be zero, so the scalar 0.5 is
    returned instead and groupby.transform broadcasts it over the group.
    """
    lowest = scores.min()
    highest = scores.max()
    if highest == lowest:
        return 0.5
    return (scores - lowest) / (highest - lowest)


# Normalize CADD scores within each gene.
mut_df["CADD_score_normalized"] = mut_df.groupby(["gene"])["CADD_score_raw"].transform(_min_max_scale)

# Derive the three-way driver status label, vectorised instead of row-wise apply.
# astype(bool) applies the same truthiness rule as the original `if x` test.
passenger_label = mut_df["has_driver"].astype(bool).map({
    True: "Passengers in presence of driver",
    False: "Passengers in absence of driver",
})
# Rows whose "driver" value equals True override the passenger label.
mut_df["has_driver"] = passenger_label.mask(mut_df["driver"].eq(True), "Drivers")

(6043, 14)
(5562, 14)
(5562, 15)


In [7]:
# Show the annotation categories as loaded, before they are harmonised.
print(mut_df["genic_region"].value_counts())

# Fold both ncRNA sub-categories into a single "ncRNA" label.
ncrna_aliases = {"ncRNA_exonic": "ncRNA", "ncRNA_intronic": "ncRNA"}
mut_df["genic_region"] = mut_df["genic_region"].replace(ncrna_aliases)

regions = [
    "UTR5",
    "UTR3",
    "ncRNA",
    "exonic",
    "intronic",
    "splicing",
    "upstream",
    "downstream",
    "intergenic",
]
# Any value that is not one of the labels above is replaced by "-".
is_known_region = mut_df["genic_region"].isin(regions)
mut_df["genic_region"] = mut_df["genic_region"].where(is_known_region, "-")

# Save the processed table for this cancer type, then display the final counts.
mut_df.to_csv(f"{CADD_SCORES_GENIC_REGIONS}/{cancer_type}.tsv", sep="\t", index=False)
mut_df["genic_region"].value_counts()

genic_region
intronic          5037
exonic             314
ncRNA_intronic      69
intergenic          62
upstream            20
splicing            18
UTR3                15
ncRNA_exonic        13
downstream          10
UTR5                 4
Name: count, dtype: int64


genic_region
intronic      5037
exonic         314
ncRNA           82
intergenic      62
upstream        20
splicing        18
UTR3            15
downstream      10
UTR5             4
Name: count, dtype: int64