In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statannotations.Annotator import Annotator

In [None]:
# input files/dirs
# Directory of per-subtype TSVs with annotated mutations (one file per cancer type).
ANNOTATED_MUTATIONS = "../data/annotated_snv_mv_indels_by_cancer_subtype"
# TSV with per-sample genome-wide mutation data.
GENOME_WIDE_MUTATIONS = "../data/genome_wide_mutation_data.tsv"
# Directory of per-subtype TSVs listing driver genes.
DRIVER_GENES = "../data/driver_genes"
# CSV whose "tumour_specimen_aliquot_id" column lists the whitelisted samples.
WHITELISTED_SAMPLES = "../data/datasets/PCAWG/supplementary Tables/Supplementary Table 1.csv"

# output files/dirs
MUTATION_DENSITY_GENIC_REGIONS = "../plot_data/mutation_density_genic_regions"

# exist_ok=True makes this a no-op when the directory is already there, which
# replaces the separate exists-check and cannot fail if the directory appears
# between the check and the creation.
os.makedirs(MUTATION_DENSITY_GENIC_REGIONS, exist_ok=True)

In [3]:
# Cancer subtype to process. The value is used as the file stem of the
# per-subtype annotated-mutation and driver-gene TSVs that are read, and of
# the TSV written to the output directory.
cancer_type = "Liver-HCC"

In [4]:
# Read the sample whitelist and keep each tumour aliquot ID once, in order of
# first appearance.
whitelisted_data = pd.read_csv(WHITELISTED_SAMPLES, sep=",", header=0)
aliquot_ids = whitelisted_data["tumour_specimen_aliquot_id"]
whitelisted_samples = aliquot_ids.drop_duplicates().tolist()
print(f"Number of unique samples in the whitelist: {len(whitelisted_samples)}")

Number of unique samples in the whitelist: 2583


In [5]:
# Load the subtype's annotated mutations and its driver-gene table, keep only
# mutations whose gene is a listed driver gene, then attach the driver-gene
# table's columns to every remaining mutation (left join on "gene").
subtype_file = cancer_type + ".tsv"
mut_df = pd.read_csv(os.path.join(ANNOTATED_MUTATIONS, subtype_file), sep="\t")
driver_genes = pd.read_csv(os.path.join(DRIVER_GENES, subtype_file), sep="\t")
driver_genes_list = driver_genes["gene"].tolist()
in_driver_gene = mut_df["gene"].isin(driver_genes_list)
mut_df = mut_df.loc[in_driver_gene].merge(driver_genes, on="gene", how="left")

In [6]:
# Shape before filtering, for comparison with the shape printed afterwards.
print(mut_df.shape)
# filter out samples that are not in the whitelist
is_whitelisted = mut_df["Tumor_Sample_Barcode"].isin(whitelisted_samples)
# .copy() gives an independent frame, so the column assignments made to
# mut_df in later cells do not act on a slice (avoids SettingWithCopyWarning).
mut_df = mut_df.loc[is_whitelisted].copy()
print(mut_df.shape)

(6275, 32)
(6043, 32)


In [7]:
# Show the raw annotation counts before any relabelling.
print(mut_df["genic_region"].value_counts())

# Fold both ncRNA sub-categories into a single "ncRNA" label.
ncrna_labels = ["ncRNA_exonic", "ncRNA_intronic"]
mut_df["genic_region"] = mut_df["genic_region"].replace(ncrna_labels, "ncRNA")

# Regions kept by name; every other annotation is relabelled "-".
regions = ["UTR5", "UTR3", "ncRNA", "exonic", "intronic", "splicing", "upstream", "downstream"]
is_known_region = mut_df["genic_region"].isin(regions)
mut_df["genic_region"] = mut_df["genic_region"].where(is_known_region, "-")
mut_df["genic_region"].value_counts()

genic_region
intronic          5435
exonic             376
ncRNA_intronic      76
intergenic          72
upstream            21
splicing            19
UTR3                16
ncRNA_exonic        14
downstream          10
UTR5                 4
Name: count, dtype: int64


genic_region
intronic      5435
exonic         376
ncRNA           90
-               72
upstream        21
splicing        19
UTR3            16
downstream      10
UTR5             4
Name: count, dtype: int64

In [8]:
# Genic regions whose length is read from a column of the mutation row,
# mapped to the name of that column.
region_length_map = {
    "intronic": "intron_length",
    "exonic": "exon_length",
    "UTR3": "3UTR_length",
    "UTR5": "5UTR_length"
}

# Genic regions that have no length column and use a fixed value instead.
_FIXED_REGION_LENGTHS = {
    "upstream": 1000,
    "downstream": 1000,
    "splicing": 1,  # regions not well defined
    "ncRNA": 1,  # regions not well defined
}


def get_region_length(row):
    """Return the length to use for the genic region of one mutation row.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide "genic_region". For regions listed in
        `region_length_map` it must also provide the mapped length column.

    Returns
    -------
    The value of the mapped length column for intronic/exonic/UTR3/UTR5,
    1000 for upstream/downstream, 1 for splicing/ncRNA, and NaN for any
    other region (for example "-"). The NaN is returned explicitly instead
    of falling off the end of the function with an implicit None.
    """
    region = row["genic_region"]
    length_column = region_length_map.get(region)
    if length_column is not None:
        return row[length_column]
    return _FIXED_REGION_LENGTHS.get(region, float("nan"))

# Row-wise lookup (axis=1): get_region_length is called once per mutation row.
# Regions that get_region_length does not cover (e.g. "-") are left without a
# usable length.
mut_df["region_length"] = mut_df.apply(get_region_length, axis=1)

In [9]:
# passenger density for each gene in each sample
def _count_non_driver(driver_flags):
    """Return how many entries of `driver_flags` are False."""
    return driver_flags.value_counts().get(False, 0)


# One row per (sample, gene, genic region): the number of mutations whose
# "driver" flag is False, whether any row of the group has "has_driver" set,
# and the group's first available region length.
# NOTE(review): "has_driver" is aggregated per genic region here; it is only a
# gene-level flag if the input column is already constant per (sample, gene) —
# confirm upstream.
group_keys = ["Tumor_Sample_Barcode", "gene", "genic_region"]
passenger_density = (
    mut_df.groupby(group_keys)
    .agg(
        passenger_density=("driver", _count_non_driver),
        has_driver=("has_driver", "any"),  # driver mutation in any genic region of the gene
        region_length=("region_length", "first"),
    )
    .reset_index()
)

# adjust for genome-wide mutation rate, cna burden
genome_wide_mutations = pd.read_csv(GENOME_WIDE_MUTATIONS, sep="\t")
print(passenger_density.shape, genome_wide_mutations.shape)
passenger_density = passenger_density.merge(genome_wide_mutations, on="Tumor_Sample_Barcode", how="left")
passenger_density["cna_burden"] = passenger_density["cna_burden"].div(100)  # convert from percentage to fraction
print(passenger_density.shape)

# Adjusted value = count * 3.2e9 / (total_mutations * cna_burden * region_length).
# NOTE(review): 3.2e9 is presumably the approximate genome length in bp — confirm.
scaled_count = passenger_density["passenger_density"] * 3.2e9
normaliser = passenger_density["total_mutations"] * passenger_density["cna_burden"] * passenger_density["region_length"]
passenger_density["adj_passenger_density"] = scaled_count / normaliser

output_columns = [
    "Tumor_Sample_Barcode",
    "gene",
    "has_driver",
    "genic_region",
    "passenger_density",
    "total_mutations",
    "adj_passenger_density",
]
passenger_density = passenger_density[output_columns]

(1492, 6) (2778, 4)
(1492, 9)


In [10]:
# Lookup from (sample, gene) to that pair's has_driver flag.
driver_status = (
    passenger_density[["Tumor_Sample_Barcode", "gene", "has_driver"]]
    .drop_duplicates()
    .set_index(["Tumor_Sample_Barcode", "gene"])["has_driver"]
    .to_dict()
)

In [11]:
# add rows for samples with no mutations in driver genes
# Every (sample, driver gene, genic region) combination that has no row yet
# gets a zero-count row, so each sample covers the full gene x region grid.
tumors = passenger_density["Tumor_Sample_Barcode"].unique()

# The driver-gene table does not depend on the sample, so read it once
# instead of once per tumour.
driver_genes = pd.read_csv(f"{DRIVER_GENES}/{cancer_type}.tsv", sep="\t")
driver_genes_list = driver_genes["gene"].tolist()

# Combinations already present, as a set for constant-time membership tests
# (replaces a boolean-mask scan of the whole frame per combination).
existing_keys = set(zip(
    passenger_density["Tumor_Sample_Barcode"],
    passenger_density["gene"],
    passenger_density["genic_region"],
))

# First total_mutations value listed for each sample in the genome-wide table.
total_mutations_by_tumor = (
    genome_wide_mutations.drop_duplicates("Tumor_Sample_Barcode")
    .set_index("Tumor_Sample_Barcode")["total_mutations"]
    .to_dict()
)

# Collect the missing rows as records and concatenate once; calling pd.concat
# per row copies the whole frame each time (quadratic in the number of rows).
missing_rows = []
for tumor in tumors:
    # A sample absent from the genome-wide table gets NaN, matching what the
    # left merge produces for the rows that already exist.
    tumor_total_mutations = total_mutations_by_tumor.get(tumor, float("nan"))
    for gene in driver_genes_list:
        has_driver = driver_status.get((tumor, gene), False)
        for region in regions:
            key = (tumor, gene, region)
            if key in existing_keys:
                continue
            # Remember the key so a gene listed twice is only filled once.
            existing_keys.add(key)
            missing_rows.append({
                "Tumor_Sample_Barcode": tumor,
                "gene": gene,
                "has_driver": has_driver,
                "genic_region": region,
                "passenger_density": 0,
                "total_mutations": tumor_total_mutations,
                "adj_passenger_density": 0,
            })

if missing_rows:
    # ignore_index=True yields a unique RangeIndex instead of repeated labels.
    passenger_density = pd.concat(
        [passenger_density, pd.DataFrame(missing_rows)],
        ignore_index=True,
    )
print(passenger_density.shape)
passenger_density.to_csv(f"{MUTATION_DENSITY_GENIC_REGIONS}/{cancer_type}.tsv", sep="\t", index=False)

(32722, 7)
