In [14]:
import os
import pandas as pd
import numpy as np

In [15]:
# --- Input locations ---------------------------------------------------------
# Every path is relative to the notebook's working directory. The shared roots
# are defined once so that relocating the data is a one-line change; the
# resulting path strings are identical to the previously hard-coded literals.
DATA_DIR = "../data"
PCAWG_DIR = f"{DATA_DIR}/datasets/PCAWG"
TCGA_DIR = f"{DATA_DIR}/datasets/TCGA"

# PCAWG consensus SNV/MNV/indel calls (MAF files, read as tab-separated below).
# NOTE(review): `tcgc_mut` looks like a typo for "tcga"; the name is kept
# because the loading cell reads the TCGA MAF through it.
icgc_mut = f"{PCAWG_DIR}/mutations/snv_mnv_indel/final_consensus_passonly.snv_mnv_indel.icgc.public.maf"
tcgc_mut = f"{PCAWG_DIR}/mutations/snv_mnv_indel/final_consensus_passonly.snv_mnv_indel.tcga.controlled.maf"

# PCAWG driver-mutation tables (tab-separated).
icgc_drivers = f"{PCAWG_DIR}/driver_mutations/TableS3_panorama_driver_mutations_ICGC_samples.public.tsv"
tcga_drivers = f"{PCAWG_DIR}/driver_mutations/TableS3_panorama_driver_mutations_TCGA_samples.controlled.tsv"

# Supplementary table that supplies the sample whitelist (comma-separated).
suppl_table = f"{PCAWG_DIR}/supplementary Tables/Supplementary Table 1.csv"

# Not referenced by the cells visible in this part of the notebook.
cnv_data = f"{PCAWG_DIR}/mutations/cnv"
mut_dir = f"{DATA_DIR}/snv_mv_indels_by_cancer_subtype"
GENOME_WIDE_MUT_DATA = f"{DATA_DIR}/genome_wide_mutation_data.tsv"

# GDC manifest and sample sheet for the TCGA download (tab-separated).
MANIFEST_FILE = f"{TCGA_DIR}/gdc_manifest.2025-10-03.170810.txt"
SAMPLE_SHEET = f"{TCGA_DIR}/gdc_sample_sheet.2025-10-03.tsv"

In [16]:
# Load the supplementary table and collect the distinct tumour aliquot IDs.
# The resulting list is what the later sample-filtering cell uses as the
# whitelist. A comma separator and a first-row header are read_csv's defaults.
suppl_data = pd.read_csv(suppl_table)
whitelisted_samples = (
    suppl_data["tumour_specimen_aliquot_id"]
    .drop_duplicates()  # first occurrence of each ID, in file order
    .tolist()
)
n_whitelisted = len(whitelisted_samples)
print(f"Number of unique samples in the whitelist: {n_whitelisted}")

Number of unique samples in the whitelist: 2583


In [17]:
# Restrict the supplementary table to specimens whose submitted ID mentions TCGA.
# na=False keeps rows with a missing specimen ID out of the selection (a NaN in
# the mask would make the boolean indexing raise); regex=False makes this a
# plain substring match rather than a regular-expression search.
is_tcga = suppl_data["submitted_specimen_id"].str.contains("TCGA", regex=False, na=False)
tcga_specimen = suppl_data[is_tcga]
tcga_specimen_ids = tcga_specimen["submitted_specimen_id"].unique().tolist()
print(f"Number of unique TCGA samples in the whitelist: {len(tcga_specimen_ids)}")

# The patient ID is taken to be the first three dash-separated fields of the
# specimen ID. De-duplicate through a set, then sort so the list order is
# reproducible across kernels (set iteration order of strings is not).
tcga_patient_ids = sorted(
    {"-".join(specimen_id.split("-")[:3]) for specimen_id in tcga_specimen_ids}
)
print(f"Number of unique TCGA patients in the whitelist: {len(tcga_patient_ids)}")

Number of unique TCGA samples in the whitelist: 801
Number of unique TCGA patients in the whitelist: 801


In [18]:
# Read the GDC manifest and sample sheet (both tab-separated) and drop exact
# duplicate rows from each.
manifest_df = pd.read_csv(MANIFEST_FILE, sep="\t").drop_duplicates()
sample_df = pd.read_csv(SAMPLE_SHEET, sep="\t").drop_duplicates()

# Attach sample-sheet metadata to every manifest file. This is a left join, so
# manifest rows with no matching "File ID" are kept with NaN metadata.
manifest_df = manifest_df.merge(sample_df, left_on="id", right_on="File ID", how="left")

# "Case ID" may hold several comma-separated IDs; the first one is used as the
# patient identifier.
manifest_df["Patient_ID"] = manifest_df["Case ID"].str.split(", ").str[0]

# Drop missing IDs before de-duplicating: otherwise files that did not match
# the sample sheet would add NaN as a spurious "patient", inflating the count
# and leaking into later set operations on `patients`.
patients = manifest_df["Patient_ID"].dropna().unique().tolist()
print("Total TCGA patients in the manifest:", len(patients))

Total TCGA patients in the manifest: 8536


In [19]:
# Patients that appear in the GDC manifest but not among the PCAWG TCGA patients.
extra_tcga_patients = set(patients).difference(tcga_patient_ids)
n_extra = len(extra_tcga_patients)
print("Number of extra TCGA patients not in the PCAWG dataset:", n_extra)

Number of extra TCGA patients not in the PCAWG dataset: 7958


In [20]:
# Columns kept from both MAF files. Donor_ID is additionally kept for ICGC only,
# so it is NaN on the TCGA rows of the concatenated frame.
maf_columns = [
    "Hugo_Symbol", "Chromosome", "Start_position", "End_position", "Strand",
    "Reference_Allele", "Tumor_Seq_Allele1", "Tumor_Seq_Allele2",
    "Variant_Classification", "Tumor_Sample_Barcode", "Project_Code",
]
icgc_columns = maf_columns + ["Donor_ID"]

# `icgc_mut` starts out as the ICGC MAF path and is rebound to the loaded frame
# here. The guard makes the cell safe to re-run: without it a second execution
# would hand a DataFrame to read_csv.
if not isinstance(icgc_mut, pd.DataFrame):
    # usecols parses only the needed columns instead of the whole MAF; the
    # trailing selection restores the requested column order, because usecols
    # returns columns in file order.
    icgc_mut = pd.read_csv(icgc_mut, sep="\t", usecols=icgc_columns, low_memory=False)[icgc_columns]

tcga_mut = pd.read_csv(tcgc_mut, sep="\t", usecols=maf_columns, low_memory=False)[maf_columns]

# Stack ICGC and TCGA calls into one frame with a fresh 0..n-1 index.
mut_df = pd.concat([icgc_mut, tcga_mut], axis=0, ignore_index=True)

In [21]:
# Read the ICGC and TCGA driver-mutation tables (tab-separated) and stack them
# row-wise into a single frame with a fresh 0..n-1 index.
drivers_icgc, drivers_tcga = (
    pd.read_csv(driver_path, sep="\t", low_memory=False)
    for driver_path in (icgc_drivers, tcga_drivers)
)
driver_mut = pd.concat((drivers_icgc, drivers_tcga), ignore_index=True)

In [22]:
# One row per distinct (sample barcode, project code) pair found in the
# mutation calls, annotated with whitelist membership and tissue.
all_samples = (
    mut_df[["Tumor_Sample_Barcode", "Project_Code"]]
    .drop_duplicates()
    .rename(columns={"Tumor_Sample_Barcode": "sample_id", "Project_Code": "ttype"})
    .assign(
        # isin does a hashed lookup; the previous per-row `x in list` test
        # scanned the whole whitelist for every sample.
        whitelisted=lambda df: df["sample_id"].isin(whitelisted_samples),
        # tissue = the part of the project code before the first dash
        tissue=lambda df: df["ttype"].str.split("-").str[0],
    )
)

# Keep only whitelisted samples (the original row index is retained).
all_samples = all_samples[all_samples["whitelisted"]]
print(f"Number of unique samples in the whitelist: {len(all_samples)}")

Number of unique samples in the whitelist: 2583


In [23]:
# Number of driver-table rows per (sample, tumour type) pair.
drivers_per_sample = (
    driver_mut
    .groupby(["sample_id", "ttype"])
    .size()
    .reset_index(name="num_drivers")
)
print(drivers_per_sample.shape)

# Left-join onto the whitelisted samples so that samples with no driver rows
# are kept; their missing count is filled with 0.
driver_count = all_samples.merge(drivers_per_sample, on=["sample_id", "ttype"], how="left")
driver_count = driver_count.assign(num_drivers=driver_count["num_drivers"].fillna(0))
print(driver_count.shape)

(2354, 3)
(2583, 5)


In [24]:
# percentage of samples with driver mutations
# 1 when the sample has at least one driver, else 0 (vectorised comparison
# instead of a per-row lambda followed by a redundant cast).
driver_count["has_driver"] = (driver_count["num_drivers"] > 0).astype(int)
counts = driver_count["has_driver"].value_counts()
print(counts)
# .get avoids a KeyError when no sample carries a driver (label 1 absent).
n_with_driver = counts.get(1, 0)
print(f"Percentage of samples with driver mutations: {n_with_driver / counts.sum() * 100:.2f}%")

# average driver count per sample
mean_driver_count = driver_count["num_drivers"].mean()
print(f"Mean driver count: {mean_driver_count}")

# average driver count per tissue type, highest mean first
mean_driver_count_by_tissue = (
    driver_count
    .groupby("tissue")["num_drivers"]
    .mean()
    .reset_index()
    .sort_values("num_drivers", ascending=False)
)

has_driver
1    2354
0     229
Name: count, dtype: int64
Percentage of samples with driver mutations: 91.13%
Mean driver count: 5.304297328687572


In [25]:
# Number of driver-table rows recorded for each (sample, gene) pair.
per_gene_sizes = driver_mut.groupby(["sample_id", "gene"]).size()
driver_gene_count = per_gene_sizes.reset_index(name="num_drivers")
print(driver_gene_count.shape)

# stats on number of drivers
num_drivers_summary = driver_gene_count["num_drivers"].describe()
print(num_drivers_summary)

(10932, 3)
count    10932.000000
mean         1.253293
std          0.466775
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          7.000000
Name: num_drivers, dtype: float64


In [26]:
# How many (sample, gene) pairs carry 1, 2, 3, ... driver rows.
driver_gene_count.loc[:, "num_drivers"].value_counts()

num_drivers
1    8281
2    2557
3      80
4       8
5       3
6       2
7       1
Name: count, dtype: int64