# Data Loading

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import seaborn as sns

In [6]:
# Load the tab-separated variant table final_filtered_combined.tsv into `ogdf`,
# the starting frame that the annotation cells below add columns to.
variants_source_path = r'D:\4bc_Gdrive\My Drive\Pathogenic_Landscape\data\absolute\clinical_research_filtered_combined\final_filtered_combined.tsv'
ogdf = pd.read_csv(variants_source_path, sep='\t', engine='pyarrow')

Total number of variants = 42,06,676

In [7]:
# (rows, columns) of the raw variant table
print(tuple(ogdf.shape))

(4206676, 15)


In [8]:
# Preview the first 500 variant rows
ogdf.iloc[:500]

Unnamed: 0,Sample_Name,CHROM_x,POS_x,End_x,REF_x,ALT_x,Ref.Gene,Func.ensGene,ExonicFunc.ensGene,AAChange.ensGene,Interpro_domain,avsnp150,CLNDN,CLNDISDB,clinvar: Clinvar
0,A549-NCCS-SE8,chr1,1858465,1858466,GGA,G,CFAP74,exonic,frameshift deletion,ENSG00000142609:ENST00000493964:exon12:c.1378_...,.,.,.,.,clinvar: UNK
1,A549-NCCS-SE8,chr1,1861752,1861761,AGCCCGGCAAG,A,CFAP74,exonic,frameshift deletion,ENSG00000142609:ENST00000493964:exon9:c.875_88...,.,.,.,.,clinvar: UNK
2,A549-NCCS-SE8,chr1,2328643,2328643,CG,C,RER1,exonic,frameshift deletion,ENSG00000157916:ENST00000488353:exon2:c.170del...,.,.,.,.,clinvar: UNK
3,A549-NCCS-SE8,chr1,2411743,2411743,A,AACAT,PLCH2,exonic,frameshift insertion,ENSG00000149527:ENST00000378486:exon4:c.637_63...,.,.,.,.,clinvar: UNK
4,A549-NCCS-SE8,chr1,2433637,2433637,G,GA,PLCH2,exonic,frameshift insertion,ENSG00000149527:ENST00000288766:exon7:c.612_61...,.,.,.,.,clinvar: UNK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,A549-NCCS-SE8,chr11,134118857,134118860,ATGTG,A,THYN1,intronic,.,.,.,.,.,.,clinvar: UNK
496,A549-NCCS-SE8,chr12,308079,308079,T,TACA,SLC6A12,exonic,nonframeshift insertion,ENSG00000111181:ENST00000397296:exon7:c.729_73...,.,.,.,.,clinvar: UNK
497,A549-NCCS-SE8,chr12,432364,432364,GT,G,KDM5A,exonic,frameshift deletion,ENSG00000073614:ENST00000544760:exon7:c.1016de...,.,.,.,.,clinvar: UNK
498,A549-NCCS-SE8,chr12,527799,527799,T,TA,CCDC77,exonic,frameshift insertion,ENSG00000120647:ENST00000540180:exon3:c.314_31...,.,.,.,.,clinvar: UNK


In [9]:
# How many distinct Sample_Name values appear in the raw table
unique_sample_count = ogdf['Sample_Name'].nunique(dropna=True)
print(f"{unique_sample_count}")

1675


Clinvar Variant Annotations and their count

In [10]:
# Frequency of each ClinVar annotation, most common first
# (note: the column label ends with a trailing space)
ogdf.loc[:, 'clinvar: Clinvar '].value_counts(sort=True, ascending=False)

clinvar: Clinvar 
clinvar: UNK                                                                  4166471
clinvar: Uncertain_significance                                                 23205
clinvar: Conflicting_interpretations_of_pathogenicity                           10627
clinvar: Pathogenic                                                              2420
clinvar: not_provided                                                            1281
clinvar: Likely_pathogenic                                                       1007
.                                                                                 510
clinvar: Pathogenic/Likely_pathogenic                                             448
clinvar: other                                                                    344
clinvar: drug_response                                                            133
clinvar: Conflicting_interpretations_of_pathogenicity,_risk_factor                 58
clinvar: risk_factor                

---


In [14]:
#--- Hotspot Variant Markup ---#
# Load the tab-separated text file with hotspot variants
hotspot_source_path = r"D:\4bc_Gdrive\My Drive\Pathogenic_Landscape\assets\hotspot_v14.txt"
hotspot_source_df = pd.read_csv(hotspot_source_path, sep="\t")

# A hotspot variant is identified by its (chromosome, position, ref, alt) tuple
hotspot_source_variants = set(zip(hotspot_source_df["#CHROM"], hotspot_source_df["POS"], hotspot_source_df["REF"], hotspot_source_df["ALT"]))

# Flag rows whose (CHROM_x, POS_x, REF_x, ALT_x) tuple is a known hotspot: 1 = hotspot, 0 = not.
# The membership test is vectorized through a MultiIndex instead of a Python-level
# row-wise apply, which is very slow on a table with millions of rows.
variant_keys = pd.MultiIndex.from_frame(ogdf[["CHROM_x", "POS_x", "REF_x", "ALT_x"]])
ogdf["Hotspot"] = variant_keys.isin(list(hotspot_source_variants)).astype("int64")

In [15]:
#--- Oncogene and TSG Gene Markup ---#
# Gene-role table (oncogene / tumor suppressor) obtained from OncoKB
oncog_tsg_source_path = r"D:\4bc_Gdrive\My Drive\Pathogenic_Landscape\assets\Oncogene_TSG.txt"
oncog_tsg_source_df = pd.read_csv(oncog_tsg_source_path, sep="\t")

# Gene symbol -> role lookup
oncog_tsg_dict = {
    symbol: role
    for symbol, role in zip(oncog_tsg_source_df["Hugo Symbol"], oncog_tsg_source_df["OncoG_TSG"])
}

# Column of the variant table that holds the gene names
gene_column = "Ref.Gene"

# Fail early if the gene column is absent; otherwise annotate every row,
# leaving an empty string for genes that are not in the lookup.
if gene_column not in ogdf.columns:
    raise ValueError(f"The required column '{gene_column}' is missing in the dataset.")
ogdf["OncoG_TSG"] = ogdf[gene_column].map(oncog_tsg_dict).fillna("")

In [16]:
#--- Load the kinase gene list ---#
# One gene symbol per line; surrounding whitespace is stripped and blank lines are
# skipped so that an empty string never ends up in the gene set.
kinase_gene_file = r"D:\4bc_Gdrive\My Drive\Pathogenic_Landscape\assets\Kinase_Genes.txt"
with open(kinase_gene_file, "r") as f:
    kinase_genes = {line.strip() for line in f if line.strip()}

# Add the "Kinase" column: 1 when Ref.Gene is in the kinase list, else 0.
# Vectorized membership test instead of a per-row Python lambda.
ogdf["Kinase"] = ogdf["Ref.Gene"].isin(kinase_genes).astype("int64")

In [17]:
#--- Removing Non-Cancer Samples ---#
# Sample names to exclude from the analysis (non-cancer samples)
samples_to_remove = [
    'IN-423-TLRA-F-Merged-IE',
    '005N_IE',
    '006N_IE',
    '009N_IE',
    '010N_IE',
    '011N_IE',
]
# Keep only the rows whose 'Sample_Name' is not one of the excluded names
df = ogdf.loc[~ogdf['Sample_Name'].isin(samples_to_remove)]
print(tuple(df.shape))
df['Sample_Name'].nunique()

(4202661, 18)


1669

In [18]:
#--- Cancer info Markup ---#
# Curated per-sample cancer information (tab-separated)
cancer_source_path = r"D:\4bc_Gdrive\My Drive\Pathogenic_Landscape\assets\Absolute Cancer\Cohort_Cancer.txt"
cancer_source_df = pd.read_csv(cancer_source_path, sep="\t")

# Left-join on Sample_Name so every variant row is kept, with or without cancer info.
# NOTE(review): assumes one row per Sample_Name in the cancer file; duplicates there
# would duplicate variant rows - confirm.
df = pd.merge(df, cancer_source_df, how="left", on="Sample_Name")

In [19]:
#--- Removing N/A Cancer Samples ---#
# Discard rows that have no value in the 'Cancer' column
df = df.dropna(axis=0, how='any', subset=['Cancer'])

In [20]:
# Unique samples per cancer type
(
    df.groupby('Cancer')['Sample_Name']
    .nunique()
    .rename('Unique_Sample_Count')
    .reset_index()
)

Unnamed: 0,Cancer,Unique_Sample_Count
0,Brain,11
1,Breast,109
2,Cervix,13
3,Esophagus,25
4,Gallbladder,61
5,Head and Neck,68
6,Kidney,13
7,Large Intestine,111
8,Liver and intrahepatic bile ducts,39
9,Lung,194


In [21]:
# Full cancer-type name -> short label used on plot axes
cancer_mapping = {
    "Brain": "BrC",
    "Breast": "BC",
    "Cervix": "CC",
    "Esophagus": "EC",
    "Gallbladder": "GBC",
    "Head and Neck": "HNC",
    "Kidney": "KC",
    "Large Intestine": "LIC",
    "Liver and intrahepatic bile ducts": "LBC",
    "Lung": "LC",
    "Oral": "OC",
    "Others": "Others",
    "Ovary": "OvC",
    "Pancreas": "PC",
    "Prostate": "PrC",
    "Rectum": "RC",
    "Sarcoma": "SC",
    "Stomach": "StC",
    "Thyroid": "TC",
    "Unknown primary": "UPC",
    "Urinary Bladder": "UBC",
    "Uterus": "UtC"
}

# Apply mapping. `assign` returns a new frame, so this does not write into a frame
# derived from an earlier filtering step (which can raise SettingWithCopyWarning)
# and the cell gives the same result when re-run. Cancer names missing from the
# mapping become NaN.
df = df.assign(Cancer_Short=df["Cancer"].map(cancer_mapping))

In [22]:
# Unique samples per short cancer label
(
    df.groupby('Cancer_Short')['Sample_Name']
    .nunique()
    .rename('Unique_Sample_Count')
    .reset_index()
)

Unnamed: 0,Cancer_Short,Unique_Sample_Count
0,BC,109
1,BrC,11
2,CC,13
3,EC,25
4,GBC,61
5,HNC,68
6,KC,13
7,LBC,39
8,LC,194
9,LIC,111


In [None]:
# Unique samples per short cancer label, copied to the system clipboard
df_cancercounts = (
    df.groupby('Cancer_Short')['Sample_Name']
    .nunique()
    .rename('Unique_Sample_Count')
    .reset_index()
)
df_cancercounts.to_clipboard()

In [23]:
# Cohort size after filtering; reused below as the denominator for sample percentages
total_unique_samples_in_df = df['Sample_Name'].nunique(dropna=True)
print(f"{total_unique_samples_in_df}")

1375


In [None]:
# (rows, columns) of the annotated cohort table
print(tuple(df.shape))

In [24]:
# First five rows of the annotated cohort table
df.iloc[:5]

Unnamed: 0,Sample_Name,CHROM_x,POS_x,End_x,REF_x,ALT_x,Ref.Gene,Func.ensGene,ExonicFunc.ensGene,AAChange.ensGene,Interpro_domain,avsnp150,CLNDN,CLNDISDB,clinvar: Clinvar,Hotspot,OncoG_TSG,Kinase,Cancer,Cancer_Short
12012,IN-423-TKKC-F-Merged,chr1,1374966,1374966,TG,T,VWA1,exonic,frameshift deletion,ENSG00000179403:ENST00000404702:exon3:c.501del...,.,.,.,.,clinvar: UNK,0,,0,Lung,LC
12013,IN-423-TKKC-F-Merged,chr1,1581065,1581065,G,C,CDK11B,intronic,.,.,.,rs77085542,.,.,clinvar: UNK,0,,0,Lung,LC
12014,IN-423-TKKC-F-Merged,chr1,1961365,1961365,G,T,GABRD,intronic,.,.,.,.,.,.,clinvar: UNK,0,,0,Lung,LC
12015,IN-423-TKKC-F-Merged,chr1,3102848,3102848,C,T,PRDM16,exonic,nonsynonymous SNV,ENSG00000142611:ENST00000270722:exon2:c.C197T:...,.,rs374012976,Left_ventricular_noncompaction_8,"MedGen:C3809288,OMIM:615373",clinvar: Uncertain_significance,0,,0,Lung,LC
12016,IN-423-TKKC-F-Merged,chr1,6515380,6515380,G,T,ESPN,intronic,.,.,.,rs75827056,.,.,clinvar: UNK,0,,0,Lung,LC


---

## Pathogenic Variants Subset Creation

In [None]:
# ClinVar filters (case-insensitive literal substring tests; missing values count as no match):
#   clinvar_patho       - annotation mentions "patho"
#   clinvar_no_conflict - annotation does NOT mention "conflict"
clinvar_patho = df['clinvar: Clinvar '].str.contains("patho", case=False, regex=False, na=False)
clinvar_no_conflict = ~df['clinvar: Clinvar '].str.contains("conflict", case=False, regex=False, na=False)

# Func.ensGene filter: keep exonic and/or splicing variants only
desired_func_values = ['exonic', 'splicing', 'exonic;splicing']
func_ensgene_filter = df['Func.ensGene'].isin(desired_func_values)

# A row must satisfy all three conditions to enter the pathogenic subset
patho_only_df = df.loc[func_ensgene_filter & clinvar_patho & clinvar_no_conflict]

In [None]:
# Unique samples, then (rows, columns), of the pathogenic subset
print(patho_only_df['Sample_Name'].nunique(), tuple(patho_only_df.shape), sep="\n")

In [None]:
# Copy the pathogenic subset to the system clipboard
patho_only_df.to_clipboard(excel=True, sep=None)

---

## Oncogene

In [None]:
#--- Pathogenic Oncogene Viz ---#
# Pathogenic variants in genes labelled 'Oncogene' or 'Both'
oncogene_df = patho_only_df.loc[patho_only_df["OncoG_TSG"].isin(['Oncogene', 'Both'])]
oncogene_df.head(5)

In [None]:
# (rows, columns) of the oncogene subset, then its number of unique samples
print(tuple(oncogene_df.shape))
oncogene_df['Sample_Name'].nunique(dropna=True)

In [None]:
# --- Bar Plot 1: Top Genes ---

# Number of distinct samples with a pathogenic variant, per oncogene
unique_gene_counts = oncogene_df.groupby('Ref.Gene')['Sample_Name'].nunique()

# Denominator for the percentages: unique samples in the whole df cohort.
# Alternative (oncogene subset only):
# total_unique_samples = oncogene_df['Sample_Name'].nunique()
total_unique_samples = total_unique_samples_in_df

# Share of the cohort affected, per gene
unique_gene_percentage = unique_gene_counts / total_unique_samples * 100

# Top 50 genes by percentage, plus their raw unique-sample counts
top_genes_unique = unique_gene_percentage.nlargest(n=50)
top_counts_unique = unique_gene_counts.loc[top_genes_unique.index]

# Bar chart: one bar per gene, gene names as vertical tick labels
plt.figure(figsize=(12, 6))
bars = plt.bar(top_genes_unique.index, top_genes_unique.values)
plt.xticks(
    list(range(top_genes_unique.size)),
    top_genes_unique.index,
    rotation=90,
    ha='center',
)
plt.ylabel('Sample %')
plt.grid(axis='y', linestyle='--', linewidth=0.7, alpha=0.7)  # dashed horizontal guides
plt.ylim(0, top_genes_unique.values.max() + 2)  # head-room above the tallest bar
plt.show()

# Tabulate what was plotted
table_oncogene_df = pd.DataFrame({
    'Gene': top_genes_unique.index,
    'Unique Sample Count': top_counts_unique.to_numpy(),
    'Percentage (%)': top_genes_unique.to_numpy(),
})

print(table_oncogene_df)

In [None]:
# Copy the oncogene table built in the previous cell to the clipboard.
# (table_kinase_df is only created later, in the Kinase section.)
table_oncogene_df.to_clipboard()

---

## TSG

In [None]:
#--- Pathogenic TSG Viz ---#
# Pathogenic variants in genes labelled 'TSG' or 'Both'
tsg_df = patho_only_df.loc[patho_only_df["OncoG_TSG"].isin(['TSG', 'Both'])]
tsg_df.head(5)

In [None]:
# (rows, columns) of the TSG subset, then its number of unique samples
print(tuple(tsg_df.shape))
tsg_df['Sample_Name'].nunique(dropna=True)

In [None]:
# --- Bar Plot 1: Top Genes ---

# Number of distinct samples with a pathogenic variant, per tumor suppressor gene
unique_gene_counts = tsg_df.groupby('Ref.Gene')['Sample_Name'].nunique()

# Denominator for the percentages: unique samples in the whole df cohort.
# Alternative (TSG subset only):
# total_unique_samples = tsg_df['Sample_Name'].nunique()
total_unique_samples = total_unique_samples_in_df

# Share of the cohort affected, per gene
unique_gene_percentage = unique_gene_counts / total_unique_samples * 100

# Top 50 genes by percentage, plus their raw unique-sample counts
top_genes_unique = unique_gene_percentage.nlargest(n=50)
top_counts_unique = unique_gene_counts.loc[top_genes_unique.index]

# Bar chart: one bar per gene, gene names as vertical tick labels
plt.figure(figsize=(12, 6))
bars = plt.bar(top_genes_unique.index, top_genes_unique.values)
plt.xticks(
    list(range(top_genes_unique.size)),
    top_genes_unique.index,
    rotation=90,
    ha='center',
)
plt.ylabel('Sample %')
plt.grid(axis='y', linestyle='--', linewidth=0.7, alpha=0.7)  # dashed horizontal guides
plt.ylim(0, top_genes_unique.values.max() + 5)  # head-room above the tallest bar
plt.show()

# Tabulate what was plotted
table_tsg_df = pd.DataFrame({
    'Gene': top_genes_unique.index,
    'Unique Sample Count': top_counts_unique.to_numpy(),
    'Percentage (%)': top_genes_unique.to_numpy(),
})

print(table_tsg_df)

---

## Kinase

In [None]:
#--- Pathogenic Kinase Viz ---#
# Pathogenic variants in genes flagged as kinases (Kinase == 1)
kinase_df = patho_only_df.loc[patho_only_df["Kinase"] == 1]
kinase_df.head(5)


In [None]:
# --- Bar Plot 1: Top Genes ---

# Number of distinct samples with a pathogenic variant, per kinase gene
unique_gene_counts = kinase_df.groupby('Ref.Gene')['Sample_Name'].nunique()

# Denominator for the percentages: unique samples in the whole df cohort.
# Alternative (kinase subset only):
# total_unique_samples = kinase_df['Sample_Name'].nunique()
total_unique_samples = total_unique_samples_in_df

# Share of the cohort affected, per gene
unique_gene_percentage = unique_gene_counts / total_unique_samples * 100

# Top 50 genes by percentage, plus their raw unique-sample counts
top_genes_unique = unique_gene_percentage.nlargest(n=50)
top_counts_unique = unique_gene_counts.loc[top_genes_unique.index]

# Bar chart: one bar per gene, gene names as vertical tick labels
plt.figure(figsize=(12, 6))
bars = plt.bar(top_genes_unique.index, top_genes_unique.values)
plt.xticks(
    list(range(top_genes_unique.size)),
    top_genes_unique.index,
    rotation=90,
    ha='center',
)
plt.ylabel('Sample %')
plt.grid(axis='y', linestyle='--', linewidth=0.7, alpha=0.7)  # dashed horizontal guides
plt.ylim(0, top_genes_unique.values.max() + 1)  # head-room above the tallest bar
plt.show()

# Tabulate what was plotted
table_kinase_df = pd.DataFrame({
    'Gene': top_genes_unique.index,
    'Unique Sample Count': top_counts_unique.to_numpy(),
    'Percentage (%)': top_genes_unique.to_numpy(),
})

print(table_kinase_df)

---

## Amino Acid mapping

In [None]:
# # Select columns: 'Ref.Gene', 'AAChange.ensGene', 'avsnp150'
# patho_aa_df = patho_only_df.loc[:, ['Ref.Gene', 'avsnp150', 'AAChange.ensGene']]
# # Drop duplicate rows across all columns
# patho_aa_df = patho_aa_df.drop_duplicates()
# # Filter rows based on column: 'AAChange.ensGene' not beginning with .
# patho_aa_df = patho_aa_df[~patho_aa_df['AAChange.ensGene'].str.startswith(".", na=False)]
# len(patho_aa_df)
# # Extract reference and alternate amino acids
# patho_aa_df["Ref_AA"] = patho_aa_df["AAChange.ensGene"].str.extract(r"p\.([A-Z])[0-9]+[A-Z]")[0]
# patho_aa_df["Alt_AA"] = patho_aa_df["AAChange.ensGene"].str.extract(r"p\.[A-Z][0-9]+([A-Z])")[0]
# patho_aa_df.head()
# # Amino acid single-letter to full name mapping
# amino_acid_map = {
#     "A": "Alanine", "R": "Arginine", "N": "Asparagine", "D": "Aspartic acid",
#     "C": "Cysteine", "Q": "Glutamine", "E": "Glutamic acid", "G": "Glycine",
#     "H": "Histidine", "I": "Isoleucine", "L": "Leucine", "K": "Lysine",
#     "M": "Methionine", "F": "Phenylalanine", "P": "Proline", "S": "Serine",
#     "T": "Threonine", "W": "Tryptophan", "Y": "Tyrosine", "V": "Valine"
# }

# # Map single-letter amino acids to full names
# patho_aa_df["Ref_AA_Full"] = patho_aa_df["Ref_AA"].map(amino_acid_map)
# patho_aa_df["Alt_AA_Full"] = patho_aa_df["Alt_AA"].map(amino_acid_map)
# patho_aa_df.head()
# # Amino acid group classification by side chain
# amino_acid_groups = {
#     "Aliphatic Amino Acids with Hydrophobic Side Chain": ["Alanine", "Valine", "Leucine", "Isoleucine", "Methionine"],
#     "Aromatic Amino Acids with Hydrophobic Side Chain": ["Phenylalanine", "Tryptophan", "Tyrosine"],
#     "Amino Acids with Neutral Side Chain": ["Serine", "Threonine", "Cysteine", "Asparagine", "Glutamine"],
#     "Amino Acids with Negative Charged Side Chain (Acidic)": ["Aspartic acid", "Glutamic acid"],
#     "Amino Acids with Positive Charged Side Chain (Basic)": ["Lysine", "Arginine", "Histidine"],
#     "Imino Acid": ["Proline"],
#     "Unique Amino Acid": ["Glycine"]
# }

# # Reverse mapping for group assignment
# group_mapping = {}
# for group, acids in amino_acid_groups.items():
#     for acid in acids:
#         group_mapping[acid] = group

# # Assign groups based on full names
# patho_aa_df["Ref_Group"] = patho_aa_df["Ref_AA_Full"].map(group_mapping)
# patho_aa_df["Alt_Group"] = patho_aa_df["Alt_AA_Full"].map(group_mapping)

# # Determine substitution type
# def substitution_type(ref_group, alt_group):
#     if ref_group == alt_group:
#         return "Intra"
#     return "Inter"

# patho_aa_df["Substitution_Type"] = patho_aa_df.apply(
#     lambda x: substitution_type(x["Ref_Group"], x["Alt_Group"]), axis=1
# )

# # Display the DataFrame
# print(patho_aa_df)


---

## All Genes Cancer Distribution

In [None]:
# Unique samples per short cancer label within the pathogenic subset
(
    patho_only_df.groupby('Cancer_Short')['Sample_Name']
    .nunique()
    .rename('Unique_Sample_Count')
    .reset_index()
)

In [None]:
# Same per-cancer sample counts, stored and copied to the system clipboard
patho_only_cancercounts = (
    patho_only_df.groupby('Cancer_Short')['Sample_Name']
    .nunique()
    .rename('Unique_Sample_Count')
    .reset_index()
)
patho_only_cancercounts.to_clipboard()

In [None]:
# Number of distinct genes with a pathogenic variant, per cancer type
(
    patho_only_df.groupby('Cancer_Short')['Ref.Gene']
    .nunique()
    .rename('Gene_Count')
    .reset_index()
)

In [None]:
# Number of distinct samples per gene, most affected genes first
(
    patho_only_df.groupby('Ref.Gene')['Sample_Name']
    .nunique()
    .rename('Sample_Count')
    .reset_index()
    .sort_values('Sample_Count', ascending=False)
)

In [None]:
# Distinct samples in which each gene carries a pathogenic variant
gene_sample_counts = patho_only_df.groupby("Ref.Gene")["Sample_Name"].nunique()

# Distinct samples per (gene, cancer type) pair ...
gene_cancer_counts = (
    patho_only_df
    .groupby(["Ref.Gene", "Cancer_Short"])["Sample_Name"]
    .nunique()
)
# ... idxmax per gene yields the (gene, cancer) pair with the most samples;
# only the cancer label (last element of the pair) is kept.
most_common_cancer_per_gene = (
    gene_cancer_counts
    .groupby(level=0)
    .idxmax()
    .apply(lambda gene_cancer: gene_cancer[-1])
)

# One row per gene: sample count and its most common cancer type
gene_summary = pd.DataFrame({
    "Sample_Count": gene_sample_counts,
    "Most_Common_Cancer": most_common_cancer_per_gene,
})

print(gene_summary)


In [None]:
# Copy the per-gene summary to the system clipboard
gene_summary.to_clipboard(excel=True, sep=None)

In [None]:
# Count the number of samples each gene is present in
gene_sample_counts = patho_only_df.groupby("Ref.Gene")["Sample_Name"].nunique()

# Find the most common cancer type for each gene: idxmax returns the
# (gene, cancer) pair with the most samples, of which the cancer label is kept
gene_cancer_counts = patho_only_df.groupby(["Ref.Gene", "Cancer_Short"])["Sample_Name"].nunique()
most_common_cancer_per_gene = gene_cancer_counts.groupby(level=0).idxmax().apply(lambda x: x[1])

# Combine results into a dataframe
gene_summary = pd.DataFrame({
    "Sample_Count": gene_sample_counts,
    "Most_Common_Cancer": most_common_cancer_per_gene
})

# Select the top 50 genes by sample count and express each count as a percentage
# of the whole cohort. The denominator is total_unique_samples_in_df (computed in
# the data-loading section) rather than a hard-coded cohort size, so the plot
# stays correct if the sample filtering changes.
top_genes = gene_summary.nlargest(50, "Sample_Count").assign(
    Sample_Percentage=lambda top: (top["Sample_Count"] / total_unique_samples_in_df) * 100
)

# Increase figure size for better readability
plt.figure(figsize=(12, 6))
bars = plt.bar(top_genes.index, top_genes["Sample_Percentage"], color="skyblue")

# Annotate bars with most common cancer type (vertical label just above each bar)
for bar, cancer in zip(bars, top_genes["Most_Common_Cancer"]):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() +1, cancer, ha="center", va="bottom", fontsize=11, rotation=90, fontweight='bold')

plt.ylim(0, max(top_genes['Sample_Percentage']) + 5)  # Adjust y-limits to accommodate labels

# Formatting
plt.xlabel("Gene")
plt.ylabel("Percentage of Samples")
#plt.title("Top 50 Genes by Sample Percentage with Most Common Cancer Type")
plt.xticks(rotation=90, fontsize=10)  # Rotate x-axis labels for better readability
plt.yticks(fontsize=10)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.tight_layout()

# Show plot
plt.show()


In [None]:
# Unique samples per cancer type across the whole cohort, largest first
cancer_sample_counts = (
    df.groupby("Cancer_Short")["Sample_Name"]
    .nunique()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(14, 4))

# Cycle through the tab20 palette so that neighbouring bars get different colours
colors = plt.cm.tab20.colors
bars = ax.bar(
    cancer_sample_counts.index,
    cancer_sample_counts.values,
    color=[colors[i % len(colors)] for i in range(len(cancer_sample_counts))],
    edgecolor='black',
)

# Write the sample count directly above each bar
for bar, count in zip(bars, cancer_sample_counts.values):
    ax.text(
        bar.get_x() + bar.get_width()/2, bar.get_height(), str(count),
        ha="center", va="bottom", fontsize=9, fontweight="bold",
    )

# Axis labels, ticks and grid
ax.set_xlabel("Cancer Type", fontsize=12, fontweight="bold")
ax.set_ylabel("# of Samples", fontsize=12, fontweight="bold")
ax.set_xticks(list(range(len(cancer_sample_counts.index))))
ax.set_xticklabels(cancer_sample_counts.index, rotation=90, fontsize=9)
ax.grid(axis="y", linestyle="--", alpha=0.7)
ax.set_axisbelow(True)  # draw the grid behind the bars

# Leave 10% head-room above the tallest bar for its count label
ax.set_ylim(0, cancer_sample_counts.values.max() * 1.1)

plt.tight_layout()
plt.show()


In [None]:
# Unique samples per cancer type within the pathogenic subset, largest first
cancer_sample_counts = (
    patho_only_df.groupby("Cancer_Short")["Sample_Name"]
    .nunique()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(15, 5))

# Cycle through the tab20 palette so that neighbouring bars get different colours
colors = plt.cm.tab20.colors
bars = ax.bar(
    cancer_sample_counts.index,
    cancer_sample_counts.values,
    color=[colors[i % len(colors)] for i in range(len(cancer_sample_counts))],
    edgecolor='black',
)

# Write the sample count directly above each bar
for bar, count in zip(bars, cancer_sample_counts.values):
    ax.text(
        bar.get_x() + bar.get_width()/2, bar.get_height(), str(count),
        ha="center", va="bottom", fontsize=9, fontweight="bold",
    )

# Axis labels, ticks and grid
ax.set_xlabel("Cancer Type", fontsize=12, fontweight="bold")
ax.set_ylabel("# of Samples", fontsize=12, fontweight="bold")
ax.set_xticks(list(range(len(cancer_sample_counts.index))))
ax.set_xticklabels(cancer_sample_counts.index, rotation=90, fontsize=9)
ax.grid(axis="y", linestyle="--", alpha=0.7)
ax.set_axisbelow(True)  # draw the grid behind the bars

# Leave 10% head-room above the tallest bar for its count label
ax.set_ylim(0, cancer_sample_counts.values.max() * 1.1)

plt.tight_layout()
plt.show()


---

In [None]:
# --- Per-cancer-type summary of the pathogenic subset: samples, variants, genes and their ratios --- #

# Count the number of unique samples per cancer type
unique_samples_per_cancer = patho_only_df.groupby('Cancer_Short')['Sample_Name'].nunique()
# Count the number of variant rows ("mutations") per cancer type
mutation_counts = patho_only_df['Cancer_Short'].value_counts()
# Count the number of unique genes per cancer type
mutant_gene_counts = patho_only_df.groupby('Cancer_Short')['Ref.Gene'].nunique()


# Pairwise ratios between genes, variant rows and samples, each scaled by 100
gene_by_mutation = (mutant_gene_counts / mutation_counts) * 100
mutation_by_gene = (mutation_counts / mutant_gene_counts) * 100
gene_by_sample = (mutant_gene_counts / unique_samples_per_cancer) * 100
mutation_by_sample = (mutation_counts / unique_samples_per_cancer ) * 100

# Identify the gene with the most variant rows for each cancer type
most_mutated_gene = patho_only_df.groupby('Cancer_Short')['Ref.Gene'].agg(lambda x: x.value_counts().idxmax())

# Combine the results into a single DataFrame, one row per cancer type.
# The rename covers the case where the combined index comes out unnamed and
# reset_index() therefore produces a column called 'index'.
mutation_summary = pd.DataFrame({
    'Total Unique Samples': unique_samples_per_cancer,
    'Total Mutations': mutation_counts,
    'Unique Mutant Genes': mutant_gene_counts,
    'Gene/Mutations': gene_by_mutation,
    'Mutations/Gene': mutation_by_gene,
    'Genes/Samples': gene_by_sample,
    'Mutations/Samples': mutation_by_sample,
    'Most Mutated Gene': most_mutated_gene
}).reset_index().rename(columns={'index': 'Cancer_Short'})

mutation_summary.head(n=50)


In [None]:
# --- Per cancer type: cohort size vs. samples carrying a pathogenic variant --- #

# Unique samples per cancer type in the whole cohort
unique_samples_per_cancer_df = df.groupby('Cancer_Short')['Sample_Name'].nunique()
# Unique samples per cancer type in the pathogenic subset
unique_samples_per_cancer_patho_df = patho_only_df.groupby('Cancer_Short')['Sample_Name'].nunique()

# Gene with the most pathogenic variant rows in each cancer type
most_mutated_patho_gene = (
    patho_only_df
    .groupby('Cancer_Short')['Ref.Gene']
    .agg(lambda genes: genes.value_counts().idxmax())
)

# Assemble one row per cancer type; the rename covers the case where
# reset_index() produces a column called 'index'
summary_columns = {
    'Total Unique Samples': unique_samples_per_cancer_df,
    'Total Unique Samples having Pathogenic Mutation': unique_samples_per_cancer_patho_df,
    'Most Mutated Gene': most_mutated_patho_gene,
}
mutation_summary = (
    pd.DataFrame(summary_columns)
    .reset_index()
    .rename(columns={'index': 'Cancer_Short'})
)

mutation_summary.head(50)

In [None]:
# Copy the per-cancer summary to the system clipboard
mutation_summary.to_clipboard(excel=True, sep=None)

In [None]:
# Horizontal bar chart: share of each cancer type's samples that carry a pathogenic
# variant, with the most mutated gene written next to each bar.
# df_updated is an alias of mutation_summary (not a copy), so the new column is
# added to mutation_summary as well.
df_updated = mutation_summary

# Percentage of the cancer type's samples that have a pathogenic variant
df_updated["Percentage Mutated"] = (
    df_updated["Total Unique Samples having Pathogenic Mutation"]
    .div(df_updated["Total Unique Samples"])
    .mul(100)
)

plt.figure(figsize=(12, 6))
bars = plt.barh(df_updated["Cancer_Short"], df_updated["Percentage Mutated"], color="steelblue")

# Gene label placed 1 unit to the right of the bar end, vertically centred on the bar
for bar, gene in zip(bars, df_updated["Most Mutated Gene"]):
    label_x = bar.get_width() + 1
    label_y = bar.get_y() + bar.get_height()/2
    plt.text(label_x, label_y, gene, va='center')

plt.xlabel("Percentage of Samples Mutated")
plt.ylabel("Cancer Type")
plt.grid(axis='x', linestyle='--', alpha=0.4)  # vertical guide lines
plt.gca().invert_yaxis()  # first row of the table at the top

plt.show()


In [None]:
# # Step 1: Count the number of samples per cancer type
# total_samples_per_cancer = patho_only_df.groupby("Cancer")["Sample_Name"].nunique().reset_index()
# total_samples_per_cancer.columns = ["Cancer", "Total_Samples"]

# # Step 2: Count mutations per gene in each cancer type
# mutation_counts = patho_only_df.groupby(["Cancer", "Ref.Gene"])["Sample_Name"].nunique().reset_index()
# mutation_counts.columns = ["Cancer", "Gene", "Mutated_Samples"]

# # Step 3: Merge to get total sample count per cancer type
# mutation_freq = mutation_counts.merge(total_samples_per_cancer, on="Cancer")

# # Step 4: Calculate mutation frequency
# mutation_freq["Mutation_Frequency"] = (mutation_freq["Mutated_Samples"] / mutation_freq["Total_Samples"]) * 100

# # Step 5: Plot the mutation frequencies
# plt.figure(figsize=(20, 6))
# sns.stripplot(data=mutation_freq, x="Cancer", y="Mutation_Frequency", hue="Gene", jitter=True, alpha=0.7)

# # Formatting
# plt.xticks(rotation=90)
# plt.xlabel("Cancer Type")
# plt.ylabel("% of Samples with a Mutant Gene")
# plt.title("Mutations in Cancer Type")
# plt.legend([], [], frameon=False)  # Hide legend for clarity
# # plt.legend(title="Gene", bbox_to_anchor=(1.05, 1), loc='upper left')

# # Show the plot
# plt.show()


In [None]:
# Count the number of unique samples per cancer type
unique_samples_per_cancer = patho_only_df.groupby('Cancer')['Sample_Name'].nunique()

# Count the number of unique mutant genes per cancer type
mutant_gene_counts = patho_only_df.groupby('Cancer')['Ref.Gene'].nunique()

# Identify the gene with the most variant rows for each cancer type
most_mutated_gene = patho_only_df.groupby('Cancer')['Ref.Gene'].agg(lambda x: x.value_counts().idxmax())

# Create a summary DataFrame, one row per cancer type
mutation_summary_corrected = pd.DataFrame({
    'Total Unique Samples': unique_samples_per_cancer,
    'Unique Mutant Genes': mutant_gene_counts,
    'Most Mutated Gene': most_mutated_gene
}).reset_index().rename(columns={'index': 'Cancer'})

# Plot: bar height is the number of unique mutant genes in that cancer type
plt.figure(figsize=(12, 6))
bars = plt.bar(mutation_summary_corrected['Cancer'], mutation_summary_corrected['Unique Mutant Genes'])

# Add labels for the most mutated gene on top of each bar
for bar, gene in zip(bars, mutation_summary_corrected['Most Mutated Gene']):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), gene, ha='center', va='bottom', fontsize=10, fontweight='bold')

# Customize the plot. The y-axis and title name the quantity actually plotted
# (unique mutant genes), not the number of mutations.
plt.xlabel('Cancer Type')
plt.ylabel('Number of Unique Mutant Genes')
plt.title('Number of Unique Mutant Genes per Cancer Type with Top Mutated Gene')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Show the plot
plt.show()

---

In [None]:
# Count unique samples per gene and cancer type.
# The value column is labelled "Unique Mutation Count" but holds the number of
# distinct samples in which the gene carries a pathogenic variant.
gene_mutation_counts = patho_only_df.groupby(["Ref.Gene", "Cancer_Short"])["Sample_Name"].nunique().reset_index()
gene_mutation_counts.columns = ["Gene", "Cancer_Short", "Unique Mutation Count"]

# Pivot the data for heatmap plotting: genes as rows, cancer types as columns
# (gene/cancer pairs with no samples are NaN)
heatmap_data = gene_mutation_counts.pivot(index="Gene", columns="Cancer_Short", values="Unique Mutation Count")

# Select the 30 genes with the highest sample counts summed over all cancer types
top_genes = heatmap_data.sum(axis=1).sort_values(ascending=False).head(30).index
heatmap_data_filtered = heatmap_data.loc[top_genes]

# Plot the heatmap with the count written in each cell
plt.figure(figsize=(15, 8))
sns.heatmap(heatmap_data_filtered, cmap="Blues", linewidths=0.5, annot=True, fmt=".0f")

# Set labels and title
plt.xlabel("Cancer Type", fontsize=12)
plt.ylabel("Pathogenic Genes", fontsize=12)
plt.title("Top 30 Mutated Genes Across Cancer Types", fontsize=14)

# Rotate x-axis labels for readability
plt.xticks(rotation=90)
plt.yticks(rotation=0)

# Display the heatmap (the original called plt.show() twice; the second call had nothing to draw)
plt.show()

In [None]:
# Count unique samples per gene and cancer type
gene_mutation_counts = patho_only_df.groupby(["Ref.Gene", "Cancer_Short"])["Sample_Name"].nunique().reset_index()
gene_mutation_counts.columns = ["Gene", "Cancer_Short", "Unique Mutation Count"]

# Count total unique samples per cancer type
total_samples_per_cancer_df = patho_only_df.groupby("Cancer_Short")["Sample_Name"].nunique().reset_index()
total_samples_per_cancer_df.columns = ["Cancer_Short", "Total Unique Samples"]

# Merge the mutation counts with total sample counts
gene_mutation_percentage_df = gene_mutation_counts.merge(total_samples_per_cancer_df, on="Cancer_Short")

# Calculate the mutation percentage per gene and cancer type
gene_mutation_percentage_df["Mutation Percentage"] = (
    gene_mutation_percentage_df["Unique Mutation Count"] / gene_mutation_percentage_df["Total Unique Samples"]
) * 100

# Pivot the data for heatmap plotting
heatmap_data = gene_mutation_percentage_df.pivot(index="Gene", columns="Cancer_Short", values="Mutation Percentage")

# Select the top 30 genes with the highest mutation counts
top_genes = heatmap_data.sum(axis=1).sort_values(ascending=False).head(30).index
heatmap_data_filtered = heatmap_data.loc[top_genes]

# Plot the improved heatmap
plt.figure(figsize=(15, 8))
sns.heatmap(heatmap_data_filtered, cmap="Blues", linewidths=0.5, annot=True, fmt=".0f")

# Set labels and title
plt.xlabel("Cancer Type", fontsize=12)
plt.ylabel("Pathogenic Genes", fontsize=12)
#plt.title("Top 30 Pathogenic Genes Across Cancer Types with counts as percentage", fontsize=14)

# Rotate x-axis labels for readability
plt.xticks(rotation=90)
plt.yticks(rotation=0)

# Display the refined heatmap
plt.show()

# Display the heatmap
plt.show()


In [None]:
# Show the table behind the heatmap: the rows of heatmap_data selected by top_genes
heatmap_data_filtered

---

In [None]:
# --- For plotting the ExonicFunc.ensGene distribution per Cancer type --- #

# Filter out rows where ExonicFunc.ensGene is "."
# The mask is built from patho_only_df itself: a boolean mask taken from a different
# frame is not guaranteed to share its index, so it can raise or select the wrong rows.
filtered_df = patho_only_df[patho_only_df["ExonicFunc.ensGene"] != "."]

# Count occurrences of each ExonicFunc.ensGene per Cancer type
# (rows: Cancer_Short, columns: ExonicFunc.ensGene, missing combinations filled with 0)
filtered_exonic_func_counts = filtered_df.groupby(["Cancer_Short", "ExonicFunc.ensGene"]).size().unstack(fill_value=0)

# Increase figure size for better label alignment
fig, ax = plt.subplots(figsize=(12, 6))

# Define color mapping using Set2 qualitative colormap.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in 3.7 and removed in 3.9;
# it accepts the same (name, lut) arguments and returns the same resampled colormap.
set2_colors = plt.get_cmap("Set2", len(filtered_exonic_func_counts.columns)).colors
color_map = {func: set2_colors[i % len(set2_colors)] for i, func in enumerate(filtered_exonic_func_counts.columns)}

# Plot the stacked bar chart with Set2 colormap
filtered_exonic_func_counts.plot(kind="bar", stacked=True, color=[color_map[func] for func in filtered_exonic_func_counts.columns], ax=ax)

# Improve layout and readability
plt.xlabel("Cancer Type")
plt.ylabel("Number of Variants")
#plt.title("Distribution of ExonicFunc.ensGene per Cancer Type (Excluding '.')")
plt.xticks(rotation=45, ha='right')  # Align x-axis labels properly
plt.legend(title="Variant Consequences", bbox_to_anchor=(1, 1))

# Add grid lines
plt.grid(axis='y', linestyle='--', alpha=0.4)

# Set y-axis ticks at intervals of 50, padded so the tallest stacked bar stays below the last tick
ax.set_yticks(np.arange(0, filtered_exonic_func_counts.sum(axis=1).max() + 50, 50))

# Fit the layout only after legend and ticks are final, so it accounts for them
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Preview the first 20 rows (one row per Cancer_Short) of the consequence count table
filtered_exonic_func_counts.head(20)


In [None]:
# Work on the consequence columns only. Excluding any "Total" column left over from a
# previous run keeps this cell idempotent: otherwise re-running it would add the old
# total into the new one and halve every percentage.
exonic_only_counts = filtered_exonic_func_counts.drop(columns=["Total"], errors="ignore")

# Calculate the total for each Cancer type (the "Total" column is kept on the counts table)
filtered_exonic_func_counts["Total"] = exonic_only_counts.sum(axis=1)

# Compute the percentage for each exonic term, row-wise against that cancer type's total.
# The result never contains a "Total" column, so nothing needs to be dropped afterwards.
percentage_filtered_exonic_func_counts = exonic_only_counts.div(filtered_exonic_func_counts["Total"], axis=0) * 100

In [None]:
# --- Stacked bar chart of the percentage of each ExonicFunc.ensGene term per Cancer type --- #

# Increase figure size for better label alignment
fig, ax = plt.subplots(figsize=(12, 6))

# Define color mapping using Set2 qualitative colormap.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in 3.7 and removed in 3.9;
# it accepts the same (name, lut) arguments and returns the same resampled colormap.
set2_colors = plt.get_cmap("Set2", len(percentage_filtered_exonic_func_counts.columns)).colors
color_map = {func: set2_colors[i % len(set2_colors)] for i, func in enumerate(percentage_filtered_exonic_func_counts.columns)}

# Plot the stacked bar chart with Set2 colormap
percentage_filtered_exonic_func_counts.plot(kind="bar", stacked=True, color=[color_map[func] for func in percentage_filtered_exonic_func_counts.columns], ax=ax)

# Improve layout and readability
plt.xlabel("Cancer Type")
plt.ylabel("Number of Variants %")
#plt.title("Unique ExonicFunc.ensGene Counts by Pathogenic Gene per Cancer Type")
plt.xticks(rotation=45, ha='right')  # Align x-axis labels properly
plt.legend(title="Variant Consequences", bbox_to_anchor=(1, 1))

# Set y-axis ticks at intervals of 5 from 0 to 100 inclusive.
# Each stacked bar sums to 100 %, and np.arange excludes its stop value, so the stop
# is padded by one step to keep the 100 % tick.
ax.set_yticks(np.arange(0, 100 + 5, 5))

# Add grid lines
plt.grid(axis='y', linestyle='--', alpha=0.4)

# Fit the layout only after legend and ticks are final, so it accounts for them
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Build the de-duplicated Sample -> Cohort table from 'Sample_Name' and 'Cancer',
# print it, and write it out as a tab-separated file without the index.
patho_samp = (
    patho_only_df[['Sample_Name', 'Cancer']]
    .drop_duplicates()
    .rename(columns={'Sample_Name': 'Sample', 'Cancer': 'Cohort'})
)
print(patho_samp)
patho_samp.to_csv(
    r"H:\My Drive\Pathogenic_Landscape\data\absolute\clinical_research_filtered_combined\dataframes\patho_samp.tsv",
    sep="\t",
    index=False,
)

In [None]:
print(patho_only_df.shape)