In [None]:
import os
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import pandas as pd
import time
import scipy as sp
import pickle
import musical
# %matplotlib inline
# %load_ext autoreload
# %autoreload 2

import matplotlib.font_manager as fm

# Path to your .ttf file
font_path = "/gpfs/data/courses/aio2025/yb2612/Arial.ttf"
font_prop = fm.FontProperties(fname=font_path)

# Register the font with matplotlib
fm.fontManager.addfont(font_path)

# Set it globally via font name
plt.rcParams['font.family'] = font_prop.get_name()

available_fonts = set(f.name for f in fm.fontManager.ttflist)
print(sorted(available_fonts))

Matplotlib created a temporary cache directory at /tmp/matplotlib-6u5hweus because the default path (/gpfs/home/yb2612/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


# Load X

In [5]:
# pangyn data
pangyn_X = pd.read_csv('/gpfs/data/courses/aio2025/yb2612/data/musical/pangyn_X.csv', index_col=0)
print("mutations, samples:", X.shape)
pangyn_X.head()

mutations, samples: (96, 551)


Unnamed: 0,TCGA-A2-A25D,TCGA-BH-A201,TCGA-EW-A1OV,TCGA-AC-A8OQ,TCGA-A2-A0T4,TCGA-D8-A1JU,TCGA-AR-A2LO,TCGA-A2-A0YL,TCGA-C8-A1HE,TCGA-B6-A1KF,...,TCGA-13-0760.1,TCGA-13-1496,TCGA-36-2545,TCGA-24-1564,TCGA-24-2261,TCGA-20-0996.1,TCGA-23-2077,TCGA-13-0889,TCGA-13-1482,TCGA-10-0926
caaa,129,53,129,134,72,86,100,181,73,154,...,80,108,110,61,249,203,96,73,83,150
caac,129,42,60,69,39,76,141,114,47,72,...,44,68,72,69,132,103,112,62,46,46
caag,25,18,25,29,24,42,8,22,35,21,...,22,28,49,19,70,48,25,34,22,43
caat,67,34,49,90,39,55,48,109,44,48,...,48,69,74,37,127,80,45,44,44,77
caca,100,52,131,87,69,43,84,194,68,73,...,84,80,114,75,158,131,65,70,55,92


In [31]:
# split by cancer type
metadata = pd.read_csv("/gpfs/home/yb2612/yb2612/data/pancancer_metadata.tsv", sep="\t", index_col=0)
metadata["barcode_short"] = metadata.index.str[:12]
metadata

Unnamed: 0_level_0,sample_type,primary_disease,full_disease,disease,barcode_short
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TCGA-D3-A1QA-07,Additional Metastatic,skin cutaneous melanoma,Skin Cutaneous Melanoma,SKCM,TCGA-D3-A1QA
TCGA-DE-A4MD-06,Metastatic,thyroid carcinoma,Thyroid carcinoma,THCA,TCGA-DE-A4MD
TCGA-J8-A3O2-06,Metastatic,thyroid carcinoma,Thyroid carcinoma,THCA,TCGA-J8-A3O2
TCGA-J8-A3YH-06,Metastatic,thyroid carcinoma,Thyroid carcinoma,THCA,TCGA-J8-A3YH
TCGA-EM-A2P1-06,Metastatic,thyroid carcinoma,Thyroid carcinoma,THCA,TCGA-EM-A2P1
...,...,...,...,...,...
TCGA-17-Z059-01,,lung adenocarcinoma,Lung adenocarcinoma,LUAD,TCGA-17-Z059
TCGA-17-Z060-01,,lung adenocarcinoma,Lung adenocarcinoma,LUAD,TCGA-17-Z060
TCGA-17-Z061-01,,lung adenocarcinoma,Lung adenocarcinoma,LUAD,TCGA-17-Z061
TCGA-17-Z062-01,,lung adenocarcinoma,Lung adenocarcinoma,LUAD,TCGA-17-Z062


In [7]:
# dict mapping barcode to disease
barcode_to_disease = metadata.set_index("barcode_short")["disease"].to_dict()

In [36]:
# first 12 chars of pangyn samples
# pangyn_X.columns = pangyn_X.columns.str[:12]
pangyn_samples = pangyn_X.columns.tolist()
print("pangyn samples:", len(pangyn_samples))

pangyn samples: 2416


In [39]:
# Create a reverse mapping list to match by prefix
def get_disease_with_partial_match(bc, barcode_dict):
    for key in barcode_dict:
        if bc.startswith(key):  # partial match by prefix
            return barcode_dict[key]
    return "Unknown"

# Apply the partial match function
pangyn_disease_df = pd.DataFrame({
    "barcode_short": pangyn_samples,
    "disease": [get_disease_with_partial_match(bc, barcode_to_disease) for bc in pangyn_samples]
})

# Show counts
print(pangyn_disease_df["disease"].value_counts())

# Set index for downstream usage
pangyn_disease_df.set_index("barcode_short", inplace=True)
pangyn_disease_df

disease
BRCA    1072
UCEC     551
OV       486
CESC     307
Name: count, dtype: int64


Unnamed: 0_level_0,disease
barcode_short,Unnamed: 1_level_1
TCGA-A2-A25D,BRCA
TCGA-BH-A201,BRCA
TCGA-EW-A1OV,BRCA
TCGA-AC-A8OQ,BRCA
TCGA-A2-A0T4,BRCA
...,...
TCGA-20-0996.1,OV
TCGA-23-2077,OV
TCGA-13-0889,OV
TCGA-13-1482,OV


In [42]:
# pivot pangyn_X

pangyn_X_transposed = pangyn_X.T
pangyn_X_transposed_disease = pangyn_X_transposed.merge(pangyn_disease_df, left_index=True, right_index=True, how='left')
pangyn_X_transposed_disease

Unnamed: 0,caaa,caac,caag,caat,caca,cacc,cacg,cact,caga,cagc,...,tgct,tgga,tggc,tggg,tggt,tgta,tgtc,tgtg,tgtt,disease
TCGA-A2-A25D,129,129,25,67,100,50,24,30,105,85,...,10,56,38,197,164,9,14,16,14,BRCA
TCGA-BH-A201,53,42,18,34,52,42,15,35,49,47,...,11,13,14,56,19,2,11,7,11,BRCA
TCGA-EW-A1OV,129,60,25,49,131,98,49,112,111,59,...,4,21,16,114,31,14,13,17,23,BRCA
TCGA-AC-A8OQ,134,69,29,90,87,62,41,81,127,93,...,12,26,22,54,13,8,12,23,15,BRCA
TCGA-A2-A0T4,72,39,24,39,69,53,27,43,49,36,...,19,38,37,146,29,21,32,43,24,BRCA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-20-0996.1,203,103,48,80,131,104,35,92,155,138,...,13,11,11,22,17,13,12,21,21,OV
TCGA-23-2077,96,112,25,45,65,56,24,43,65,74,...,14,32,31,160,113,5,14,17,16,OV
TCGA-13-0889,73,62,34,44,70,55,29,48,61,51,...,25,27,29,64,30,20,28,35,44,OV
TCGA-13-1482,83,46,22,44,55,60,27,71,62,48,...,14,41,33,103,56,12,19,21,30,OV


In [43]:
# group df by disease
grouped_by_disease = pangyn_X_transposed_disease.groupby('disease')

# dict of dfs
disease_dfs = {disease: group.drop(columns='disease') for disease, group in grouped_by_disease}

brca_X = disease_dfs.get('BRCA').T
cesc_X = disease_dfs.get('CESC').T
ucec_X = disease_dfs.get('UCEC').T
ov_X = disease_dfs.get('OV').T

data_dir = "/gpfs/data/courses/aio2025/yb2612/data/musical"

brca_X.to_csv(f"{data_dir}/brca_X.csv")
cesc_X.to_csv(f"{data_dir}/cesc_X.csv")
ucec_X.to_csv(f"{data_dir}/ucec_X.csv")
ov_X.to_csv(f"{data_dir}/ov_X.csv")

brca_X

Unnamed: 0,TCGA-A2-A25D,TCGA-BH-A201,TCGA-EW-A1OV,TCGA-AC-A8OQ,TCGA-A2-A0T4,TCGA-D8-A1JU,TCGA-AR-A2LO,TCGA-A2-A0YL,TCGA-C8-A1HE,TCGA-B6-A1KF,...,TCGA-C8-A12W,TCGA-A7-A26J.1,TCGA-BH-A18V,TCGA-BH-A18V.1,TCGA-A7-A13G.1,TCGA-AN-A0FS,TCGA-A7-A26J.2,TCGA-E2-A1IU,TCGA-BH-A42T,TCGA-A2-A04W
caaa,129,53,129,134,72,86,100,181,73,154,...,131,92,66,193,98,86,399,137,69,151
caac,129,42,60,69,39,76,141,114,47,72,...,65,51,45,107,56,36,293,148,51,71
caag,25,18,25,29,24,42,8,22,35,21,...,47,29,24,73,39,15,110,20,29,44
caat,67,34,49,90,39,55,48,109,44,48,...,82,47,40,114,45,43,229,63,50,71
caca,100,52,131,87,69,43,84,194,68,73,...,136,51,63,666,101,66,244,77,48,87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tggt,164,19,31,13,29,130,155,187,56,100,...,34,21,14,19,12,27,380,204,55,19
tgta,9,2,14,8,21,37,6,5,4,12,...,18,14,6,20,17,12,80,8,28,13
tgtc,14,11,13,12,32,30,13,11,10,14,...,29,9,16,26,13,20,99,14,29,15
tgtg,16,7,17,23,43,31,23,24,19,24,...,50,18,12,15,31,27,135,24,32,20


# Check X

In [2]:
import pandas as pd
cesc_X = pd.read_csv("/gpfs/data/courses/aio2025/yb2612/data/musical/cesc_X_converted.csv", index_col=0)
ucec_X = pd.read_csv("/gpfs/data/courses/aio2025/yb2612/data/musical/ucec_X_converted.csv", index_col=0)
ov_X = pd.read_csv("/gpfs/data/courses/aio2025/yb2612/data/musical/ov_X_converted.csv", index_col=0)
brca_X = pd.read_csv("/gpfs/data/courses/aio2025/yb2612/data/musical/brca_X_converted.csv", index_col=0)

In [5]:
def has_all_zero_column(df):
    return (df == 0).all().any()

print(has_all_zero_column(cesc_X),
      has_all_zero_column(ucec_X),
      has_all_zero_column(ov_X),
      has_all_zero_column(brca_X)
     )

False False False False


# Converting to A[C>A]A format

In [None]:
# Convert 4-letter context (like "caaa") to trinucleotide format (like "A[C>A]A")
def to_trinuc(row_label):
    base_map = {'a': 'A', 'c': 'C', 'g': 'G', 't': 'T'}
    ref = base_map[row_label[0]]
    alt = base_map[row_label[1]]
    flanking = (base_map[row_label[2]], base_map[row_label[3]])
    return f"{flanking[0]}[{ref}>{alt}]{flanking[1]}"

In [None]:
X = pd.read_csv('/gpfs/data/courses/aio2025/yb2612/data/musical/cesc_X.csv', index_col=0)
X.index = X.index.str.lower().map(to_trinuc)
X.to_csv('/gpfs/data/courses/aio2025/yb2612/data/musical/cesc_X_converted.csv', index=True)

X = pd.read_csv('/gpfs/data/courses/aio2025/yb2612/data/musical/ucec_X.csv', index_col=0)
X.index = X.index.str.lower().map(to_trinuc)
X.to_csv('/gpfs/data/courses/aio2025/yb2612/data/musical/ucec_X_converted.csv', index=True)

X = pd.read_csv('/gpfs/data/courses/aio2025/yb2612/data/musical/ov_X.csv', index_col=0)
X.index = X.index.str.lower().map(to_trinuc)
X.to_csv('/gpfs/data/courses/aio2025/yb2612/data/musical/ov_X_converted.csv', index=True)

X = pd.read_csv('/gpfs/data/courses/aio2025/yb2612/data/musical/brca_X.csv', index_col=0)
X.index = X.index.str.lower().map(to_trinuc)
X.to_csv('/gpfs/data/courses/aio2025/yb2612/data/musical/brca_X_converted.csv', index=True)