In [1]:
import os
import json
import pandas as pdo
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
import scanpy as sc
import pickle
import sys

# Ensure the parent directory is in the system path
sys.path.append('/home/minhang/mds_project/sc_classification')
from utils.experiment_manager import ExperimentManager
from utils.experiment_analysis import ExperimentAnalyzer

# surpress warnings
import warnings
warnings.filterwarnings("ignore")



In [2]:
experiment_id = "20250714_205422_fa_100_random_6dbbde08"
experiments_dir = '/home/minhang/mds_project/sc_classification/experiments/'

experiment_manager = ExperimentManager(experiments_dir)
analyzer = ExperimentAnalyzer(experiment_manager)

In [7]:
try:
    experiment = experiment_manager.load_experiment(experiment_id)
    print(f"Loaded experiment: {experiment_id}")
    print(f'DR method: {experiment.config.get("dimension_reduction.method")}')
    print(f'N components: {experiment.config.get("dimension_reduction.n_components")}')
    print(f'Downsampling method: {experiment.config.get("downsampling.method")}')
    print(f'Downsampling rate: {experiment.config.get("downsampling.rate")}')
    print(f'Downsampling seed: {experiment.config.get("downsampling.seed")}')
    print(f'Downsampling random state: {experiment.config.get("downsampling.random_state")}')
    print(f'Downsampling n_jobs: {experiment.config.get("downsampling.n_jobs")}')
except Exception as e:
    print(f"Error loading experiment {experiment_id}: {e}")
    sys.exit(1)

Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Loaded experiment: 20250714_205422_fa_100_random_6dbbde08
DR method: fa
N components: 100
Downsampling method: random
Downsampling rate: None
Downsampling seed: None
Downsampling random state: None
Downsampling n_jobs: None


### Generate unsupervised diagnosis plots (factor sum of square loading and gene communality)

In [4]:
analyzer.generate_unsupervised_fa_report(experiment_id=experiment_id)

--- Generating Unsupervised FA Report for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Saved SS Loadings plot to: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/unsupervised_dr_analysis/ss_loadings_per_factor.png
Saved Communality plot to: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/unsupervised_dr_analysis/gene_communality_distribution.png


In [3]:
# create a list of factors to compare [X_fa_1, X_fa_2, ..., X_fa_100]
factors_to_compare = [f'X_fa_{i}' for i in range(1, 101)]
print("\n--- EDA: Comparing Factor Loading Distributions ---")
analyzer.plot_factor_loading_distributions(
    experiment_id=experiment_id,
    factors_to_compare=factors_to_compare
)


--- EDA: Comparing Factor Loading Distributions ---
--- Plotting Factor Loading Distributions for 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Saved factor loading distribution plot to: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/unsupervised_dr_analysis/factor_loading_distributions.png


#### Try to understand if gseapy prerank is scale invariant

In [4]:
# Pick a low-variance factor to inspect
factor_to_debug = 'X_fa_69'

# === Run GSEA on ORIGINAL Loadings with debug ===
print("\nRUNNING GSEA ON ORIGINAL (WITH DEBUG)")
analyzer.run_gsea_on_factors(
    experiment_id=experiment_id,
    rescale_loadings=False,
    debug_factor=factor_to_debug
)

# === Run GSEA on RESCALED Loadings with debug ===
print("\nRUNNING GSEA ON RESCALED (WITH DEBUG)")
analyzer.run_gsea_on_factors(
    experiment_id=experiment_id,
    rescale_loadings=True,
    debug_factor=factor_to_debug
)


RUNNING GSEA ON ORIGINAL (WITH DEBUG)
--- Running GSEA on All Factors (Original Loadings) for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
  >>>> DEBUG: Saved ranked list for X_fa_69 to /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/factor_interpretation/gsea_on_factors_original/DEBUG_RANKED_LIST_X_fa_69_Original.csv
  Running GSEA on factor: X_fa_1
  Running GSEA on factor: X_fa_2
  Running GSEA on factor: X_fa_3
  Running GSEA on factor: X_fa_4
  Running GSEA on factor: X_fa_5
  Running GSEA on factor: X_fa_6
  Running GSEA on factor: X_fa_7
  Running GSEA on factor: X_fa_8
  Running GSEA on factor: X_fa_9
  Running GSEA on factor: X_fa_10
  Running GSEA on factor: X_fa_11
  Running GSEA on factor: X_fa_12
  Running GSEA on factor: X_fa_13
  Running GSEA on factor: X_fa_14
  Running GSEA on factor: 

### GSEA on all factors (Rescaled V.S. non-rescaled)

In [None]:
print("\n" + "="*80)
print("RUNNING GSEA ON ORIGINAL, UN-RESCALED FACTOR LOADINGS")
print("="*80)
analyzer.run_gsea_on_factors(
    experiment_id=experiment_id,
    rescale_loadings=False  # This is the default, baseline strategy
)

# === Run GSEA on RESCALED Loadings ===
print("\n" + "="*80)
print("RUNNING GSEA ON RESCALED FACTOR LOADINGS")
print("="*80)
analyzer.run_gsea_on_factors(
    experiment_id=experiment_id,
    rescale_loadings=True   # This activates the new, rescaled strategy
)

print("\nComparison analysis complete.")


RUNNING GSEA ON ORIGINAL, UN-RESCALED FACTOR LOADINGS
--- Running GSEA on All Factors (Original Loadings) for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
  Running GSEA on factor: X_fa_1
  Running GSEA on factor: X_fa_2
  Running GSEA on factor: X_fa_3
  Running GSEA on factor: X_fa_4
  Running GSEA on factor: X_fa_5
  Running GSEA on factor: X_fa_6
  Running GSEA on factor: X_fa_7
  Running GSEA on factor: X_fa_8
  Running GSEA on factor: X_fa_9
  Running GSEA on factor: X_fa_10
  Running GSEA on factor: X_fa_11
  Running GSEA on factor: X_fa_12
  Running GSEA on factor: X_fa_13
  Running GSEA on factor: X_fa_14
  Running GSEA on factor: X_fa_15
  Running GSEA on factor: X_fa_16
  Running GSEA on factor: X_fa_17
  Running GSEA on factor: X_fa_18
  Running GSEA on factor: X_fa_19
  Running GSEA on factor: X_fa_20
  Running GSEA on factor: X_fa_21
  Running

In [None]:
# --- 1. Identify Top 10 Factors ---
print("Identifying top 10 factors based on Sum of Squared Loadings...")

# Load the transformed data to get the factor loadings
exp = experiment_manager.load_experiment(experiment_id)
dr_method = exp.config.get('dimension_reduction.method')
n_components = exp.config.get('dimension_reduction.n_components')
transformed_adata = sc.read_h5ad(exp.get_path('transformed_data', dr_method=dr_method, n_components=n_components))

loadings = transformed_adata.varm['FA_loadings']
loading_df = pd.DataFrame(loadings, index=transformed_adata.var_names, 
                          columns=[f'X_fa_{i+1}' for i in range(loadings.shape[1])])

# Calculate SS Loadings and get the top 10
ss_loadings = (loading_df ** 2).sum(axis=0).sort_values(ascending=False)
top_10_factors = ss_loadings.head(10).index.tolist()

print(f"Top 10 factors identified: {top_10_factors}")

Identifying top 10 factors based on Sum of Squared Loadings...
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Top 10 factors identified: ['X_fa_1', 'X_fa_2', 'X_fa_3', 'X_fa_5', 'X_fa_4', 'X_fa_6', 'X_fa_7', 'X_fa_8', 'X_fa_9', 'X_fa_10']


In [9]:
print("\n--- Running GSEA on Top 10 Factors ---")

# We will use the default Hallmark gene sets path you provided.
# You can override this by passing the gene_sets_path argument.
analyzer.run_gsea_on_factors(
    experiment_id=experiment_id,
    factor_names=top_10_factors
)

gsea_output_dir = exp.experiment_dir / "analysis" / "gsea_on_factors"
print(f"\nCompleted GSEA analysis.")
print(f"Results are saved in subdirectories within:\n{gsea_output_dir}")


--- Running GSEA on Top 10 Factors ---
--- Running GSEA on Top Unsupervised Factors for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
  Running GSEA on factor: X_fa_1
    Successfully ran GSEA for X_fa_1. Results in /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/factor_interpretation/gsea_on_top_DR_factors/X_fa_1
  Running GSEA on factor: X_fa_2
    Successfully ran GSEA for X_fa_2. Results in /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/factor_interpretation/gsea_on_top_DR_factors/X_fa_2
  Running GSEA on factor: X_fa_3
    Successfully ran GSEA for X_fa_3. Results in /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/factor_interpretation/gsea_on_top_DR_factors/X_fa_3
  Running GSEA on fact

### ORA and exploratory analysis on gene communality

In [10]:
# --- Calculate Communalities ---
# We can reuse the 'loading_df' from the previous cell
communalities = (loading_df ** 2).sum(axis=1).sort_values(ascending=False)

# Get top 100 best-explained genes
top_100_genes = communalities.head(100)

# Get bottom 100 worst-explained genes
bottom_100_genes = communalities.tail(100).sort_values(ascending=True)

# --- Display Results ---
print("--- Top 100 Genes (Best Explained by the FA Model) ---")
print(top_100_genes)
print("\n" + "="*50 + "\n")
print("--- Bottom 100 Genes (Worst Explained by the FA Model) ---")
print(bottom_100_genes)

--- Top 100 Genes (Best Explained by the FA Model) ---
RUNX2      0.997186
RTL8C      0.996426
RTL1       0.996293
S100A13    0.996238
RTKN       0.996097
             ...   
ONECUT3    0.990045
S100B      0.989970
ATP5F1D    0.989960
SPEF1      0.989959
NHSL1      0.989955
Length: 100, dtype: float64


--- Bottom 100 Genes (Worst Explained by the FA Model) ---
IGLV1-50      0.171423
HFE           0.303391
TRBJ1-4       0.329189
HEYL          0.382181
NPIPB13       0.388604
                ...   
KCNMA1-AS3    0.842713
TMEM270       0.842767
CRIP1         0.843929
COL23A1       0.844667
AC012447.1    0.844693
Length: 100, dtype: float64


In [11]:
analyzer.analyze_communality_extremes(experiment_id=experiment_id, n_genes=100)

--- Analyzing Communality Extremes for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Saved communality bar plot to /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/unsupervised_dr_analysis/top_bottom_communality_genes.png

Running Over-Representation Analysis (ORA)...
  Saved ORA results for Top 100 genes to /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/unsupervised_dr_analysis/ORA_results_Top_100_genes.csv
  Saved ORA results for Bottom 100 genes to /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/unsupervised_dr_analysis/ORA_results_Bottom_100_genes.csv


In [12]:
analyzer.analyze_communality_extremes(experiment_id=experiment_id, n_genes=100)
exp = experiment_manager.load_experiment(experiment_id)

# Now, plot the results from the generated CSVs
unsupervised_dir = exp.experiment_dir / "analysis" / "unsupervised_dr_analysis"

# Plot for Top 100 Genes
top_genes_csv = unsupervised_dir / "ORA_results_Top_100_genes.csv"
top_genes_plot = unsupervised_dir / "ORA_plot_Top_100_genes.png"
if top_genes_csv.exists():
    analyzer.plot_ora_results(top_genes_csv, top_genes_plot)

# Plot for Bottom 100 Genes
bottom_genes_csv = unsupervised_dir / "ORA_results_Bottom_100_genes.csv"
bottom_genes_plot = unsupervised_dir / "ORA_plot_Bottom_100_genes.png"
if bottom_genes_csv.exists():
    analyzer.plot_ora_results(bottom_genes_csv, bottom_genes_plot)

--- Analyzing Communality Extremes for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Saved communality bar plot to /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/unsupervised_dr_analysis/top_bottom_communality_genes.png

Running Over-Representation Analysis (ORA)...
  Saved ORA results for Top 100 genes to /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/unsupervised_dr_analysis/ORA_results_Top_100_genes.csv
  Saved ORA results for Bottom 100 genes to /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/unsupervised_dr_analysis/ORA_results_Bottom_100_genes.csv
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
  - Saved ORA plot to

## Generate 2-panels report per patient

In [13]:
print("Running LR-Lasso analysis...")
analyzer.generate_lasso_path_2panels_report(experiment_id)  
print("LR-Lasso analysis complete!")

# Check what was created
summary_plots_dir = experiment.get_path('summary_plots')
print(f"\nResults saved to: {summary_plots_dir}")
if summary_plots_dir.exists():
    print("Files created:")
    for file in summary_plots_dir.glob("*.png"):
        print(f"  - {file.name}")

Running LR-Lasso analysis...
--- Generating Standard Analysis Report for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Generating plot for patient: P01
  Saved plot to /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/summary_plots/patient_P01_metrics_and_coefficients.png
Generating plot for patient: P02
  Saved plot to /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/summary_plots/patient_P02_metrics_and_coefficients.png
Generating plot for patient: P03
  Saved plot to /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/summary_plots/patient_P03_metrics_and_coefficients.png
Generati

#### Generate multiVI-based static / selected FA-based dynamic UMAP for selected reg indices 

In [14]:
# Define which patients and alpha indices to analyze
# The keys are patient IDs, and the values are lists of 1-based alpha indices.
patient_indices_to_analyze = {
    "P01": [12, 13, 14],
    "P02": [14, 15, 16],
    "P03": [17], 
    "P04": [11, 12, 13, 14, 15],
    "P05": [9, 10, 11, 12, 13],
    "P06": [9, 10, 11, 12, 13, 14], 
    "P07": [10, 11, 12, 13],
    "P09": [11, 12, 13, 14, 15, 16, 17],
    "P13": [10, 11, 12, 13]
}

# Run the report generation
# This will create and save all the plots in a new directory inside your experiment folder.
analyzer.generate_classification_umap_report(
    experiment_id=experiment_id,
    patient_reg_strength_indices=patient_indices_to_analyze,
    static_umap_rep='X_multivi'  # Specify the obsm key for the static UMAP
)


print("Analysis complete. Check the 'analysis/classification_umaps' directory in your experiment folder.")

--- Generating Classification UMAP Report for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08

Processing patient: P01
  Generating Static UMAPs based on: 'X_multivi'
  Finished static UMAPs for patient P01.
  Generating Dynamic UMAPs based on selected 'fa' factors
  Finished dynamic UMAPs for patient P01.

Processing patient: P02
  Generating Static UMAPs based on: 'X_multivi'
  Finished static UMAPs for patient P02.
  Generating Dynamic UMAPs based on selected 'fa' factors
    Index 16: Only 1 active factor. Attempting fallback.
  Finished dynamic UMAPs for patient P02.

Processing patient: P03
  Generating Static UMAPs based on: 'X_multivi'
  Finished static UMAPs for patient P03.
  Generating Dynamic UMAPs based on selected 'fa' factors
    Index 17: Only 1 active factor. Attempting fallback.
  Finished dynamic UMAPs for patient P03.

Processing patient: P0

#### Generate JSON files from classification transition analysis between each pair of indicies picked

In [None]:
print("\n--- Running Classification Transition Analysis (generating the JSON files) ---")

patient_indices_to_analyze = {
    "P01": [12, 13, 14],
    "P02": [14, 15, 16],
    "P03": [17], 
    "P04": [11, 12, 13, 14, 15],
    "P05": [9, 10, 11, 12, 13],
    "P06": [9, 10, 11, 12, 13, 14], 
    "P07": [10, 11, 12, 13],
    "P09": [11, 12, 13, 14, 15, 16, 17],
    "P13": [10, 11, 12, 13]
}

for patient_id, indices in patient_indices_to_analyze.items():
    analyzer.analyze_classification_transitions(
        experiment_id=experiment_id,
        patient_id=patient_id,
        indices_to_check=indices
    )

print("\nTransition analysis complete for all specified patients.")


--- Running Classification Transition Analysis ---
--- Analyzing Classification Transitions for Patient P01 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
  Analyzing transition from index 14 to 13...
  Analyzing transition from index 13 to 12...

Successfully saved transition analysis to: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/models/classification/P01/classification_transitions.json
--- Analyzing Classification Transitions for Patient P02 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
  Analyzing transition from index 16 to 15...
  Analyzing transition from index 15 to 14...

Successfully saved transition analysis to: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/models/classification/P02/classification_transitions.json
--- Anal

In [4]:
print("\n--- Generating Detailed Reports for Each Transition ---")

# We can reuse the `patient_indices_to_analyze` dictionary.
for patient_id in patient_indices_to_analyze.keys():
    # This function will print a detailed report to the notebook output.
    # It also prepares the ground for saving more structured results if needed later.
    analyzer.report_on_classification_transitions(
        experiment_id=experiment_id,
        patient_id=patient_id
    )

print("\nFinished generating all transition reports.")


--- Generating Detailed Reports for Each Transition ---
--- Reporting on Classification Transitions for Patient P01 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08

ANALYZING TRANSITION: Index 14 -> 13
Discovery Factors: ['X_fa_10', 'X_fa_28', 'X_fa_32', 'X_fa_50', 'X_fa_57', 'X_fa_59', 'X_fa_67', 'X_fa_74', 'X_fa_92', 'X_fa_99']
Cells Flipped to Correct: 84

  Running GSEA on discovery factors...
--- Running GSEA on Top Unsupervised Factors for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
  Running GSEA on factor: X_fa_10
    Successfully ran GSEA for X_fa_10. Results in /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/factor_interpretation/gsea_on_top_DR_factors/X_fa_10
  Running GSEA on factor: X_fa_28
    Successfully ran GS

Test on low variance, high importance factors' GSEA strategy [1. on the original factor; 2. on the predictive loading]

In [5]:
patient_to_test = 'P05'
low_variance_factor = 'X_fa_63'
important_alpha_idx = 13

# --- A) Run standard GSEA on the low-variance factor ---
print(f"\n--- Running UNSUPERVISED GSEA on {low_variance_factor} ---")
analyzer.run_gsea_on_factors(
    experiment_id=experiment_id,
    factor_names=[low_variance_factor]
)

# --- B) Run SUPERVISED GSEA on the Predictive Loading ---
print(f"\n--- Running SUPERVISED GSEA for {patient_to_test} at alpha index {important_alpha_idx} ---")
analyzer.run_gsea_on_predictive_loading(
    experiment_id=experiment_id,
    patient_id=patient_to_test,
    alpha_index=important_alpha_idx
)

print("\n--- Comparison Complete ---")
print(f"1. Check the GSEA plot for '{low_variance_factor}' in 'analysis/factor_interpretation/gsea_on_top_DR_factors/'")
print(f"2. Check the GSEA plot in 'analysis/supervised_gsea/{patient_to_test}/alpha_idx_{important_alpha_idx}/'")
print("3. Compare the NES scores and p-values. Pathways should be more significant in the supervised result.")


--- Running UNSUPERVISED GSEA on X_fa_63 ---
--- Running GSEA on Top Unsupervised Factors for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
  Running GSEA on factor: X_fa_63
    Successfully ran GSEA for X_fa_63. Results in /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/factor_interpretation/gsea_on_top_DR_factors/X_fa_63

--- Running SUPERVISED GSEA for P05 at alpha index 13 ---
--- Running GSEA on Predictive Loading for Patient P05 at alpha index 13 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
  Calculating predictive loading vector...
  Running GSEA...
  Successfully ran supervised GSEA. Results in /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/supervised_gsea/P05/a

#### Overall across patient MRD heterogeneity

In [3]:
experiment_id = "20250714_205422_fa_100_random_6dbbde08" # Replace with your actual experiment ID

# Generate the accuracy vs. number of factors plot
analyzer.plot_accuracy_vs_n_factors_summary(
    experiment_id=experiment_id,
    metric='overall_accuracy'  # You can change this to 'roc_auc', 'mal_accuracy', etc.
)

# You can also generate one for ROC AUC
analyzer.plot_accuracy_vs_n_factors_summary(
    experiment_id=experiment_id,
    metric='roc_auc'
)

--- Generating Accuracy vs. Number of Factors Summary for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Saved summary plot to /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/summary_plots/summary_accuracy_vs_n_factors_overall_accuracy.png
--- Generating Accuracy vs. Number of Factors Summary for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Saved summary plot to /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_ra

In [4]:
# You can also generate one for ROC AUC
analyzer.plot_accuracy_vs_n_factors_summary(
    experiment_id=experiment_id,
    metric='mal_accuracy'
)

--- Generating Accuracy vs. Number of Factors Summary for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Saved summary plot to /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/summary_plots/summary_accuracy_vs_n_factors_mal_accuracy.png


In [3]:
# Generate the bar plot for factors needed for good performance
analyzer.plot_factors_for_good_accuracy_summary(
    experiment_id=experiment_id,
    margin=0.0 # Default: accuracy just needs to be > trivial accuracy
)

'''
# Example with a stricter margin
analyzer.plot_factors_for_good_accuracy_summary(
    experiment_id=experiment_id,
    margin=0.05 # Accuracy needs to be at least 5% > trivial accuracy
)
'''

--- Generating 'Factors for Good Accuracy' Summary for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Saved summary bar plot to /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/summary_plots/summary_factors_for_good_accuracy.png


'\n# Example with a stricter margin\nanalyzer.plot_factors_for_good_accuracy_summary(\n    experiment_id=experiment_id,\n    margin=0.05 # Accuracy needs to be at least 5% > trivial accuracy\n)\n'

In [4]:
experiment_id = "20250714_205422_fa_100_random_6dbbde08"
analyzer.plot_patient_cluster_w_factor_coefs_at_best_reg_strength(
    experiment_id=experiment_id
)

--- Clustering Patients by Factor Coefficients for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Saved patient clustering heatmap to /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/summary_plots/patient_factor_coefficient_clustermap.png


#### Visualize YH's DEG genes in factor space

In [3]:
experiment_id = "20250714_205422_fa_100_random_6dbbde08" # Replace with your actual experiment ID
genes_to_visualize = ['PTH2R', 'GNG11', 'SH2D1A', 'DLGAP2', 'ST3GAL1']

# Generate the gene visualization report
analyzer.visualize_gene_in_factor_space(
    experiment_id=experiment_id,
    genes_of_interest=genes_to_visualize
)

--- Visualizing Genes in Factor Space for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08

--- Communality Scores ---
  - PTH2R: Not found in the model's gene list.
  - GNG11: Not found in the model's gene list.
  - SH2D1A: Not found in the model's gene list.
  - DLGAP2: 0.9390
  - ST3GAL1: Not found in the model's gene list.

Saved gene loading heatmap to: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/unsupervised_dr_analysis/gene_loadings_heatmap_DLGAP2.png


---
### Deeper Dive into Model Structure

To better understand patient heterogeneity, we will now move beyond looking at factor coefficients and analyze the model in two new ways:

1.  **Factor Similarity Analysis:** We will check if the model has learned redundant factors. If different factors are built from the same genes, they are biologically equivalent. We will analyze this with two methods: direct correlation of factor loading vectors and the overlap of their top contributing genes.
2.  **Patient Clustering in Gene Space:** To get a more biologically meaningful grouping of patients, we will calculate a "predictive gene signature" for each patient (`signature = Loadings @ Coefficients`). We will then cluster the patients based on these gene-level signatures. This should reveal groups of patients who rely on similar biological programs for classification, even if they use different combinations of factors to get there.

---

In [3]:
# --- Set the experiment ID ---
experiment_id = "20250714_205422_fa_100_random_6dbbde08" # Replace with your actual experiment ID

# Analyze the similarity between the learned factors
analyzer.analyze_factor_similarity(
    experiment_id=experiment_id,
    top_n_genes=200 # Number of top genes to use for the overlap calculation
)

--- Analyzing Factor Similarity for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
  Calculating factor-factor correlation...
  Saved factor correlation heatmap to: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/unsupervised_dr_analysis/factor_loading_correlation_heatmap.png
  Calculating top 200 gene overlap (Jaccard Index)...
  Saved gene overlap heatmap to: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/unsupervised_dr_analysis/factor_top_gene_overlap_heatmap.png


In [4]:
# --- Use the same experiment ID ---
experiment_id = "20250714_205422_fa_100_random_6dbbde08" # Replace with your actual experiment ID

# Cluster patients based on their "predictive loading" in gene space
analyzer.plot_patient_cluster_in_gene_space(
    experiment_id=experiment_id,
    n_top_genes=100 # Use the top 100 most variable genes for the heatmap
)

--- Clustering Patients in Gene Space for Experiment: 20250714_205422_fa_100_random_6dbbde08 ---
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Loading experiment from: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08
Clustering patients based on the top 100 most variable signature genes.
Saved gene-space clustering heatmap to: /home/minhang/mds_project/sc_classification/experiments/20250714_205422_fa_100_random_6dbbde08/analysis/summary_plots/patient_gene_space_clustermap.png
