In [17]:
import pandas as pd
import numpy as np
import scanpy as sc
import statsmodels.formula.api as smf
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, pairwise_distances
from sklearn.preprocessing import StandardScaler

In [2]:
# --- Initial Data Loading and Setup (as before) ---
adata = sc.read_h5ad('/home/minhang/mds_project/data/cohort_adata/multiVI_model/adata_multivi_corrected_rna.h5ad')
adata_preSCT = adata[adata.obs['timepoint_type'] == 'preSCT'].copy()

latent_df = pd.DataFrame(
    adata_preSCT.obsm['X_multivi'],
    columns=[f'LatentFactor_{i+1}' for i in range(23)]
)
latent_df['sample'] = adata_preSCT.obs['sample'].values
latent_df['patient'] = adata_preSCT.obs['patient'].values

In [5]:
# --- Step 1: Feature Engineering - Create 3 Summary Matrices ---

print("--- Generating Feature Matrices ---")
# Hypothesis 1: Average Cellular State (Mean)
summary_matrix_mean = latent_df.groupby('sample').mean(numeric_only=True)

# Hypothesis 2: Cellular Heterogeneity (L2 Spread - Standard Deviation)
summary_matrix_l2 = latent_df.groupby('sample').std(numeric_only=True)

# Hypothesis 3: Cellular Heterogeneity (L1 Spread - Mean Absolute Deviation from the median)
def l1_spread(series):
    return np.mean(np.abs(series - series.median()))

numeric_cols = [f'LatentFactor_{i+1}' for i in range(23)]
summary_matrix_l1 = latent_df.groupby('sample')[numeric_cols].agg(l1_spread)


--- Generating Feature Matrices ---


  summary_matrix_mean = latent_df.groupby('sample').mean(numeric_only=True)
  summary_matrix_l2 = latent_df.groupby('sample').std(numeric_only=True)
  summary_matrix_l1 = latent_df.groupby('sample')[numeric_cols].agg(l1_spread)


In [6]:
print(f"Mean matrix shape: {summary_matrix_mean.shape}")
print(f"L2 Spread (std) matrix shape: {summary_matrix_l2.shape}")
print(f"L1 Spread (mad) matrix shape: {summary_matrix_l1.shape}")

Mean matrix shape: (19, 23)
L2 Spread (std) matrix shape: (19, 23)
L1 Spread (mad) matrix shape: (19, 23)


In [7]:
def run_prognostic_analysis(feature_matrix, sample_info, responder_map):
    """
    Runs a full LOGO cross-validation and bootstrap analysis for a given feature set.
    """
    # Add patient and status info to the feature matrix
    feature_matrix = feature_matrix.join(sample_info[['patient']])
    feature_matrix['status'] = feature_matrix['patient'].map(responder_map)
    feature_matrix.dropna(inplace=True) 

    # Prepare data for scikit-learn
    X = feature_matrix.drop(columns=['patient', 'status'])
    y = (feature_matrix['status'] == 'Non-Responder').astype(int)
    groups = feature_matrix['patient']

    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Setup the model and Leave-One-Group-Out cross-validator
    model = LogisticRegression(penalty='l1', solver='liblinear', C=0.5, class_weight='balanced')
    logo = LeaveOneGroupOut()

    # Store predictions from LOGO
    y_true_logo = []
    y_pred_probs_logo = []

    for train_index, test_index in logo.split(X_scaled, y, groups):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        model.fit(X_train, y_train)
        pred_prob = model.predict_proba(X_test)[:, 1]
        
        y_true_logo.extend(y_test.values)
        y_pred_probs_logo.extend(pred_prob)

    # Calculate the overall AUROC from all LOGO predictions
    auroc_pilot = roc_auc_score(y_true_logo, y_pred_probs_logo)

    # Calculate Standard Error using bootstrapping
    n_bootstraps = 1000
    bootstrapped_aurocs = []
    for _ in range(n_bootstraps):
        indices = np.random.choice(len(y_true_logo), size=len(y_true_logo), replace=True)
        if len(np.unique(np.array(y_true_logo)[indices])) < 2:
            continue
        resampled_auroc = roc_auc_score(np.array(y_true_logo)[indices], np.array(y_pred_probs_logo)[indices])
        bootstrapped_aurocs.append(resampled_auroc)

    se_auroc_pilot = np.std(bootstrapped_aurocs)
    
    return {"auroc": auroc_pilot, "se": se_auroc_pilot}

In [8]:
# --- Step 3: Run the Analysis for Each Hypothesis ---

sample_info = latent_df[['sample', 'patient']].drop_duplicates().set_index('sample')
responder_map = {
    'P10': 'Responder', 'P11': 'Responder', 'P12': 'Responder',
    'P09': 'Non-Responder', 'P13': 'Non-Responder', 'P04': 'Non-Responder',
    'P05': 'Non-Responder', 'P06': 'Non-Responder', 'P07': 'Non-Responder',
    'P08': 'Non-Responder', 'P02': 'Non-Responder', 'P01': 'Non-Responder'
}


In [9]:
print("\n--- Running Analyses ---")
results_mean = run_prognostic_analysis(summary_matrix_mean, sample_info, responder_map)
results_l2 = run_prognostic_analysis(summary_matrix_l2, sample_info, responder_map)
results_l1 = run_prognostic_analysis(summary_matrix_l1, sample_info, responder_map)


--- Running Analyses ---


In [10]:
# --- Step 4: Display Results ---

print("\n--- Comparative Results ---")
results_df = pd.DataFrame({
    'Mean State': results_mean,
    'L2 Spread (StDev)': results_l2,
    'L1 Spread (MAD)': results_l1
}).T
results_df['95% CI Half-Width'] = results_df['se'] * 1.96

print(results_df.round(4))


--- Comparative Results ---
                    auroc      se  95% CI Half-Width
Mean State         0.9167  0.0704             0.1380
L2 Spread (StDev)  0.2500  0.1114             0.2184
L1 Spread (MAD)    0.7708  0.1374             0.2693


Cell type based

In [13]:
numeric_cols = [f'LatentFactor_{i+1}' for i in range(23)]
latent_df = pd.DataFrame(adata_preSCT.obsm['X_multivi'], columns=numeric_cols)
latent_df['sample'] = adata_preSCT.obs['sample'].values
latent_df['patient'] = adata_preSCT.obs['patient'].values
latent_df['cell_type'] = adata_preSCT.obs['predicted.annotation'].values

In [26]:
def run_prognostic_analysis(feature_matrix, sample_info, responder_map):
    """
    Runs a full LOGO cross-validation and bootstrap analysis for a given feature set.
    Handles scaling and NaN imputation correctly inside the CV loop.
    """
    feature_matrix = feature_matrix.join(sample_info[['patient']])
    feature_matrix['status'] = feature_matrix['patient'].map(responder_map)
    feature_matrix.dropna(inplace=True, subset=['status'])

    X = feature_matrix.drop(columns=['patient', 'status'])
    y = (feature_matrix['status'] == 'Non-Responder').astype(int)
    groups = feature_matrix['patient']

    model = LogisticRegression(penalty='l1', solver='liblinear', C=0.5, class_weight='balanced')
    logo = LeaveOneGroupOut()

    y_true_logo, y_pred_probs_logo = [], []
    
    for train_index, test_index in logo.split(X, y, groups):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # THE FIX: Scaler is now INSIDE the loop
        scaler = StandardScaler()
        
        # Fit the scaler ONLY on the training data
        X_train_scaled = scaler.fit_transform(X_train)
        # Transform the test data using the scaler fitted on the training data
        X_test_scaled = scaler.transform(X_test)
        
        # THE FIX PART 2: Handle NaNs created by zero-variance columns
        # Replace any NaNs that resulted from scaling with 0
        X_train_scaled = np.nan_to_num(X_train_scaled)
        X_test_scaled = np.nan_to_num(X_test_scaled)
        
        model.fit(X_train_scaled, y_train)
        pred_prob = model.predict_proba(X_test_scaled)[:, 1]
        
        y_true_logo.extend(y_test.values)
        y_pred_probs_logo.extend(pred_prob)

    auroc_pilot = roc_auc_score(y_true_logo, y_pred_probs_logo)

    n_bootstraps = 1000
    bootstrapped_aurocs = []
    for _ in range(n_bootstraps):
        indices = np.random.choice(len(y_true_logo), size=len(y_true_logo), replace=True)
        if len(np.unique(np.array(y_true_logo)[indices])) < 2:
            continue
        resampled_auroc = roc_auc_score(np.array(y_true_logo)[indices], np.array(y_pred_probs_logo)[indices])
        bootstrapped_aurocs.append(resampled_auroc)

    se_auroc_pilot = np.std(bootstrapped_aurocs)
    return {"auroc": auroc_pilot, "se": se_auroc_pilot}

In [27]:
# --- Step 1: Feature Engineering - Test 5 Different Hypotheses ---

print("--- Generating All Feature Matrices ---")

# --- Overall (Global) Features ---
summary_matrix_mean_overall = latent_df.groupby('sample')[numeric_cols].mean()
summary_matrix_l2_overall = latent_df.groupby('sample')[numeric_cols].std()
def l1_spread_univariate(series):
    return np.mean(np.abs(series - series.median()))
summary_matrix_l1_overall = latent_df.groupby('sample')[numeric_cols].agg(l1_spread_univariate)

# --- Cell-Type-Specific Features (Multivariate Spread) ---
def l2_spread_multivariate(df):
    # Calculate the centroid (mean vector) of the cell cloud
    centroid = df.mean(axis=0).values.reshape(1, -1)
    # Calculate Euclidean distance from each cell to the centroid, then average
    distances = pairwise_distances(df, centroid, metric='euclidean')
    return np.mean(distances)

def l1_spread_multivariate(df):
    # Calculate the centroid
    centroid = df.mean(axis=0).values.reshape(1, -1)
    # Calculate Manhattan distance from each cell to the centroid, then average
    distances = pairwise_distances(df, centroid, metric='manhattan')
    return np.mean(distances)


--- Generating All Feature Matrices ---


  summary_matrix_mean_overall = latent_df.groupby('sample')[numeric_cols].mean()
  summary_matrix_l2_overall = latent_df.groupby('sample')[numeric_cols].std()
  summary_matrix_l1_overall = latent_df.groupby('sample')[numeric_cols].agg(l1_spread_univariate)


In [28]:
# Group by sample and cell_type, then apply our multivariate functions
grouped = latent_df.groupby(['sample', 'cell_type'])[numeric_cols]
celltype_l2_long = grouped.apply(l2_spread_multivariate)
celltype_l1_long = grouped.apply(l1_spread_multivariate)

# Unstack to get the wide format (13 features per sample)
summary_matrix_l2_celltype = celltype_l2_long.unstack(level='cell_type')
summary_matrix_l1_celltype = celltype_l1_long.unstack(level='cell_type')

print(f"Cell-Type L2 Spread matrix shape: {summary_matrix_l2_celltype.shape}")
print(f"Cell-Type L1 Spread matrix shape: {summary_matrix_l1_celltype.shape}")

  grouped = latent_df.groupby(['sample', 'cell_type'])[numeric_cols]


Cell-Type L2 Spread matrix shape: (19, 13)
Cell-Type L1 Spread matrix shape: (19, 13)


In [36]:
celltype_l2_wide_dropped = summary_matrix_l2_celltype.dropna(axis=1)
print(f"Shape after dropping NaN columns: {celltype_l2_wide_dropped.shape}")

celltype_l1_wide_dropped = summary_matrix_l1_celltype.dropna(axis=1)
print(f"Shape after dropping NaN columns: {celltype_l1_wide_dropped.shape}")

Shape after dropping NaN columns: (19, 9)
Shape after dropping NaN columns: (19, 9)


In [37]:
sample_info = latent_df[['sample', 'patient']].drop_duplicates().set_index('sample')
responder_map = {
    'P10': 'Responder', 'P11': 'Responder', 'P12': 'Responder',
    'P09': 'Non-Responder', 'P13': 'Non-Responder', 'P04': 'Non-Responder',
    'P05': 'Non-Responder', 'P06': 'Non-Responder', 'P07': 'Non-Responder',
    'P08': 'Non-Responder', 'P02': 'Non-Responder', 'P01': 'Non-Responder'
}

print("\n--- Running All Analyses ---")
# Overall metrics
#results_mean_overall = run_prognostic_analysis(summary_matrix_mean_overall, sample_info, responder_map)
#results_l2_overall = run_prognostic_analysis(summary_matrix_l2_overall, sample_info, responder_map)
#results_l1_overall = run_prognostic_analysis(summary_matrix_l1_overall, sample_info, responder_map)
# Cell-type-specific metrics
results_l2_celltype = run_prognostic_analysis(celltype_l2_wide_dropped, sample_info, responder_map)
results_l1_celltype = run_prognostic_analysis(celltype_l1_wide_dropped, sample_info, responder_map)

# --- Step 3: Display Final Comparative Results ---

print("\n--- Final Comparative Results ---")
final_results_df = pd.DataFrame({
    #'Overall Mean State': results_mean_overall,
    #'Overall L1 Spread (Univariate)': results_l1_overall,
    #'Overall L2 Spread (Univariate)': results_l2_overall,
    'Cell-Type L1 Spread (Multivariate)': results_l1_celltype,
    'Cell-Type L2 Spread (Multivariate)': results_l2_celltype,
}).T
final_results_df['95% CI Half-Width'] = final_results_df['se'] * 1.96

print(final_results_df.round(4))


--- Running All Analyses ---

--- Final Comparative Results ---
                                     auroc      se  95% CI Half-Width
Cell-Type L1 Spread (Multivariate)  0.1458  0.1093             0.2143
Cell-Type L2 Spread (Multivariate)  0.0417  0.0571             0.1119
