In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import os 



In [2]:
# --- Configuration (match these with your pipeline script if paths are shared) ---
# Input: the AnnData file that step 1 reads (absolute, machine-specific path).
ADATA_PATH = '/home/minhang/mds_project/data/cohort_adata/multiVI_model/adata_multivi_corrected_rna.h5ad'
# Output: where the HVG-subset, standardized AnnData is written in step 5.
PREPROCESSED_ADATA_OUTPUT_PATH = '/home/minhang/mds_project/data/cohort_adata/multiVI_model/adata_mrd_hvg_std_may31.h5ad' # Choose your output path

# obs column compared against MRD_LABEL to pick the cells to keep.
TIMEPOINT_COL = 'timepoint_type'
MRD_LABEL = 'MRD'
# obs column holding the per-cell label; only rows whose value is one of the
# two TARGET_VALUE_* constants below survive the filter.
TARGET_COL = 'CN.label' # Used for filtering, not for scaling targets
TARGET_VALUE_POSITIVE = 'cancer'
TARGET_VALUE_NEGATIVE = 'normal'
# Upper bound on the number of genes kept by step 3; if the data has no more
# genes than this, all genes are kept.
N_TOP_GENES = 3000 # Number of highly variable genes

# --- 1. Load Full Data ---
# Read the complete AnnData object from ADATA_PATH and report its size
# (shape[0] is printed as the cell count, shape[1] as the feature count).
print(f"Loading AnnData from: {ADATA_PATH}")
adata_full = sc.read_h5ad(ADATA_PATH)
print(f"Full AnnData loaded: {adata_full.shape[0]} cells, {adata_full.shape[1]} features")

# --- 2. Filter for MRD Analysis ---
# Keep a cell only when BOTH hold:
#   * its TIMEPOINT_COL value equals MRD_LABEL, and
#   * its TARGET_COL value is present and is one of the two accepted labels.
print(f"\nFiltering for '{MRD_LABEL}' timepoint and cells labeled as '{TARGET_VALUE_POSITIVE}' or '{TARGET_VALUE_NEGATIVE}'.")
mrd_mask = (adata_full.obs[TIMEPOINT_COL] == MRD_LABEL)
valid_target_mask = (
    adata_full.obs[TARGET_COL].notna()
    & adata_full.obs[TARGET_COL].isin([TARGET_VALUE_POSITIVE, TARGET_VALUE_NEGATIVE])
)
# .copy() detaches the subset from adata_full so later steps work on their own data.
adata_mrd_filtered = adata_full[mrd_mask & valid_target_mask].copy()
print(f"Filtered AnnData for MRD analysis: {adata_mrd_filtered.n_obs} cells, {adata_mrd_filtered.n_vars} initial genes.")

# Nothing downstream makes sense on an empty selection, so stop here.
if not adata_mrd_filtered.n_obs:
    raise ValueError("No cells found after initial filtering for MRD and target labels.")

# --- 3. Select Highly Variable Genes (HVGs) ---
def _has_negative_and_fractional_values(matrix):
    """Return True if ``matrix`` holds at least one negative AND at least one non-integer value.

    Accepts a dense array or a scipy-style sparse matrix. Applying ``%`` /
    ``np.any`` directly to a sparse matrix is not a dependable element-wise
    test, so for sparse input only the explicitly stored entries are inspected.
    Implicit zeros are neither negative nor fractional, so skipping them cannot
    change the answer.
    """
    if hasattr(matrix, 'toarray'):
        stored_values = matrix.tocsr().data
    else:
        stored_values = np.asarray(matrix)
    return bool(np.any(stored_values < 0) and np.any(stored_values % 1 != 0))


def _subset_to_top_variance_genes(adata, n_top):
    """Return a copy of ``adata`` restricted to its ``n_top`` highest-variance genes.

    Variance is the plain per-column variance of ``adata.X`` (densified first
    when sparse). Genes are selected with the index order ``np.argsort``
    yields, i.e. from lower to higher variance among the selected set.
    """
    dense_matrix = adata.X.toarray() if hasattr(adata.X, 'toarray') else adata.X
    per_gene_variance = np.var(dense_matrix, axis=0)
    top_indices = np.argsort(per_gene_variance)[-n_top:]
    return adata[:, top_indices].copy()


print(f"\nSelecting top {N_TOP_GENES} highly variable genes...")
adata_mrd_hvg = adata_mrd_filtered.copy() # Work on a copy for HVG selection

if N_TOP_GENES < adata_mrd_hvg.n_vars:
    try:
        # NOTE(review): the message below says "negative/decimal", but the test
        # requires negatives AND decimals, so non-negative fractional data still
        # goes to 'seurat_v3', which is documented as expecting count data.
        # Left unchanged so the selected gene set stays the same - confirm intent.
        if _has_negative_and_fractional_values(adata_mrd_hvg.X):
            print("Data contains negative/decimal values. Using simple variance for HVG selection.")
            adata_mrd_hvg = _subset_to_top_variance_genes(adata_mrd_hvg, N_TOP_GENES)
        else: # Attempt seurat_v3 if data seems more count-like or positive log-transformed
            print("Attempting HVG selection with flavor 'seurat_v3'.")
            sc.pp.highly_variable_genes(
                adata_mrd_hvg,
                n_top_genes=N_TOP_GENES,
                flavor='seurat_v3', # Be mindful of data type requirements for this flavor
                subset=True
            )
        print(f"Selected {adata_mrd_hvg.n_vars} HVGs.")
    except Exception as e_hvg:
        # Deliberate best-effort: any failure above (including a failure inside
        # scanpy) is reported and replaced by the simple-variance ranking.
        print(f"Error during HVG selection: {e_hvg}. Falling back to simple variance.")
        adata_mrd_hvg = _subset_to_top_variance_genes(adata_mrd_hvg, N_TOP_GENES)
        print(f"Selected top {adata_mrd_hvg.n_vars} HVGs by simple variance.")
else:
    print("Number of genes is less than or equal to N_TOP_GENES, using all genes.")
print(f"Data shape after HVG selection: {adata_mrd_hvg.shape}")

# --- 4. Standardize HVG Data ---
print(f"\nStandardizing HVG data (genes to mean=0, variance=1)...")
# The scaler is fed a dense matrix: sparse input is densified, dense input is
# copied so the scaler never sees adata_mrd_hvg's own array.
X_for_scaling = (
    adata_mrd_hvg.X.toarray()
    if hasattr(adata_mrd_hvg.X, 'toarray')
    else adata_mrd_hvg.X.copy()
)

if X_for_scaling.shape[1] == 0:
    # Zero columns: nothing to fit, so pass the data through unchanged.
    # NOTE: `scaler` and `X_scaled` are not assigned on this path.
    print("No features to scale. Skipping standardization.")
    adata_mrd_hvg_std = adata_mrd_hvg.copy()
else:
    # Column-wise centering and scaling, fitted on this very matrix.
    scaler = StandardScaler(with_mean=True)
    X_scaled = scaler.fit_transform(X_for_scaling)

    # The scaled values go into a separate AnnData so adata_mrd_hvg keeps the
    # unscaled HVG matrix; all annotations are carried over by the copy.
    adata_mrd_hvg_std = adata_mrd_hvg.copy()
    adata_mrd_hvg_std.X = X_scaled

    # The fitted scaler could also be stashed on the object if an
    # inverse_transform is ever needed:
    # adata_mrd_hvg_std.uns['gene_scaler'] = scaler
    print("Standardization complete.")
    print(f"Shape of standardized data: {adata_mrd_hvg_std.shape}")


Loading AnnData from: /home/minhang/mds_project/data/cohort_adata/multiVI_model/adata_multivi_corrected_rna.h5ad
Full AnnData loaded: 192149 cells, 36601 features

Filtering for 'MRD' timepoint and cells labeled as 'cancer' or 'normal'.
Filtered AnnData for MRD analysis: 69801 cells, 36601 initial genes.

Selecting top 3000 highly variable genes...
Attempting HVG selection with flavor 'seurat_v3'.




Selected 3000 HVGs.
Data shape after HVG selection: (69801, 3000)

Standardizing HVG data (genes to mean=0, variance=1)...
Standardization complete.
Shape of standardized data: (69801, 3000)


In [11]:
import pickle

In [12]:
model_objects_save_dir = '/home/minhang/mds_project/sc_classification/pipeline/'
# open(..., 'wb') does not create missing folders, so make sure the target
# directory exists before writing into it.
os.makedirs(model_objects_save_dir, exist_ok=True)

# Persist the `scaler` object so the same scaling can be applied later.
# Pickle files must only ever be loaded from a trusted location.
scaler_path = os.path.join(model_objects_save_dir, "mrd_std_scaler_may31.pkl")
with open(scaler_path, 'wb') as f:
    pickle.dump(scaler, f)

In [13]:
# Save the names of the genes kept in the standardized AnnData, as a plain
# Python list in the order they appear in `var_names`.
hvg_genes = adata_mrd_hvg_std.var_names.tolist()
# open(..., 'wb') does not create missing folders, so make sure the target
# directory exists before writing into it.
os.makedirs(model_objects_save_dir, exist_ok=True)
hvg_genes_path = os.path.join(model_objects_save_dir, "mrd_hvg_genes_may31.pkl")
with open(hvg_genes_path, 'wb') as f:
    pickle.dump(hvg_genes, f)

In [3]:
import os

In [4]:
# --- 5. Save Preprocessed Data ---
# Ensure output directory exists.
# os.path.dirname() returns '' when the path is a bare filename; os.makedirs('')
# would raise, so the directory step is skipped in that case (the file then
# lands in the current working directory).
output_dir = os.path.dirname(PREPROCESSED_ADATA_OUTPUT_PATH)
if output_dir and not os.path.exists(output_dir):
    # exist_ok=True keeps this safe if the directory appears between the
    # check above and the creation call.
    os.makedirs(output_dir, exist_ok=True)
    print(f"Created directory: {output_dir}")

adata_mrd_hvg_std.write_h5ad(PREPROCESSED_ADATA_OUTPUT_PATH)
print(f"\nPreprocessed and standardized AnnData saved to: {PREPROCESSED_ADATA_OUTPUT_PATH}")


Preprocessed and standardized AnnData saved to: /home/minhang/mds_project/data/cohort_adata/multiVI_model/adata_mrd_hvg_std_may31.h5ad


In [5]:
# Sanity check on the standardization: variance of every column (gene) of the
# standardized matrix, taken across rows (axis=0).
gene_variances = np.var(adata_mrd_hvg_std.X, axis=0)

In [6]:
# Summary statistics of the per-gene variances; after standardization every
# value is expected to sit very close to 1.
print(pd.Series(gene_variances).describe())

count    3000.000000
mean        0.999988
std         0.000022
min         0.999698
25%         0.999985
50%         0.999990
75%         0.999994
max         1.000798
dtype: float64


In [8]:
# Read the saved file back from disk to confirm it loads.
# NOTE(review): this literal repeats the value of PREPROCESSED_ADATA_OUTPUT_PATH
# from the configuration cell; the two must be kept in sync by hand.
PREPROCESSED_ADATA_PATH = '/home/minhang/mds_project/data/cohort_adata/multiVI_model/adata_mrd_hvg_std_may31.h5ad'
adata_mrd_preprocessed = sc.read_h5ad(PREPROCESSED_ADATA_PATH)

In [9]:
# Display the reloaded object's summary (dimensions and annotation fields).
adata_mrd_preprocessed

AnnData object with n_obs × n_vars = 69801 × 3000
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'library', 'exp.ID', 'percent.mt', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_dsb', 'nFeature_dsb', 'nCount_ADT', 'nFeature_ADT', 'hash.ID', 'scDblFinder.score', 'scDblFinder.weighted', 'scDblFinder.cxds_score', 'Lane', 'patient', 'marker', 'Time', 'batch', 'Tech', 'sample', 'source', 'soup.singlet_posterior', '_indices', '_scvi_batch', '_scvi_labels', 'CN.label', 'predicted.annotation.score', 'predicted.annotation', 'predicted.pseudotime.score', 'predicted.pseudotime', 'timepoint_type'
    var: 'ID', 'modality', 'chr', 'start', 'end', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'CN.label_colors', 'Tech_colors', 'draw_graph', 'hvg', 'neighbors', 'patient_colors', 'predicted.annotation_colors', 'sample_colors', 'timepoint_type_colors', 'umap'
    obsm: 'X_draw_graph_fa', 'X_multivi', 'X_umap'
    obsp: 'connectivities', 'distances'