In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler

# Dataset accession and the directory that holds the processed files
gse_id = "GSE2034"
DATA_PROCESSED = Path("../data/processed")

# Read back the gene-level expression matrix written in Step 1
expr_gene_path = DATA_PROCESSED / f"{gse_id}_expr_gene.parquet"
expr_gene = pd.read_parquet(expr_gene_path)
print("Original gene-level shape:", expr_gene.shape)

Original gene-level shape: (13237, 286)


In [2]:
# Shift every value by a pseudocount of 1 so that zeros map to log2(1) = 0
# instead of -inf, then move the matrix to the log2 scale.
expr_shifted = expr_gene.add(1)
expr_log = np.log2(expr_shifted)
print("After log2 transform:", expr_log.shape)

After log2 transform: (13237, 286)


In [3]:
# Impute each missing value with the median of its own row (gene) across samples.
#
# DataFrame.fillna(Series) matches the Series labels against *column* names, so
# passing the per-row medians (labelled by the row index) to fillna would leave
# every NaN in place. DataFrame.where(..., axis=0) aligns the medians on the row
# index instead, which is what row-wise imputation needs.
# Rows that are entirely NaN have a NaN median and therefore stay NaN.
missing_count = int(expr_log.isna().to_numpy().sum())  # NaNs present before imputation
gene_median = expr_log.median(axis=1)
expr_filled = expr_log.where(expr_log.notna(), gene_median, axis=0)
print("Missing values filled:", missing_count)

Missing values filled: 0


In [4]:
from sklearn.preprocessing import StandardScaler

# StandardScaler standardises each *column* of the array it is given.
# Transposing first puts the rows of expr_filled into columns, so every row
# (gene) is z-scored across the samples; the result is transposed back so that
# expr_scaled keeps the same orientation and labels as expr_filled.
scaler = StandardScaler()
scaled_by_row = scaler.fit_transform(expr_filled.T)

expr_scaled = pd.DataFrame(
    scaled_by_row.T,
    index=expr_filled.index,
    columns=expr_filled.columns,
)

In [5]:
# Keep the most variable half of the genes.
#
# The variance is measured on expr_filled (log scale, before standardisation).
# expr_scaled has already been z-scored row by row, so every non-constant row
# has essentially the same variance there and a quantile cut on it would only
# rank floating-point noise. The mask is then applied to expr_scaled, which
# shares its row index with expr_filled.
VARIANCE_QUANTILE = 0.50  # rows whose variance falls below this quantile are dropped
gene_variance = expr_filled.var(axis=1)

threshold = gene_variance.quantile(VARIANCE_QUANTILE)
expr_filtered = expr_scaled.loc[gene_variance >= threshold]

In [6]:
# Write the filtered, scaled matrix to the processed-data directory in both
# Parquet and CSV form. The file names are derived from gse_id (as the input
# file name is) so that changing the dataset id cannot overwrite another
# dataset's output.
preprocessed_stem = f"{gse_id}_preprocessed"
expr_filtered.to_parquet(DATA_PROCESSED / f"{preprocessed_stem}.parquet")
expr_filtered.to_csv(DATA_PROCESSED / f"{preprocessed_stem}.csv")