# Step 2: Machine Learning
Random Forest baseline for cell-type classification and biomarker ranking.

In [None]:
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

# Location of the processed single-cell dataset; the error message below
# points at 01_single_cell_analysis.ipynb as the step expected to create it.
adata_path = Path('data/processed/human_breast_cancer_single_cell.h5ad')
if not adata_path.exists():
    raise FileNotFoundError('Run 01_single_cell_analysis.ipynb first')

adata = sc.read_h5ad(str(adata_path))

# Build the feature matrix: anything exposing ``toarray`` (e.g. a sparse
# matrix) is converted through it, otherwise the stored matrix is used as-is.
if hasattr(adata.X, 'toarray'):
    X = adata.X.toarray()
else:
    X = adata.X

# Hand-written annotation: Leiden cluster id (as a string) -> cell-type name.
cluster_to_celltype = {
    '0': 'Tumor cells',
    '1': 'T cells',
    '2': 'Fibroblasts',
    '3': 'Endothelial cells',
    '4': 'Macrophages',
    '5': 'NK cells',
    '6': 'B cells',
    '7': 'DC',
}

# Translate every cell's cluster id into its cell-type label; ids that have
# no entry in the mapping are labelled 'Unknown'.
adata.obs['cell_type'] = [
    cluster_to_celltype.get(cluster_id, 'Unknown')
    for cluster_id in adata.obs['leiden']
]

# Classification target: one cell-type label per cell.
y = adata.obs['cell_type']

# Hold out 20% of the cells for evaluation, stratified on the cell-type
# labels and seeded so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Baseline model: 100 trees, fixed seed, single worker.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=1,
).fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out cells.
y_pred = rf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

# Make sure the output directory exists before writing: neither ``to_csv``
# nor ``joblib.dump`` creates missing parent directories, so a fresh checkout
# would otherwise fail here after the model has already been trained.
Path('results/machine_learning').mkdir(parents=True, exist_ok=True)

# One row per gene with the forest's feature importance, most important first.
feature_importance_df = pd.DataFrame(
    {'gene': adata.var_names, 'importance': rf.feature_importances_}
).sort_values('importance', ascending=False)
feature_importance_df.to_csv('results/machine_learning/gene_importance.csv', index=False)

# Persist the fitted classifier so it can be reloaded with ``joblib.load``.
joblib.dump(rf, 'results/machine_learning/cell_type_classifier.pkl')

# Horizontal bar chart of the 20 highest-ranked genes from the importance table.
top_genes = feature_importance_df.head(20)
plt.figure(figsize=(12, 8))
plt.barh(range(len(top_genes)), top_genes['importance'])
plt.yticks(range(len(top_genes)), top_genes['gene'].tolist())
plt.xlabel('Importance')
plt.title('Top 20 Important Genes (Biomarkers)')
plt.tight_layout()

# ``savefig`` does not create missing directories; ensure ``figures/`` exists
# so the notebook also runs on a fresh checkout.
Path('figures').mkdir(parents=True, exist_ok=True)
plt.savefig('figures/top_biomarkers.png', dpi=300, bbox_inches='tight')
# Close the figure to release its memory once it has been written to disk.
plt.close()

print('Step 2 complete')