In [24]:
# ============================================================
# Fair Clinical Model (no tumor_stage)
# ============================================================

import os
# Run from the project root so that the relative data/ and results/ paths used by
# the later cells resolve. Set the CR_COAD_PROJECT_DIR environment variable to run
# the notebook on another machine; when it is unset (or empty) the original
# location is used, so existing behaviour is unchanged.
PROJECT_DIR = (
    os.environ.get("CR_COAD_PROJECT_DIR")
    or r"C:\Users\Negar\Desktop\paper_results\Myself\cr_coad_project"
)
os.chdir(PROJECT_DIR)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report
import joblib

# Opt in to the upcoming pandas behaviour: object-dtype results of `replace`,
# `fillna`, `where`, ... are no longer silently downcast, and the related
# FutureWarning is not emitted. Dtype conversions therefore have to be explicit.
pd.set_option('future.no_silent_downcasting', True)

In [25]:
# ------------------------------------------------------------
# Load dataset
# ------------------------------------------------------------
clinical_path = "data/processed/clinical/clinical_features_with_id.csv"
clinical_raw = pd.read_csv(clinical_path)
print(f"✅ Loaded {len(clinical_raw)} patients from {clinical_path}")

# Fail early, with a readable message, if a column used below is absent.
required_cols = ["metastasis_status", "gender", "age_at_diagnosis"]
missing_cols = [col for col in required_cols if col not in clinical_raw.columns]
if missing_cols:
    raise KeyError(f"{clinical_path} is missing required column(s): {missing_cols}")

# Keep only patients with metastasis labels. Working on an explicit copy leaves the
# raw frame untouched and avoids chained-assignment warnings on the writes below.
df = clinical_raw.dropna(subset=["metastasis_status"]).copy()
df["metastasis_status"] = df["metastasis_status"].astype(int)
print(f"📊 Patients with valid labels: {len(df)}")

# Encode categorical variables: male -> 0, female -> 1. "unknown" and any other
# non-numeric entry end up as NaN (via `to_numeric(..., errors="coerce")`) and are
# imputed in the next step.
df["gender"] = df["gender"].replace({"male": 0, "female": 1, "unknown": np.nan})
df["gender"] = pd.to_numeric(df["gender"], errors="coerce")
df["age_at_diagnosis"] = pd.to_numeric(df["age_at_diagnosis"], errors="coerce")

# A feature without a single observed value cannot be imputed (the median is NaN and
# the mode is empty), so stop here instead of failing later with an opaque error.
for col in ("age_at_diagnosis", "gender"):
    if not df[col].notna().any():
        raise ValueError(f"Column '{col}' has no observed values; cannot impute it.")

# Fill missing values: median age, most frequent gender.
# NOTE(review): both statistics are computed on all labelled patients, i.e. before the
# train/val/test split, so held-out rows influence the imputed values. Moving the
# imputation into the modelling Pipeline would remove this (small) leakage.
df = df.assign(
    age_at_diagnosis=df["age_at_diagnosis"].fillna(df["age_at_diagnosis"].median()),
    gender=df["gender"].fillna(df["gender"].mode().iloc[0]),
)


✅ Loaded 633 patients from data/processed/clinical/clinical_features_with_id.csv
📊 Patients with valid labels: 522


In [26]:
# ------------------------------------------------------------
# Define features (no tumor_stage)
# ------------------------------------------------------------
# Tumor stage is deliberately left out; age and gender are the only predictors.
num_cols, cat_cols = ["age_at_diagnosis"], ["gender"]

# Feature matrix (numeric columns first, then categorical) and the binary target.
X = df.loc[:, [*num_cols, *cat_cols]]
y = df.loc[:, "metastasis_status"]


In [27]:
# ------------------------------------------------------------
# Split: train / val / test (70 / 15 / 15)
# ------------------------------------------------------------
# First hold out 30 % of the patients, then halve that hold-out into validation and
# test. Both splits are stratified on the label so every part keeps the class ratio,
# and both use a fixed seed so the partition is reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Sanity check: the three parts together must account for every patient.
assert len(X_train) + len(X_val) + len(X_test) == len(X), "split sizes do not add up to the cohort size"
print(f"🧠 Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")


🧠 Train: 365, Val: 78, Test: 79


In [28]:
# ------------------------------------------------------------
# Define and train model
# ------------------------------------------------------------
# Standardise the features, then fit a 200-tree random forest with a fixed seed.
# The step names ("scaler", "clf") are looked up by later cells, so keep them.
# NOTE(review): tree ensembles are generally insensitive to feature scaling; the
# scaler is presumably kept for parity with other baselines - confirm.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", RandomForestClassifier(n_estimators=200, random_state=42)),
    ]
)
# Handle on the classifier step (the very object stored inside the pipeline).
rf = pipeline.named_steps["clf"]

pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('scaler', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [29]:
# ------------------------------------------------------------
# Evaluate on validation + test
# ------------------------------------------------------------
# One record per split is collected so the numbers remain available after the loop
# (previously they were only printed and then overwritten by the next iteration).
metric_rows = []
for name, X_split, y_split in [("Validation", X_val, y_val), ("Test", X_test, y_test)]:
    y_pred = pipeline.predict(X_split)
    # Column 1 of predict_proba belongs to the second entry of `classes_`,
    # i.e. label 1 when the labels are 0/1.
    y_prob = pipeline.predict_proba(X_split)[:, 1]
    acc = accuracy_score(y_split, y_pred)
    auc = roc_auc_score(y_split, y_prob)
    metric_rows.append({"split": name, "n": len(y_split), "accuracy": acc, "auc": auc})

    print(f"\n📊 {name} Set:")
    print(f"Accuracy: {acc:.3f} | AUC: {auc:.3f}")
    # Rows are the true classes, columns the predicted classes.
    print(confusion_matrix(y_split, y_pred))
    # zero_division=0 reports 0.0 (the same value as the default) without emitting an
    # UndefinedMetricWarning when a class is never predicted.
    print(classification_report(y_split, y_pred, zero_division=0))

# Tidy per-split summary (one row per split) for later inspection or export.
split_metrics = pd.DataFrame(metric_rows)



📊 Validation Set:
Accuracy: 0.782 | AUC: 0.486
[[59  9]
 [ 8  2]]
              precision    recall  f1-score   support

           0       0.88      0.87      0.87        68
           1       0.18      0.20      0.19        10

    accuracy                           0.78        78
   macro avg       0.53      0.53      0.53        78
weighted avg       0.79      0.78      0.79        78


📊 Test Set:
Accuracy: 0.810 | AUC: 0.362
[[64  5]
 [10  0]]
              precision    recall  f1-score   support

           0       0.86      0.93      0.90        69
           1       0.00      0.00      0.00        10

    accuracy                           0.81        79
   macro avg       0.43      0.46      0.45        79
weighted avg       0.76      0.81      0.78        79



In [30]:
# ------------------------------------------------------------
# Feature Importance
# ------------------------------------------------------------
# Importances come from the fitted forest inside the pipeline.
clf = pipeline.named_steps["clf"]
importances = clf.feature_importances_

# Take the names from the frame the model was fitted on, so that each importance is
# paired with the column that actually occupied that position during training.
feature_names = list(X_train.columns)
fi_df = pd.DataFrame({"feature": feature_names, "importance": importances})
print("\n🌟 Feature Importance:")
print(fi_df.to_string(index=False))



🌟 Feature Importance:
         feature  importance
age_at_diagnosis    0.984783
          gender    0.015217


In [31]:
# ------------------------------------------------------------
# Cross-validation
# ------------------------------------------------------------
# 5-fold ROC-AUC over all labelled patients. cross_val_score fits clones of the
# pipeline, so the model trained above is left as it is.
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring="roc_auc")
cv_auc = cv_scores.mean()
print(f"\n📈 5-fold Cross-validated AUC: {cv_auc:.3f}")
# The mean alone hides how much the folds disagree, so report the spread as well.
print(f"Fold AUCs: {np.round(cv_scores, 3).tolist()} (std {cv_scores.std():.3f})")



📈 5-fold Cross-validated AUC: 0.445


In [32]:
# ------------------------------------------------------------
# Save model and results
# ------------------------------------------------------------
# All artefacts go to one folder (relative to the working directory), created on demand.
out_dir = "results/clinical"
os.makedirs(out_dir, exist_ok=True)

model_path = os.path.join(out_dir, "clinical_baseline_rf_no_stage.pkl")
metrics_path = os.path.join(out_dir, "clinical_baseline_metrics_no_stage.csv")
fi_path = os.path.join(out_dir, "clinical_feature_importance_no_stage.csv")

# The fitted pipeline is stored as a pickle via joblib: only load this file from a
# trusted location, since unpickling can execute arbitrary code.
joblib.dump(pipeline, model_path)
# Single-row table holding the mean cross-validated AUC.
pd.DataFrame([{"cv_auc": cv_auc}]).to_csv(metrics_path, index=False)
fi_df.to_csv(fi_path, index=False)

print(f"\n💾 Model saved to: {model_path}")
print(f"💾 Metrics saved to: {metrics_path}")
print(f"💾 Feature importance saved to: {fi_path}")

print("\n🎯 FINAL SUMMARY")
print(f"Patients used: {len(df)}")
print(f"Features: {num_cols + cat_cols}")
# State the headline performance figure alongside the cohort description.
print(f"5-fold CV AUC: {cv_auc:.3f}")


💾 Model saved to: results/clinical\clinical_baseline_rf_no_stage.pkl
💾 Metrics saved to: results/clinical\clinical_baseline_metrics_no_stage.csv
💾 Feature importance saved to: results/clinical\clinical_feature_importance_no_stage.csv

🎯 FINAL SUMMARY
Patients used: 522
Features: ['age_at_diagnosis', 'gender']
