In [2]:
import pandas as pd
from pathlib import Path

clinical_path = Path("C:/Users/Negar/Desktop/paper_results/Myself/cr_coad_project/data/raw/tcga/tcga_clinical_manifest.csv")
df = pd.read_csv(clinical_path)
df.head()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 633 entries, 0 to 632
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   submitter_id       633 non-null    object 
 1   case_id            633 non-null    object 
 2   primary_diagnosis  633 non-null    object 
 3   gender             633 non-null    object 
 4   age_at_diagnosis   633 non-null    object 
 5   tumor_stage        633 non-null    object 
 6   metastasis_status  522 non-null    float64
 7   vital_status       633 non-null    object 
 8   days_to_death      633 non-null    object 
dtypes: float64(1), object(8)
memory usage: 44.6+ KB


In [3]:
keep_cols = ["gender", "age_at_diagnosis", "tumor_stage", "metastasis_status"]
df = df[keep_cols].copy()

# Convert categorical → numeric
df["gender"] = df["gender"].map({"male": 0, "female": 1})
stage_map = {
    "Stage I": 1, "Stage II": 2, "Stage IIA": 2, "Stage IIB": 2,
    "Stage III": 3, "Stage IIIC": 3, "Stage IV": 4
}
df["tumor_stage"] = df["tumor_stage"].map(stage_map)
df["tumor_stage"].fillna(df["tumor_stage"].median(), inplace=True)

# Target
df["metastasis_status"] = df["metastasis_status"].fillna(0).astype(int)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["tumor_stage"].fillna(df["tumor_stage"].median(), inplace=True)


In [4]:
out_path = Path("C:/Users/Negar/Desktop/paper_results/Myself/cr_coad_project/data/processed/clinical/clinical_features.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)
print("✅ Saved:", out_path)


✅ Saved: C:\Users\Negar\Desktop\paper_results\Myself\cr_coad_project\data\processed\clinical\clinical_features.csv


In [5]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=["metastasis_status"])
y = df["metastasis_status"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)


In [7]:
# Convert numeric-looking columns to float
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        try:
            X_train[col] = X_train[col].astype(float)
            X_test[col] = X_test[col].astype(float)
        except:
            # If conversion fails, encode as category codes
            X_train[col] = X_train[col].astype('category').cat.codes
            X_test[col] = X_test[col].astype('category').cat.codes

In [15]:
# Fix for age_at_diagnosis
X_train["age_at_diagnosis"] = pd.to_numeric(X_train["age_at_diagnosis"], errors='coerce')
X_test["age_at_diagnosis"] = pd.to_numeric(X_test["age_at_diagnosis"], errors='coerce')

# Fill any NaNs (from conversion errors) with median
X_train["age_at_diagnosis"].fillna(X_train["age_at_diagnosis"].median(), inplace=True)
X_test["age_at_diagnosis"].fillna(X_train["age_at_diagnosis"].median(), inplace=True)


# Handle missing gender values (replace NaN with mode)
X_train["gender"].fillna(X_train["gender"].mode()[0], inplace=True)
X_test["gender"].fillna(X_train["gender"].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train["age_at_diagnosis"].fillna(X_train["age_at_diagnosis"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test["age_at_diagnosis"].fillna(X_train["age_at_diagnosis"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method

In [16]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix, classification_report

xgb = XGBClassifier(
    n_estimators=200, max_depth=3, learning_rate=0.05, subsample=0.8, random_state=42)
xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)
y_prob = xgb.predict_proba(X_test)[:,1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_prob))
print("F1:", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9763779527559056
AUC: 0.9298245614035088
F1: 0.8695652173913043
[[114   0]
 [  3  10]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       114
           1       1.00      0.77      0.87        13

    accuracy                           0.98       127
   macro avg       0.99      0.88      0.93       127
weighted avg       0.98      0.98      0.97       127



In [11]:
# Handle missing gender values (replace NaN with mode)
X_train["gender"].fillna(X_train["gender"].mode()[0], inplace=True)
X_test["gender"].fillna(X_train["gender"].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train["gender"].fillna(X_train["gender"].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test["gender"].fillna(X_train["gender"].mode()[0], inplace=True)


In [None]:
X_train.dtypes
# Detect non-numeric entries in numeric-looking columns
for col in X_train.columns:
    bad_values = X_train[pd.to_numeric(X_train[col], errors='coerce').isna()][col].unique()
    if len(bad_values) > 0:
        print(f"⚠️ Column '{col}' has non-numeric values:", bad_values)

gender              float64
age_at_diagnosis      int16
tumor_stage         float64
dtype: object

In [19]:
# Explainable Boosting Machine
from interpret.glassbox import ExplainableBoostingClassifier

ebm = ExplainableBoostingClassifier(random_state=42)
ebm.fit(X_train, y_train)
print("EBM AUC:", roc_auc_score(y_test, ebm.predict_proba(X_test)[:,1]))


EBM AUC: 0.9278002699055331


In [None]:
# SAVE RESULTS
import os
os.chdir(r"C:\Users\Negar\Desktop\paper_results\Myself\cr_coad_project")
print("✅ Working directory:", os.getcwd())

Path("results/clinical").mkdir(parents=True, exist_ok=True)
pd.DataFrame({
    "model": ["xgboost"],
    "accuracy": [accuracy_score(y_test, y_pred)],
    "auc": [roc_auc_score(y_test, y_prob)],
    "f1": [f1_score(y_test, y_pred)]
}).to_csv("results/clinical/eval_metrics.csv", index=False)


✅ Working directory: C:\Users\Negar\Desktop\paper_results\Myself\cr_coad_project
