In [2]:
# Core Libraries
import pandas as pd
import numpy as np
import shap
import joblib
import warnings 

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score)
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.calibration import calibration_curve
from sklearn.feature_selection import RFE

from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

warnings.filterwarnings("ignore")

In [3]:
# Load dataset
df = pd.read_csv("../dataset/dataset.csv")
print(f"✅ Dataset Loaded Successfully. Shape: {df.shape}")

# Preview dataset
df.head()

✅ Dataset Loaded Successfully. Shape: (383, 17)


Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


In [4]:
# Check for nulls
print("\nMissing Values per Column:")
print(df.isnull().sum())

# Summary statistics
df.describe(include='all').T


Missing Values per Column:
Age                     0
Gender                  0
Smoking                 0
Hx Smoking              0
Hx Radiothreapy         0
Thyroid Function        0
Physical Examination    0
Adenopathy              0
Pathology               0
Focality                0
Risk                    0
T                       0
N                       0
M                       0
Stage                   0
Response                0
Recurred                0
dtype: int64


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Age,383.0,,,,40.866841,15.134494,15.0,29.0,37.0,51.0,82.0
Gender,383.0,2.0,F,312.0,,,,,,,
Smoking,383.0,2.0,No,334.0,,,,,,,
Hx Smoking,383.0,2.0,No,355.0,,,,,,,
Hx Radiothreapy,383.0,2.0,No,376.0,,,,,,,
Thyroid Function,383.0,5.0,Euthyroid,332.0,,,,,,,
Physical Examination,383.0,5.0,Multinodular goiter,140.0,,,,,,,
Adenopathy,383.0,6.0,No,277.0,,,,,,,
Pathology,383.0,4.0,Papillary,287.0,,,,,,,
Focality,383.0,2.0,Uni-Focal,247.0,,,,,,,


In [5]:
df['HighRisk'] = df.apply(
    lambda row: 1 if row['Stage'] in ['IVA', 'IVB'] or row['M'] == 'M1' else 0, axis=1
)
print("\nHighRisk distribution:")
print(df["HighRisk"].value_counts(normalize=True))


HighRisk distribution:
HighRisk
0    0.942559
1    0.057441
Name: proportion, dtype: float64


In [6]:
# Encode all object columns
le_dict = {}
for col in df.select_dtypes(include="object").columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    le_dict[col] = le

In [8]:
# Input Features and Target
predictors = [
    'Age', 'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy',
    'Thyroid Function', 'Physical Examination', 'Adenopathy',
    'Pathology', 'Focality', 'Risk', 'T', 'N', 'M',
    'Stage', 'Response'
]

X = df[predictors]
y = df['Recurred']

# Handle class imbalance with SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

print(f"\nAfter SMOTE: {X_res.shape}, target distribution:")
print(pd.Series(y_res).value_counts(normalize=True))

print("X columns:", X.shape[1])
print("y name:", y.name)


After SMOTE: (550, 16), target distribution:
Recurred
0    0.5
1    0.5
Name: proportion, dtype: float64
X columns: 16
y name: Recurred


In [9]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42
)

print(f"\nTrain shape: {X_train.shape}, Test shape: {X_test.shape}")


Train shape: (440, 16), Test shape: (110, 16)


In [10]:
# Training and Evaluation of Models
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(
        use_label_encoder=False, eval_metric="logloss", random_state=42
    ),
    "LightGBM": LGBMClassifier(random_state=42),
}

results = []
trained_models = {}

for name, mdl in models.items():
    mdl.fit(X_train, y_train)
    y_pred = mdl.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    print(f"\n📊 {name} Report:")
    print(classification_report(y_test, y_pred))

    results.append([name, acc, roc_auc])
    trained_models[name] = mdl


📊 RandomForest Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        59
           1       1.00      1.00      1.00        51

    accuracy                           1.00       110
   macro avg       1.00      1.00      1.00       110
weighted avg       1.00      1.00      1.00       110


📊 XGBoost Report:
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        59
           1       0.93      1.00      0.96        51

    accuracy                           0.96       110
   macro avg       0.96      0.97      0.96       110
weighted avg       0.97      0.96      0.96       110

[LightGBM] [Info] Number of positive: 224, number of negative: 216
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000084 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM]

In [12]:
# Comparing Result
results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "ROC_AUC"])
print("\nModel comparison:")
print(results_df.sort_values("ROC_AUC", ascending=False))

best_name = results_df.sort_values("ROC_AUC", ascending=False).iloc[0]["Model"]
best_model = trained_models[best_name]
print("\nBest Model:", best_model)


Model comparison:
          Model  Accuracy   ROC_AUC
0  RandomForest  1.000000  1.000000
2      LightGBM  0.972727  0.974576
1       XGBoost  0.963636  0.966102

Best Model: RandomForestClassifier(random_state=42)


In [13]:
# Saving Model
joblib.dump((best_model, predictors), "../trained_model/thyroid_recurrence_rf.pkl")
print(f"\n✅ Saved best model: {best_name}")


✅ Saved best model: RandomForest
