In [1]:
!ls /kaggle/input/clinical-dataset

thyroid_clean.csv


In [2]:
# src/data_preprocessing/preprocess_clinical.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb

# Location of thyroid_clean.csv inside the Kaggle input mount.
data_path = "/kaggle/input/clinical-dataset/thyroid_clean.csv"

# Read the entire CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=data_path)

# Sanity check: report (rows, columns) and preview the first five records.
print("Dataset shape:", df.shape)
print(df.head(n=5))

# Label column. "mal" encodes malignancy (0: benign, 1: malignant).
target = "mal"

# Candidate predictors: every column of df other than the row identifier
# ("id") and the label itself, kept in their original column order.
features = [name for name in df.columns if name != "id" and name != target]

# Columns handled as categorical (one-hot encoded downstream). The list is
# maintained by hand; anything in `features` that is not named here is
# treated as numerical.
# NOTE(review): "echo_strength" is not listed, so it ends up in
# numerical_features — confirm that this is intended.
categorical_features = [
    "gender",
    "site",
    "echo_pattern",
    "multifocality",
    "shape",
    "margin",
    "calcification",
    "blood_flow",
    "composition",
    "multilateral",
]

# Remaining predictors, in the same order as they appear in `features`.
numerical_features = [name for name in features if name not in categorical_features]

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

# Per-type preprocessing steps:
#   * numerical columns are standardised with StandardScaler;
#   * categorical columns are one-hot encoded, and handle_unknown="ignore"
#     keeps transform() from raising on categories not seen during fit.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Route each column group to its transformer. The step names "num" and "cat"
# become part of the parameter path if these steps are ever tuned.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# End-to-end estimator: preprocessing followed by a gradient-boosted binary
# classifier. Keeping the preprocessing inside the Pipeline means it is
# re-fitted on each training fold during cross-validation.
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases
# and may only produce an "unused parameter" warning — confirm against the
# installed version before removing it.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "classifier",
            xgb.XGBClassifier(
                random_state=42,
                use_label_encoder=False,
                eval_metric="auc",
                objective="binary:logistic",
            ),
        ),
    ]
)

# Separate the predictor matrix from the label vector.
X, y = df[features], df[target]

# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Search space for the XGBoost step. The "classifier__" prefix addresses the
# Pipeline step named "classifier". 3 x 3 x 3 = 27 candidate settings.
param_grid = {
    "classifier__max_depth": [3, 4, 5],
    "classifier__learning_rate": [0.01, 0.05, 0.1],
    "classifier__n_estimators": [50, 100, 150],
}

# Exhaustive 5-fold cross-validated search over the grid, ranked by ROC AUC.
# Only the training partition is used, so the validation rows stay unseen.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation AUC:", grid_search.best_score_)

# Score the held-out validation rows with the fitted search object
# (GridSearchCV forwards predict/predict_proba to its refitted best estimator).
# Column 1 of predict_proba is the probability of the second class in
# classes_ — presumably label 1 (malignant) given the 0/1 coding of "mal".
y_prob = grid_search.predict_proba(X_val)[:, 1]
y_pred = grid_search.predict(X_val)

# Threshold-based accuracy from the hard predictions; ranking-based AUC from
# the predicted probabilities.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
auc = roc_auc_score(y_true=y_val, y_score=y_prob)

print("Validation Accuracy:", accuracy)
print("Validation AUC:", auc)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_val, y_pred=y_pred))
print("Classification Report:\n", classification_report(y_true=y_val, y_pred=y_pred))


Dataset shape: (1232, 20)
   id  age  gender   FT3    FT4    TSH   TPO   TGAb  site  echo_pattern  \
0   1   46       1  4.34  12.41  1.677  0.43   0.98     0             0   
1   2   61       1  5.40  16.26  2.905  0.45   1.91     0             0   
2   3   44       1  3.93  13.39  1.823  9.15  26.25     0             0   
3   5   29       0  3.70  13.98  1.293  0.15   0.81     0             0   
4   6   37       1  3.60  14.56  0.938  0.13  21.22     0             0   

   multifocality  size  shape  margin  calcification  echo_strength  \
0              0   4.6      0       0              0              4   
1              0   4.2      0       1              1              4   
2              0   0.7      0       1              0              4   
3              1   1.0      1       1              1              4   
4              0   0.7      0       1              1              4   

   blood_flow  composition  mal  multilateral  
0           0            1    1             1  
