In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_recall_curve, classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE

Define allowed columns

In [3]:
# Columns that preprocess_df mean-imputes and then standardises.
ALLOWED_NUMERIC = [
    "SMOKYRS",
    "PACKSPER",
]

# Integer-coded columns that preprocess_df imputes with the most frequent
# value and then ordinal-encodes.
ALLOWED_CATEGORICAL = [
    "SEX", "HISPANIC", "RACE", "PRIMLANG", "EDUC", "MARISTAT", "HANDED",
    "NACCLIVS", "RESIDENC", "INLIVWTH", "INVISITS", "INCALLS",
    "INRELY", "INEDUC", "INRELTO", "TOBAC30", "TOBAC100",
    "ALCOCCAS", "ALCFREQ", "NACCFAM", "NACCMOM", "NACCDAD",
]

# Columns loaded as Python objects; preprocess_df handles them together with
# the categorical columns.
ALLOWED_STRING = [
    "RACEX",
    "RACESECX",
    "RACETERX",
    "PRIMLANX",
    "INRELTOX",
]

# Full feature list, in the order numeric -> categorical -> string.
ALLOWED_COLUMNS = [*ALLOWED_NUMERIC, *ALLOWED_CATEGORICAL, *ALLOWED_STRING]

# Name of the target column.
LABEL_COL = "DEMENTED"

Preprocessing

In [4]:
def init_preprocessing_objects():
    """Create the four unfitted transformers consumed by ``preprocess_df``.

    Returns:
        tuple: ``(imputer_num, imputer_cat, encoder, scaler)`` where
            * ``imputer_num`` is a mean ``SimpleImputer``,
            * ``imputer_cat`` is a most-frequent ``SimpleImputer``,
            * ``encoder`` is an ``OrdinalEncoder`` that maps categories not
              seen during fitting to ``-1``,
            * ``scaler`` is a ``StandardScaler``.
    """
    return (
        SimpleImputer(strategy='mean'),
        SimpleImputer(strategy='most_frequent'),
        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        StandardScaler(),
    )

def preprocess_df(df, imputer_num, imputer_cat, encoder, scaler, is_train=True):
    """Impute, encode and scale the allowed feature columns of ``df``.

    Steps, in order:
      1. keep only ``ALLOWED_COLUMNS`` (on a copy, the input is not mutated);
      2. impute the numeric columns with ``imputer_num``;
      3. impute the categorical + string columns with ``imputer_cat`` and
         ordinal-encode them with ``encoder``;
      4. scale the numeric columns with ``scaler``;
      5. cast everything to ``float32``.

    Args:
        df: DataFrame containing at least every column in ``ALLOWED_COLUMNS``.
            Its index may be arbitrary (e.g. have gaps after ``dropna``).
        imputer_num: transformer applied to ``ALLOWED_NUMERIC``.
        imputer_cat: transformer applied to the categorical + string columns.
        encoder: transformer turning the imputed categorical values into codes.
        scaler: transformer applied to the imputed numeric columns.
        is_train: when True every transformer is fitted (``fit_transform``);
            when False the already-fitted transformers are reused
            (``transform``).

    Returns:
        A new ``float32`` DataFrame with columns ``ALLOWED_COLUMNS`` and the
        same index as ``df``.
    """
    def _apply(transformer, frame):
        # Fit only on training data; otherwise reuse the fitted state.
        if is_train:
            return transformer.fit_transform(frame)
        return transformer.transform(frame)

    df = df[ALLOWED_COLUMNS].copy()
    cat_cols = ALLOWED_CATEGORICAL + ALLOWED_STRING

    # Numeric: impute missing values.
    df[ALLOWED_NUMERIC] = _apply(imputer_num, df[ALLOWED_NUMERIC])

    # Categorical + string: impute, then encode.
    df[cat_cols] = _apply(imputer_cat, df[cat_cols])
    encoded = _apply(encoder, df[cat_cols])
    # The encoded array is positional. Build the frame on df's own index so
    # the assignment cannot be re-aligned against a fresh 0..n-1 RangeIndex,
    # which would yield NaNs / shifted rows whenever df's index has gaps.
    df[cat_cols] = pd.DataFrame(encoded, columns=cat_cols, index=df.index)

    # Numeric: scale after imputation.
    df[ALLOWED_NUMERIC] = _apply(scaler, df[ALLOWED_NUMERIC])

    return df.astype(np.float32)


Load data

In [5]:
# Location of the raw CSV (Colab path); change here if the file lives elsewhere.
DATA_PATH = "/content/Dementia Prediction Dataset.csv"

# Explicit dtypes for the feature columns: integer codes for categoricals,
# float32 for numerics, Python objects for the string columns.
dtypes = {
    **{col: "int32" for col in ALLOWED_CATEGORICAL},
    **{col: "float32" for col in ALLOWED_NUMERIC},
    **{col: "object" for col in ALLOWED_STRING},
}

df = pd.read_csv(DATA_PATH, dtype=dtypes)
# Drop rows without a label, then rebuild a contiguous 0..n-1 index.
# preprocess_df writes positional arrays back into the frame, so gaps left
# in the index by dropna would misalign features and labels.
df = df.dropna(subset=[LABEL_COL]).reset_index(drop=True)

imputer_num, imputer_cat, encoder, scaler = init_preprocessing_objects()
X = preprocess_df(df, imputer_num, imputer_cat, encoder, scaler, is_train=True)
y = df[LABEL_COL].values

# Features and labels must describe the same rows.
assert len(X) == len(y), "feature matrix and label vector differ in length"

  df = pd.read_csv("/content/Dementia Prediction Dataset.csv", dtype=dtypes)


Train/test split

In [6]:
# Stratified 80/20 hold-out split on the preprocessed feature matrix.
# NOTE(review): the imputers, encoder and scaler were fitted on the full
# dataset before this split, so test rows contributed to those statistics.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X.values, y, **split_options)

Handle class imbalance with SMOTE

In [7]:
# Oversample the minority class on the training split only; the test split
# keeps its original class balance.
# NOTE(review): SMOTE synthesises rows by interpolating between neighbours,
# and most features here are ordinal category codes - confirm that is intended.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X=X_train, y=y_train)

Train XGBoost

In [8]:
# Number of boosting iterations (no early stopping is used).
NUM_BOOST_ROUND = 300

# Resampled data for fitting, untouched test split for scoring.
dtrain = xgb.DMatrix(data=X_train_res, label=y_train_res)
dtest = xgb.DMatrix(data=X_test, label=y_test)

params = {
    # Task
    "objective": "binary:logistic",
    "eval_metric": "auc",
    # Tree growth
    "eta": 0.03,             # slower learning rate
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    # Regularisation
    "gamma": 1,               # minimum loss reduction to make a split
    "alpha": 1,               # L1 regularization
    "lambda": 1,              # L2 regularization
    # Reproducibility
    "seed": 42
}

model = xgb.train(params=params, dtrain=dtrain, num_boost_round=NUM_BOOST_ROUND)


Predict probabilities

In [9]:
# With the binary:logistic objective the booster returns one probability per test row.
y_pred_prob = model.predict(data=dtest)

Optimize threshold

In [10]:
# Sweep every candidate threshold and keep the one with the highest F1.
# NOTE(review): the threshold is selected on the same test set that is scored
# in the evaluation cell, so the reported metrics are optimistic.
precisions, recalls, thresholds = precision_recall_curve(y_test, y_pred_prob)

pr_product = 2 * (precisions * recalls)
pr_sum = precisions + recalls + 1e-8  # epsilon avoids division by zero
f1_scores = pr_product / pr_sum

best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]

# Hard 0/1 labels at the selected threshold.
is_positive = y_pred_prob >= best_threshold
y_pred_opt = is_positive.astype(int)

Evaluation

In [11]:
# (caption, value) pairs, printed in order with print's default separator.
report_rows = [
    ("Best threshold for max F1:", best_threshold),
    ("F1 at best threshold:", f1_scores[best_idx]),
    ("Recall at best threshold:", recalls[best_idx]),
    ("Precision at best threshold:", precisions[best_idx]),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_opt)),
    ("Classification Report:\n", classification_report(y_test, y_pred_opt)),
    ("ROC AUC:", roc_auc_score(y_test, y_pred_prob)),
]
for row_caption, row_value in report_rows:
    print(row_caption, row_value)

Best threshold for max F1: 0.36719105
F1 at best threshold: 0.6361746313307775
Recall at best threshold: 0.7727272727272727
Precision at best threshold: 0.5406360424028268
Confusion Matrix:
 [[243 130]
 [ 45 153]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.65      0.74       373
         1.0       0.54      0.77      0.64       198

    accuracy                           0.69       571
   macro avg       0.69      0.71      0.69       571
weighted avg       0.74      0.69      0.70       571

ROC AUC: 0.7741422265550952


Output dementia risk %

In [12]:
# One row per test sample: raw probability, the same value as a percentage,
# and the 0/1 decision at the selected threshold.
results = (
    pd.DataFrame({"Predicted_Probability": y_pred_prob})
    .assign(
        Predicted_Risk_Percent=y_pred_prob * 100,
        Binary_Prediction=y_pred_opt,
    )
)

print(results.head())

   Predicted_Probability  Predicted_Risk_Percent  Binary_Prediction
0               0.061861                6.186066                  0
1               0.407330               40.732964                  1
2               0.404099               40.409931                  1
3               0.034701                3.470117                  0
4               0.265847               26.584738                  0
