# 04 - Model Training (Gender-Specific XGBoost)
## Osteoporosis Risk Prediction Model
**DSGP Group 40** | Student: Isum Gamage (ID: 20242052)

This notebook trains gender-specific XGBoost models for male and female cohorts.

## Step 1: Install and Import Required Libraries

In [None]:
!pip install xgboost scikit-learn pandas numpy matplotlib seaborn shap joblib --upgrade

import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

print("✓ All libraries imported successfully!")
print(f"XGBoost version: {xgb.__version__}")
print(f"Pandas version: {pd.__version__}")

## Step 2: Load Preprocessed Data
Load the cleaned dataset prepared in the previous notebooks. Categorical encoding and scaling are applied in Step 3 below.

In [None]:
# Colab-only helper: opens a browser file picker and returns {filename: bytes}.
from google.colab import files
print("Upload your preprocessed dataset (osteoporosis_cleaned_reorganized.csv):")
uploaded = files.upload()

# A cancelled upload yields an empty dict; fail with a clear message instead of
# an opaque IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the dataset CSV.")

# Only the first uploaded file is used.
filename = next(iter(uploaded))
df = pd.read_csv(filename)

# Every later step depends on the target column, so check for it up front.
if 'Osteoporosis' not in df.columns:
    raise KeyError(f"Target column 'Osteoporosis' not found in {filename}")

print(f"\n✓ Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"\nColumn names:\n{df.columns.tolist()}")
print(f"\nTarget variable distribution:")
print(df['Osteoporosis'].value_counts())

## Step 3: Data Preprocessing (Feature Engineering)
Fill missing values, encode the binary and multi-category features, and standardize `Age`.

In [None]:
# Work on a copy so the raw frame `df` stays untouched and this cell can be
# re-run safely.
df_processed = df.copy()

print("Step 3.1: Handling Missing Values")
print("-" * 60)

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained form is deprecated in pandas 2.x and does not
# modify the frame under Copy-on-Write, so the missing values would remain.
for col in ['Alcohol Consumption', 'Medical Conditions', 'Medications']:
    if col in df_processed.columns:
        df_processed[col] = df_processed[col].fillna('None')
        print(f"✓ {col}: filled missing values")

print(f"✓ Remaining missing values: {df_processed.isnull().sum().sum()}")

print("\nStep 3.2: Binary Feature Encoding")
print("-" * 60)

# Label -> 0/1 mapping for each two-level feature.
binary_encoding = {
    'Gender': {'Male': 0, 'Female': 1},
    'Hormonal Changes': {'Normal': 0, 'Post-menopausal': 1},
    'Body Weight': {'Normal': 0, 'Underweight': 1},
    'Calcium Intake': {'Adequate': 0, 'Low': 1},
    'Vitamin D': {'Sufficient': 0, 'Insufficient': 1},
    'Physical Activity': {'Active': 0, 'Sedentary': 1},
    'Smoking': {'No': 0, 'Yes': 1},
    'Prior Fractures': {'No': 0, 'Yes': 1},
    'Family History': {'No': 0, 'Yes': 1},
}

for col, mapping in binary_encoding.items():
    if col in df_processed.columns:
        original_values = df_processed[col]
        encoded_values = original_values.map(mapping)
        # Series.map turns any label missing from the mapping into NaN without
        # raising; surface that so a spelling mismatch is not silently lost.
        unmapped = original_values[original_values.notna() & encoded_values.isna()]
        if not unmapped.empty:
            print(f"⚠ {col}: {len(unmapped)} value(s) not in mapping became NaN, e.g. {unmapped.unique().tolist()[:5]}")
        df_processed[col] = encoded_values
        print(f"✓ {col}: encoded")

print("\nStep 3.3: Multi-Category Feature Encoding")
print("-" * 60)

categorical_cols = ['Race/Ethnicity', 'Alcohol Consumption', 'Medical Conditions', 'Medications']

for col in categorical_cols:
    if col in df_processed.columns:
        # Keep every level (drop_first=False); dummy columns are appended and
        # the original column is removed.
        encoded = pd.get_dummies(df_processed[col], prefix=col, drop_first=False)
        df_processed = pd.concat([df_processed, encoded], axis=1).drop(columns=col)
        print(f"✓ {col}: one-hot encoded")

print("\nStep 3.4: Feature Scaling")
print("-" * 60)

# NOTE(review): the scaler is fit on all rows before the train/test split, so
# test-set statistics feed into the transform; consider fitting on training
# rows only. If 'Age' is absent the scaler is left unfitted.
scaler = StandardScaler()
if 'Age' in df_processed.columns:
    df_processed['Age'] = scaler.fit_transform(df_processed[['Age']])
    print(f"✓ Age: standardized")

print(f"✓ Feature engineering complete!\nFinal dataset shape: {df_processed.shape}")

## Step 4: Separate Data by Gender

In [None]:
print("Step 4: Gender-Specific Data Separation")
print("=" * 60)

# Split the processed frame into the feature matrix and the target vector.
X = df_processed.drop('Osteoporosis', axis=1)
y = df_processed['Osteoporosis']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Cohort masks compare against the 0/1 codes used for Gender (0 = male, 1 = female).
male_mask = X['Gender'] == 0
female_mask = X['Gender'] == 1

# Rows whose Gender is neither 0 nor 1 (e.g. NaN) fall into neither cohort;
# report them rather than dropping them silently.
unassigned = len(X) - int(male_mask.sum()) - int(female_mask.sum())
if unassigned:
    print(f"⚠ {unassigned} row(s) have a Gender value other than 0/1 and are excluded from both cohorts")

# Copies keep the cohort frames independent of X and y.
X_male = X[male_mask].copy()
y_male = y[male_mask].copy()

X_female = X[female_mask].copy()
y_female = y[female_mask].copy()

print(f"\n✓ Male cohort: {X_male.shape[0]} samples")
print(f"  Risk distribution: {y_male.value_counts().to_dict()}")

print(f"✓ Female cohort: {X_female.shape[0]} samples")
print(f"  Risk distribution: {y_female.value_counts().to_dict()}")

## Step 5: Train-Test Split (80-20 Stratified)

In [None]:
print("Step 5: Train-Test Split (80-20 Stratified)")
print("=" * 60)

# Each cohort is split separately: 20% held out, stratified on the target so
# class proportions match in train and test, with a fixed seed for repeatability.
X_train_male, X_test_male, y_train_male, y_test_male = train_test_split(
    X_male, y_male, test_size=0.2, stratify=y_male, random_state=42
)
X_train_female, X_test_female, y_train_female, y_test_female = train_test_split(
    X_female, y_female, test_size=0.2, stratify=y_female, random_state=42
)

print(f"\nMALE MODEL:")
print(f"  Train: {X_train_male.shape[0]} samples")
print(f"  Test: {X_test_male.shape[0]} samples")

print(f"\nFEMALE MODEL:")
print(f"  Train: {X_train_female.shape[0]} samples")
print(f"  Test: {X_test_female.shape[0]} samples")

print(f"✓ Data split complete!")

## Step 6: XGBoost Hyperparameters Configuration

In [None]:
print("Step 6: XGBoost Configuration")
print("=" * 60)

# Shared hyperparameters, unpacked into both XGBClassifier constructors below.
xgb_params = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'learning_rate': 0.05,
    'n_estimators': 150,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 3,
    'gamma': 0.1,
    'random_state': 42,
    'verbosity': 0,
    'eval_metric': 'logloss',
}

# Echo the configuration so the run log records the exact settings used.
print("XGBoost Parameters:")
for param_name in xgb_params:
    print(f"  {param_name}: {xgb_params[param_name]}")

## Step 7: Train Male XGBoost Model

In [None]:
print("\nStep 7: Training Male XGBoost Model")
print("=" * 60)
# The trailing "\n" leaves a blank line after the sample count.
print(f"Training samples: {X_train_male.shape[0]}\n")

# Fit the male-cohort classifier with the shared hyperparameters.
male_model = xgb.XGBClassifier(**xgb_params, enable_categorical=True)
# eval_set lists the train and test splits for metric tracking; no early
# stopping is configured in this notebook.
male_model.fit(X_train_male, y_train_male, eval_set=[(X_train_male, y_train_male), (X_test_male, y_test_male)], verbose=False)

# Hard class predictions plus the positive-class probability (column 1).
y_pred_male = male_model.predict(X_test_male)
y_pred_proba_male = male_model.predict_proba(X_test_male)[:, 1]

print(f"✓ Male model training complete!")
print(f"Predictions generated for {len(y_pred_male)} male test samples")

## Step 8: Train Female XGBoost Model

In [None]:
print("\nStep 8: Training Female XGBoost Model")
print("=" * 60)
# The trailing "\n" leaves a blank line after the sample count.
print(f"Training samples: {X_train_female.shape[0]}\n")

# Fit the female-cohort classifier with the shared hyperparameters.
female_model = xgb.XGBClassifier(**xgb_params, enable_categorical=True)
# eval_set lists the train and test splits for metric tracking; no early
# stopping is configured in this notebook.
female_model.fit(X_train_female, y_train_female, eval_set=[(X_train_female, y_train_female), (X_test_female, y_test_female)], verbose=False)

# Hard class predictions plus the positive-class probability (column 1).
y_pred_female = female_model.predict(X_test_female)
y_pred_proba_female = female_model.predict_proba(X_test_female)[:, 1]

print(f"✓ Female model training complete!")
print(f"Predictions generated for {len(y_pred_female)} female test samples")

## Step 9: Save Models

In [None]:
print("\nStep 9: Model Serialization")
print("=" * 60)

# Persist both cohort models and the Age scaler to the working directory.
# NOTE(review): the scaler is only fitted when an 'Age' column was present
# during preprocessing; otherwise an unfitted scaler is written here.
joblib.dump(male_model, 'osteoporosis_male_model.pkl')
joblib.dump(female_model, 'osteoporosis_female_model.pkl')
joblib.dump(scaler, 'age_scaler.pkl')

print("✓ Male model saved")
print("✓ Female model saved")
print("✓ Scaler saved")
print("✓ Models ready for deployment!")

## Summary
✅ Model Training Complete!
- ✓ Data preprocessed
- ✓ Gender-specific separation
- ✓ Male XGBoost model trained
- ✓ Female XGBoost model trained
- ✓ Models serialized and ready

**Next Steps:** Run `05_Model_Evaluation.ipynb`.