# Model Training

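Train gender-specific XGBoost models (one for male patients, one for female patients) using a stratified 80/20 train-test split and a fixed hyperparameter configuration. Cross-validation utilities are imported, but no cross-validation or hyperparameter search is run in this notebook.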

### Step 1: Load Preprocessed Data

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib
import warnings
warnings.filterwarnings('ignore')

# Read the preprocessed dataset from its CSV file
DATA_PATH = 'data/preprocessed_data.csv'
df = pd.read_csv(DATA_PATH)

# Quick sanity report: dimensions, column names, and class counts of the label
print(f'Dataset shape: {df.shape}')
column_names = df.columns.tolist()
print(f'\nFeatures: {column_names}')
print(f'\nTarget distribution:')
target_counts = df['Osteoporosis'].value_counts()
print(target_counts)

### Step 2: Prepare Features and Target

In [None]:
# Model inputs are every column except 'Id' and the target column 'Osteoporosis'
non_feature_cols = {'Id', 'Osteoporosis'}
feature_cols = [name for name in df.columns if name not in non_feature_cols]

X = df[feature_cols]
y = df['Osteoporosis']

print(f'Features (X): {X.shape}')
print(f'Target (y): {y.shape}')

# Numbered (1-based) listing of the selected feature columns
print(f'\nFeature list ({len(feature_cols)} total):')
for position, name in enumerate(feature_cols, start=1):
    print(f'  {position:2d}. {name}')

### Step 3: Separate Data by Gender

In [None]:
banner = '═' * 60
print(banner)
print('GENDER-SPECIFIC DATA SEPARATION')
print(banner)

# Partition rows on the encoded Gender column: 0 -> male subset, 1 -> female subset
is_male = df['Gender'] == 0
is_female = df['Gender'] == 1
df_male = df[is_male]
df_female = df[is_female]

# Share of each subset relative to the full dataset, in percent
total_patients = len(df)
male_share = len(df_male) / total_patients * 100
female_share = len(df_female) / total_patients * 100
print(f'\nMale patients: {len(df_male)} ({male_share:.1f}%)')
print(f'Female patients: {len(df_female)} ({female_share:.1f}%)')

# Feature matrix and label vector for each subset
X_male, y_male = df_male[feature_cols], df_male['Osteoporosis']
X_female, y_female = df_female[feature_cols], df_female['Osteoporosis']

# Per-subset label counts, shown as {label: count}
print(f'\nMale data - Risk distribution: {y_male.value_counts().to_dict()}')
print(f'Female data - Risk distribution: {y_female.value_counts().to_dict()}')

### Step 4: Train-Test Split

In [None]:
divider = '═' * 60
print(divider)
print('TRAIN-TEST SPLIT (80-20)')
print(divider)

# Both subsets use identical settings: 20% held out, fixed seed
split_kwargs = {'test_size': 0.2, 'random_state': 42}

# Each split is stratified on that subset's own label vector
male_parts = train_test_split(X_male, y_male, stratify=y_male, **split_kwargs)
X_train_male, X_test_male, y_train_male, y_test_male = male_parts

female_parts = train_test_split(X_female, y_female, stratify=y_female, **split_kwargs)
X_train_female, X_test_female, y_train_female, y_test_female = female_parts

# Report the resulting sample counts, male first and then female
split_report = (
    ('\nMALE DATA:', X_train_male, X_test_male),
    ('\nFEMALE DATA:', X_train_female, X_test_female),
)
for heading, train_rows, test_rows in split_report:
    print(heading)
    print(f'  Training: {len(train_rows)} samples')
    print(f'  Testing: {len(test_rows)} samples')

### Step 5: Configure XGBoost Models

In [None]:
print('═' * 60)
print('XGBOOST CONFIGURATION')
print('═' * 60)

# Hyperparameter configuration displayed for the XGBoost classifiers.
xgb_params = {
    # Learning task and evaluation metric
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    # Boosting step size and tree shape
    'learning_rate': 0.05,
    'max_depth': 5,
    'min_child_weight': 1,
    # Row / column subsampling per tree
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    # Split penalty and L2 / L1 regularisation
    'gamma': 0.1,
    'reg_lambda': 1.0,
    'reg_alpha': 0.5,
    # Number of boosting rounds and seed
    'n_estimators': 200,
    'random_state': 42,
    # XGBoost's logging-level parameter is named `verbosity` (0 = silent);
    # `verbose` is not a recognised model parameter.
    'verbosity': 0,
}

print('XGBoost Hyperparameters:')
for param, value in xgb_params.items():
    print(f'  {param}: {value}')

### Step 6: Train Male Model

In [None]:
print('═' * 60)
print('TRAINING MALE MODEL')
print('═' * 60)

# Classifier fitted only on the male training subset.
# NOTE(review): these values repeat the hyperparameters listed in the Step 5
# configuration cell; consider constructing the model from that dict to avoid drift.
male_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    reg_lambda=1.0,
    reg_alpha=0.5,
    n_estimators=200,
    random_state=42,
    # `verbosity` is XGBoost's logging-level parameter (0 = silent); the
    # previously used `verbose` keyword is not a recognised model parameter.
    verbosity=0,
)

print('\nTraining male model...')
# verbose=False suppresses per-round evaluation output during fitting
male_model.fit(X_train_male, y_train_male, verbose=False)

print('✓ Male model trained successfully!')

# Class predictions, plus the probability column at index 1 of predict_proba,
# for the held-out male test set
y_pred_male = male_model.predict(X_test_male)
y_pred_proba_male = male_model.predict_proba(X_test_male)[:, 1]

print(f'\nMale Model Predictions Generated')

### Step 7: Train Female Model

In [None]:
print('═' * 60)
print('TRAINING FEMALE MODEL')
print('═' * 60)

# Classifier fitted only on the female training subset.
# NOTE(review): these values repeat the hyperparameters listed in the Step 5
# configuration cell; consider constructing the model from that dict to avoid drift.
female_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    reg_lambda=1.0,
    reg_alpha=0.5,
    n_estimators=200,
    random_state=42,
    # `verbosity` is XGBoost's logging-level parameter (0 = silent); the
    # previously used `verbose` keyword is not a recognised model parameter.
    verbosity=0,
)

print('\nTraining female model...')
# verbose=False suppresses per-round evaluation output during fitting
female_model.fit(X_train_female, y_train_female, verbose=False)

print('✓ Female model trained successfully!')

# Class predictions, plus the probability column at index 1 of predict_proba,
# for the held-out female test set
y_pred_female = female_model.predict(X_test_female)
y_pred_proba_female = female_model.predict_proba(X_test_female)[:, 1]

print(f'\nFemale Model Predictions Generated')

### Step 8: Save Trained Models

In [None]:
import os

print('═' * 60)
print('SAVING TRAINED MODELS')
print('═' * 60)

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs('models', exist_ok=True)

# Save male model
joblib.dump(male_model, 'models/osteoporosis_male_model.pkl')
print('✓ Male model saved: models/osteoporosis_male_model.pkl')

# Save female model
joblib.dump(female_model, 'models/osteoporosis_female_model.pkl')
print('✓ Female model saved: models/osteoporosis_female_model.pkl')

# Bundle both models with their held-out test splits and the feature list in a
# single file for later analysis (the original note mentions SHAP analysis)
joblib.dump({
    'male_model': male_model,
    'female_model': female_model,
    'X_test_male': X_test_male,
    'X_test_female': X_test_female,
    'y_test_male': y_test_male,
    'y_test_female': y_test_female,
    'feature_cols': feature_cols
}, 'models/training_data.pkl')

print('✓ Training data saved for analysis')
print('\n✓ All models saved successfully!')

### Step 9: Display Model Information

In [None]:
rule = '═' * 60
print(rule)
print('MODEL TRAINING SUMMARY')
print(rule)

print('\nModels Successfully Trained!')

# One (heading, training features, test features) entry per model, reported in order
summaries = (
    ('\nMale Model:', X_train_male, X_test_male),
    ('\nFemale Model:', X_train_female, X_test_female),
)
for heading, train_features, test_features in summaries:
    print(heading)
    print(f'  - Training samples: {len(train_features)}')
    print(f'  - Testing samples: {len(test_features)}')
    print(f'  - Features: {train_features.shape[1]}')

print('\nReady to proceed to Model Evaluation!')

### Next Notebook

Proceed to **05_Model_Evaluation.ipynb** to evaluate model performance and generate metrics.