# Model Definition and Evaluation

## Table of Contents

1. [Model Selection](#model-selection)
2. [Feature Engineering](#feature-engineering)
3. [Hyperparameter Tuning](#hyperparameter-tuning)
4. [Implementation](#implementation)
5. [Evaluation Metrics](#evaluation-metrics)
6. [Comparative Analysis](#comparative-analysis)


In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc
from sklearn.dummy import DummyClassifier

import lightgbm as lgb
import catboost as cb
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import warnings
import joblib
import shap # Import SHAP library
import matplotlib.pyplot as plt # For plotting SHAP results
import seaborn as sns # For nicer plots

## Model Selection

[Discuss the type(s) of models you consider for this task, and justify the selection.]


## Feature Engineering

[Describe any additional feature engineering you've performed beyond what was done for the baseline model.]


In [12]:
df = pd.read_csv('../1_DatasetCharacteristics/share_w6_retirees_cleaned.csv')
df.head()


Unnamed: 0,country,language,ac011__1,ac012__1,ac014__1,ac015__1,ac016__1,ac017__1,ac018__1,ac019__1,...,ep064d1_7_w9,ep064d2_7_w9,ep064d3_7_w9,ep064d4_7_w9,ep064d5_7_w9,ep064d6_7_w9,ep064d7_7_w9,ep064d8_7_w9,ep064d9_7_w9,ep064d10_7_w9
0,11,11,1.0,8.0,3.0,3.0,4.0,1.0,2.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11,11,1.0,9.0,2.0,3.0,4.0,2.0,2.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11,11,1.0,9.0,4.0,4.0,4.0,1.0,3.0,3.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11,11,1.0,10.0,4.0,4.0,4.0,1.0,3.0,3.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11,11,1.0,10.0,4.0,3.0,4.0,1.0,4.0,4.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
X = df.drop('ep036__7', axis=1)
y = df['ep036__7']

In [23]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) # Train-Test Split ---
print(f"\nTrain set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")



Train set shape: (1105, 122)
Test set shape: (277, 122)


In [21]:
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

numerical_cols = X.select_dtypes(include=np.number).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols)
    ],
    remainder='passthrough'
)

In [29]:

models_and_params = {
    'XGBoost': {
        'model': xgb.XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False,
                                   objective='binary:logistic'),
        'params': {
            'classifier__n_estimators': [100, 300, 500],
            'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
            'classifier__max_depth': [3, 5, 7, 9],
            'classifier__subsample': [0.6, 0.8, 1.0],
            'classifier__colsample_bytree': [0.6, 0.8, 1.0],
            'classifier__gamma': [0, 0.1, 0.2],
            'classifier__reg_lambda': [1, 1.5, 2],
            'classifier__reg_alpha': [0, 0.1, 0.2]
        }
    },
    'LightGBM': {
        'model': lgb.LGBMClassifier(random_state=42, objective='binary'),
        'params': {
            'classifier__n_estimators': [100, 300, 500],
            'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
            'classifier__num_leaves': [20, 31, 40],
            'classifier__max_depth': [-1, 5, 10],
            'classifier__min_child_samples': [20, 30, 40],
            'classifier__subsample': [0.7, 0.8, 1.0],
            'classifier__colsample_bytree': [0.7, 0.8, 1.0],
            'classifier__reg_alpha': [0, 0.1, 0.5],
            'classifier__reg_lambda': [0, 0.1, 0.5]
        }
    },
    'CatBoost': {
        'model': cb.CatBoostClassifier(random_state=42, verbose=0, eval_metric='Logloss',  # CatBoost can handle categorical features directly, but here they are OHE
                                      
                                       
                                       ),
        'params': {
            'classifier__iterations': [100, 300, 500],
            'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
            'classifier__depth': [4, 6, 8],
            'classifier__l2_leaf_reg': [1, 3, 5],
            'classifier__border_count': [32, 128], # Reduced for faster run
            'classifier__loss_function': ['Logloss'],
        } 
    },
    'MLP Classifier': {
        'model': MLPClassifier(random_state=42, max_iter=2000),
        'params': {
            'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50)], # Reduced complexity for faster runs
            'classifier__activation': ['relu', 'tanh'],
            'classifier__alpha': [0.0001, 0.001, 0.01],
            'classifier__learning_rate_init': [0.001, 0.01],
            'classifier__solver': ['adam'] # Reduced for faster runs
        }
    },
    'Baseline (Stratified Dummy)': {
        'model': DummyClassifier(strategy='stratified', random_state=42),
        'params': {}
    },
    'Logistic Regression': {
        'model': LogisticRegression(random_state=42, solver='liblinear', max_iter=1000),
        'params': {
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            'classifier__penalty': ['l1', 'l2']
        }
    },
    'Support Vector Machine': {
        'model': SVC(random_state=42, probability=True, cache_size=2000),
        'params': {
            'classifier__C': [0.1, 1, 10, 100],
            'classifier__kernel': ['linear', 'rbf'], # Removed 'poly' as it greatly increases computation time for small gain
            'classifier__gamma': ['scale', 'auto']
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'classifier__n_estimators': [100, 250, 500],
            'classifier__max_depth': [5, 10, 15, None],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4],
            'classifier__max_features': ['sqrt', 'log2', 0.8]
        }
    }
}

In [27]:
cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42) # Reduced splits for faster execution

# --- 7. Run Experiments ---
results = {}
test_roc_aucs = {}


In [None]:

print("\n--- Starting Model Experiments ---")

for model_name, config in models_and_params.items():
    print(f"\nTraining and tuning: {model_name}")

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', config['model'])
    ])

    grid_search = GridSearchCV(
        pipeline,
        param_grid=config['params'],
        cv=cv_strategy,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=0 # Set to 0 for less verbose output during grid search
    )
    grid_search.fit(X_train, y_train)

    y_pred = grid_search.best_estimator_.predict(X_test)
    y_proba = grid_search.best_estimator_.predict_proba(X_test)[:, 1]

    test_roc_auc = roc_auc_score(y_test, y_proba)
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    test_pr_auc = auc(recall, precision)

    results[model_name] = {
        'best_estimator': grid_search.best_estimator_,
        'best_params': grid_search.best_params_,
        'best_cv_score': grid_search.best_score_,
        'test_roc_auc': test_roc_auc,
        'test_pr_auc': test_pr_auc,
        'classification_report': classification_report(y_test, y_pred, target_names=['Not Retire', 'Retire'], output_dict=True)
    }
    test_roc_aucs[model_name] = test_roc_auc

    print(f"Finished {model_name}:")
    print(f"  Best CV ROC AUC: {results[model_name]['best_cv_score']:.4f}")
    print(f"  Best Parameters: {results[model_name]['best_params']}")
    print(f"\n--- Test Set Evaluation for {model_name} (Best Estimator) ---")
    print(f"  Test ROC AUC: {test_roc_auc:.4f}")
    print(f"  Test PR AUC: {test_pr_auc:.4f}")
    print("\nClassification Report (Test Set):")
    print(classification_report(y_test, y_pred, target_names=['Not Retire', 'Retire']))
    print("-" * 50)


--- Starting Model Experiments ---

Training and tuning: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


## Hyperparameter Tuning

[Discuss any hyperparameter tuning methods you've applied, such as Grid Search or Random Search, and the rationale behind them.]


In [16]:
# Implement hyperparameter tuning
# Example using GridSearchCV with a DecisionTreeClassifier
# param_grid = {'max_depth': [2, 4, 6, 8]}
# grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
# grid_search.fit(X_train, y_train)


## Implementation

[Implement the final model(s) you've selected based on the above steps.]


In [17]:
# Implement the final model(s)
# Example: model = YourChosenModel(best_hyperparameters)
# model.fit(X_train, y_train)


## Evaluation Metrics

[Clearly specify which metrics you'll use to evaluate the model performance, and why you've chosen these metrics.]


In [18]:
# Evaluate the model using your chosen metrics
# Example for classification
# y_pred = model.predict(X_test)
# print(classification_report(y_test, y_pred))

# Example for regression
# mse = mean_squared_error(y_test, y_pred)

# Your evaluation code here


## Comparative Analysis

[Compare the performance of your model(s) against the baseline model. Discuss any improvements or setbacks and the reasons behind them.]


In [19]:
# Comparative Analysis code (if applicable)
# Example: comparing accuracy of the baseline model and the new model
# print(f"Baseline Model Accuracy: {baseline_accuracy}, New Model Accuracy: {new_model_accuracy}")
