In [None]:
#### Preamble ####
# Purpose: Train, evaluate, and save an XGBoost regression model to predict total income (`INCTOT`)
#          based on various features in the dataset. The notebook includes steps for feature 
#          engineering and model evaluation.
# Author: Jiazhou (Justin) Bi and Weiyang Li
# Date: 5 October 2024
# Contact: justin.bi@mail.utoronto.ca or weiyang.li@mail.utoronto.ca
# License: MIT
#
# Pre-requisites:
#  - Python 3.10.5 or above
#  - Required Python libraries:
#      - pandas
#      - sklearn
#      - xgboost
#      - matplotlib
#      - seaborn
#      - joblib
#  - A cleaned dataset (`cleaned_data.parquet`) must be available in the 
#    `../data/02-analysis_data/` directory.
#
# Additional Information:
#  - The notebook performs the following steps:
#      1. **Load the Cleaned Dataset**:
#          - Data is loaded from `../data/02-analysis_data/cleaned_data.parquet`.
#      2. **Feature Engineering**:
#          - Interaction term created between `EDUC_new` (education level) and `SEX` (gender).
#      3. **Data Preprocessing**:
#          - Categorical features are encoded using `OrdinalEncoder`.
#          - Numerical features are passed through without transformations.
#      4. **Model Training and Evaluation**:
#          - The XGBoost model is trained using a pipeline for preprocessing and regression.
#          - Metrics such as Mean Squared Error (MSE), Root Mean Squared Error (RMSE), and R-squared
#            are used for evaluation.
#      5. **Model Saving and Loading**:
#          - The trained pipeline is saved using `joblib` for future reuse.
#          - Instructions for loading the saved model and making predictions are included.
#
# Limitations:
#  - High RMSE values may indicate challenges in modeling due to data noise or unobserved variables.
#  - Assumes all necessary preprocessing steps have been applied in the cleaned dataset.
#
# Usage:
#  1. Run the notebook to train the XGBoost regression model.
#  2. Save the model using the provided code for future use.

In [2]:
# Load necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import joblib

# Load the cleaned dataset
df = pd.read_parquet('../data/02-analysis_data/cleaned_data.parquet')
df.head()

# Feature engineering: Interaction term
df['EDUC_SEX_INTERACTION'] = df['EDUC_new'] * df['SEX']

# Define features and target
X = df.drop(columns=['INCTOT'])
y = df['INCTOT']

# Define categorical and numerical features
categorical_features = [
    'MORTGAGE',
    'SEX',
    'MARST',
    'EDUC_new',
    'SCHLTYPE',
    'IND1990',
    'VETSTAT'
]
numerical_features = ['AGE']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=210)

# Preprocessor for handling categorical and numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OrdinalEncoder(), categorical_features),
        ('num', 'passthrough', numerical_features)
    ])

# Create the XGBoost pipeline
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(
        learning_rate=0.1, 
        max_depth=6, 
        min_child_weight=5, 
        n_estimators=300, 
        random_state=210
    ))
])

# Train the final model
xgb_pipeline.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = xgb_pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')

# Save the trained model
joblib.dump(xgb_pipeline, '../models/final_xgb_model.pkl')
print("Model saved successfully!")

MSE: 5854629848.133301
RMSE: 76515.55298194806
Model saved successfully!


In [4]:
# Load the saved model
loaded_model = joblib.load('../models/final_xgb_model.pkl')

# Make predictions using the loaded model
y_pred_loaded = loaded_model.predict(X_test)

# Confirm the predictions match
print(f"Predictions using the loaded model: {y_pred_loaded[:5]}")


Predictions using the loaded model: [  8335.957  66796.97  101665.22   31009.086  73786.88 ]
