In [21]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score

from xgboost import XGBRegressor

In [22]:
# Load the train and test splits from their parquet files (paths are relative
# to the notebook's working directory).
train_df, test_df = map(
    pd.read_parquet,
    ('final_data_train.parquet', 'final_data_test.parquet'),
)

In [23]:
# Report the number of rows in each split.
n_train_rows, n_test_rows = len(train_df), len(test_df)
print(f"Training data size: {n_train_rows}")
print(f"Testing data size: {n_test_rows}")

Training data size: 209396
Testing data size: 52350


In [24]:
# Split each frame into a feature matrix (every column except the target)
# and the target vector.
TARGET_COL = 'Log_TotalExpense'


def _split_features_target(frame):
    """Return (features, target) for `frame`, leaving `frame` itself unmodified."""
    return frame.drop(columns=[TARGET_COL]), frame[TARGET_COL]


X_train, y_train = _split_features_target(train_df)
X_test, y_test = _split_features_target(test_df)

In [25]:
# Ensure all features are numeric.
# The integer codes are derived from the TRAINING column's categories and then
# reused for the test column, so a given category maps to the same code in both
# frames. Encoding each frame independently assigns codes from that frame's own
# set of unique values, which can disagree between the two splits.
non_numeric_cols = X_train.select_dtypes(exclude=[np.number]).columns
for col in non_numeric_cols:
    train_categorical = X_train[col].astype('category')
    train_categories = train_categorical.cat.categories
    X_train[col] = train_categorical.cat.codes
    # Test values that are missing or not among the training categories get code -1.
    X_test[col] = pd.Categorical(X_test[col], categories=train_categories).codes

# Fit an XGBoost regressor on the training split.
xgb_params = {
    "n_estimators": 500,
    "max_depth": 15,
    "learning_rate": 0.001,
    "colsample_bytree": 0.8,
    "random_state": 42,  # fixed seed passed to the estimator
}
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Predict on the held-out split.
y_pred = model.predict(X_test)

# Score the predictions against the test target with R².
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.4f}")

R² Score: 0.3979


In [19]:
# Mean absolute percentage error, computed after applying expm1 to both the
# actual and predicted values.
# NOTE(review): presumably the target was log1p-transformed and expm1 maps it
# back to the original expense scale — confirm against the preprocessing step.
actual_values = np.expm1(y_test)
predicted_values = np.expm1(y_pred)
abs_pct_errors = np.abs((actual_values - predicted_values) / actual_values)
mape = np.mean(abs_pct_errors) * 100
print(f"Mean Absolute Percentage Error (MAPE): {mape:.4f}%")

Mean Absolute Percentage Error (MAPE): 25.7585%
