In [456]:
import pandas as pd
import numpy as np
from datakit import *
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFE

# Raise the pandas display limits so frames with up to 500 columns / 500 rows render in full.
for _option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_option, 500)

__Read data__

In [525]:
# Relative location of the modeling dataset; resolved against the notebook's working directory.
path = 'data_processed/modeling_data.csv'
df = pd.read_csv(filepath_or_buffer=path)

In [527]:
# Coordinates are removed from the modeling frame.
columns_to_drop = ['latitude', 'longitude']

# Work on a copy so the raw `df` stays untouched.
modeling_df = df.copy().drop(columns=columns_to_drop)

# Helper supplied by the `datakit` star-import. Its return value is ignored,
# so it presumably fills missing values in place — TODO confirm.
input_missing_values(modeling_df)

# One-hot encode the categorical attributes using short column prefixes
# (heating_*, bt_*, ac_*).
df_encoded = pd.get_dummies(
    modeling_df,
    columns=['heating', 'building_type', 'apartment_class'],
    prefix=['heating', 'bt', 'ac'],
    prefix_sep='_',
    dtype=int,
)

# District indicators are named after the bare district value (no prefix).
# NOTE(review): these dummies are built from the raw `df`, not from the frame
# that went through `input_missing_values` — confirm that is intended.
district_dummies = pd.get_dummies(df['district'], prefix='', prefix_sep='', dtype=int)
modeling_df = pd.concat(
    [df_encoded.drop(columns='district'), district_dummies],
    axis='columns',
)

# The target becomes the total cost: base rent plus additional fees.
modeling_df['rent'] = modeling_df['rent'].add(modeling_df['additional_fees'])

# Time-based split: listings added up to 2025-01-25 are used for training,
# listings added 2025-01-26 .. 2025-02-04 (both ends inclusive) are held out.
# NOTE(review): the CSV is read without date parsing, so this is presumably a
# string comparison — confirm `added_dt` holds plain YYYY-MM-DD values.
added_dates = modeling_df['added_dt']
train_mask = added_dates.le('2025-01-25')
oot_mask = added_dates.between('2025-01-26', '2025-02-04')

# `added_dt` was only needed for the split, and `additional_fees` is already
# folded into `rent`, so neither is kept as a feature.
split_only_cols = ['added_dt', 'additional_fees']
train_df = modeling_df[train_mask].copy().drop(columns=split_only_cols)
out_of_time_sample = modeling_df[oot_mask].copy().drop(columns=split_only_cols)

In [529]:
# Target column; every other column of the training frame is a feature.
y_col = 'rent'
x_cols = [name for name in train_df.columns if name != y_col]

# Feature matrix / target vector for the training period ...
X_train, y_train = train_df.drop(columns=[y_col]), train_df[y_col]
# ... and for the held-out out-of-time period.
X_out_of_time, y_out_of_time = (
    out_of_time_sample.drop(columns=[y_col]),
    out_of_time_sample[y_col],
)

In [531]:
# Fixed hyperparameters for the XGBoost regressor used throughout the notebook.
params = dict(
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1,
    n_estimators=150,
)

model = XGBRegressor(**params)

# Shuffled 5-fold split with a fixed seed so the fold assignment is repeatable.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    model,
    X_train,
    y_train,
    cv=cv,
    scoring=["neg_mean_absolute_error", "r2", "neg_mean_absolute_percentage_error"],
    return_train_score=True,
)

# Report per-fold train/test scores and the mean test score for each metric.
# The "neg_*" scorers return negated errors, so MAE and MAPE print as negative numbers.
_report = [
    ("MAE", "neg_mean_absolute_error"),
    ("R2", "r2"),
    ("MAPE", "neg_mean_absolute_percentage_error"),
]
for _position, (_label, _scorer) in enumerate(_report):
    if _position > 0:
        print("-------------------")
    _train_scores = cv_results[f"train_{_scorer}"]
    _test_scores = cv_results[f"test_{_scorer}"]
    print(f"{_label} Train:", _train_scores)
    print(f"{_label} Test:", _test_scores)
    print(f"Mean {_label} Test:", _test_scores.mean())

MAE Train: [-512.66680418 -511.74585426 -524.3884941  -515.10513399 -514.00352169]
MAE Test: [-597.57408586 -601.48777099 -557.98270693 -590.63736765 -621.90763098]
Mean MAE Test: -593.917912482687
-------------------
R2 Train: [0.93985089 0.94092595 0.94101321 0.94014286 0.93441413]
R2 Test: [0.90845257 0.87831325 0.88615913 0.90301722 0.91610054]
Mean R2 Test: 0.8984085405327547
-------------------
MAPE Train: [-0.09903068 -0.10006266 -0.10174366 -0.09911698 -0.10006158]
MAPE Test: [-0.11329038 -0.10491154 -0.10683667 -0.11369132 -0.10999497]
Mean MAPE Test: -0.10974497449604972


In [533]:
# Fit on the full training set and score the model on that SAME data.
# These are in-sample metrics: they show how closely the model fits the data it
# was trained on and serve as a reference point for held-out results. They are
# NOT out-of-time scores — nothing in this cell touches the out-of-time sample.
model = XGBRegressor(**params)
model.fit(X_train, y_train)

# Predict on the training rows the model has just been fitted on
y_pred_train = model.predict(X_train)

# Calculate in-sample scoring metrics
mae_train = mean_absolute_error(y_train, y_pred_train)
r2_train = r2_score(y_train, y_pred_train)
mape_train = mean_absolute_percentage_error(y_train, y_pred_train)

# Print out the results under a label that matches what was actually evaluated
print("In-Sample (Training) Scores:")
print("MAE:", mae_train)
print("R2:", r2_train)
print("MAPE:", mape_train)

Out-of-Time Sample Scores:
MAE: 524.5507010420273
R2: 0.9360532363916927
MAPE: 0.10104372890298687


In [535]:
# Refit on the complete training period, then evaluate on the held-out
# out-of-time sample (listings added after the training cut-off).
model = XGBRegressor(**params)
model.fit(X_train, y_train)

y_pred_oot = model.predict(X_out_of_time)

# Same three metrics as in cross-validation, computed against the OOT targets.
mae_oot, r2_oot, mape_oot = (
    _metric(y_out_of_time, y_pred_oot)
    for _metric in (mean_absolute_error, r2_score, mean_absolute_percentage_error)
)

print("Out-of-Time Sample Scores:")
for _label, _value in (("MAE", mae_oot), ("R2", r2_oot), ("MAPE", mape_oot)):
    print(f"{_label}:", _value)

Out-of-Time Sample Scores:
MAE: 549.664068760016
R2: 0.8934478505480276
MAPE: 0.11512829184965051
