In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset and preprocess
df = pd.read_csv("data/processed/processed_v1.csv", low_memory=False)
for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].fillna("Missing").astype(str)             # Fill NaNs with the string "Missing" (required for CatBoost)

X = df.drop('Sales Price', axis=1)
y = df['Sales Price']

# Store categorical features
cat_features = X.select_dtypes(include=['object']).columns.tolist()
print(f"Identified {len(cat_features)} categorical features.")

Identified 37 categorical features.


In [3]:
# Split data
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.15, random_state=42 
)

print("Data successfully loaded and split.")
print(f"Total shape:    {X.shape}")
print(f"Train shape:    {X_train.shape}")
print(f"Validate shape: {X_val.shape}")
print(f"Test shape:     {X_test.shape}")

Data successfully loaded and split.
Total shape:    (412698, 44)
Train shape:    (315713, 44)
Validate shape: (55715, 44)
Test shape:     (41270, 44)


In [None]:
# Create and train the model
model = CatBoostRegressor(
    iterations=3000,          
    learning_rate=0.03,
    depth=12,
    l2_leaf_reg=1.0,
    subsample=0.8, 
    colsample_bylevel=0.8,
    loss_function='MAE',
    eval_metric='MAE',
    random_seed=42,
    logging_level='Verbose',
    thread_count=-1,
    cat_features=cat_features
)

model.fit(
    X_train, 
    y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=100, # Early stopping
    verbose=200                # Print progress every 200 iterations
)

print(f"Model trained. Best iteration: {model.get_best_iteration()}")
print("Evaluating on unseen Test data...")

final_log_preds = model.predict(X_test)
# Convert back to real currency
final_y_test = np.expm1(y_test)
final_preds = np.expm1(final_log_preds)

final_dollar_mae = mean_absolute_error(final_y_test, final_preds)
print(f"Final Test MAE (Unbiased): ${final_dollar_mae:,.2f}")

0:	learn: 0.5603666	test: 0.5577485	best: 0.5577485 (0)	total: 755ms	remaining: 37m 44s


In [None]:

# Get feature importances
importances = model.get_feature_importance()
feature_names = X_train.columns
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

# Sort by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)


plt.figure(figsize=(12, 10))
sns.barplot(
    x='Importance',
    y='Feature',
    data=importance_df.head(25) # Plot the top 25 features
)
plt.title('Top 25 Feature Importances (CatBoost)')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()

plt.savefig('catboost_feature_importance.png')
print("Successfully saved 'catboost_feature_importance.png'")

CatBoostError: Model has no meta information needed to calculate feature importances.                             Pass training dataset to this function.