In [1]:
import pandas as pd
import numpy as np
import optuna
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Read the processed dataset. low_memory=False makes pandas parse each column
# in a single pass rather than in chunks, which avoids mixed-type inference.
df = pd.read_csv("data/processed/processed_v1.csv", low_memory=False)

# Normalise every object-dtype column so it holds strings only:
# missing values become the literal "Missing", then the whole column is cast
# to str so that no float NaN or other non-string value is left behind.
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    df[name] = df[name].fillna("Missing").astype(str)

# Separate the target column from the feature matrix.
y = df['Sales Price']
X = df.drop(columns='Sales Price')

# The object-dtype feature columns (the strings produced above) are the ones
# later passed to the model as categorical features.
cat_features = list(X.select_dtypes(include='object').columns)
print(f"Identified {len(cat_features)} categorical features.")

Identified 37 categorical features.


In [7]:
# Stage 1: hold out 10% of all rows as the final, untouched test set.
holdout_split = train_test_split(X, y, test_size=0.1, random_state=42)
X_train_val, X_test, y_train_val, y_test = holdout_split

# Stage 2: carve 15% of the remaining 90% out as the validation set
# (about 13.5% of all rows), which leaves about 76.5% of all rows for training.
validation_split = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.15,
    random_state=42,
)
X_train, X_val, y_train, y_val = validation_split

# Report the resulting partition sizes as a quick sanity check.
print("Data successfully loaded and split.")
print(f"Total shape:    {X.shape}")
print(f"Train shape:    {X_train.shape}")
print(f"Validate shape: {X_val.shape}")
print(f"Test shape:     {X_test.shape}")

Data successfully loaded and split.
Total shape:    (412698, 44)
Train shape:    (315713, 44)
Validate shape: (55715, 44)
Test shape:     (41270, 44)


In [None]:

# Hyper-parameters are collected in one mapping so they can be inspected or
# tuned in a single place before the estimator is built.
catboost_params = {
    'iterations': 4000,            # upper bound; early stopping ends training sooner
    'learning_rate': 0.05,
    'depth': 10,
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'random_seed': 42,
    'logging_level': 'Verbose',
    'thread_count': -1,
    'cat_features': cat_features,  # columns CatBoost should treat as categorical
}
model = CatBoostRegressor(**catboost_params)

# Fit on the training partition and monitor MAE on the validation partition.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=100,  # stop after 100 rounds without validation improvement
    verbose=200,                # log one progress line every 200 iterations
)

print(f"Model trained. Best iteration: {model.get_best_iteration()}")
print("Evaluating on unseen Test data...")

# Predict on the held-out test partition, which was not used for fitting or
# for early stopping.
final_log_preds = model.predict(X_test)

# Undo the log transform on both predictions and ground truth.
# NOTE(review): assumes 'Sales Price' was stored as log1p(price) by the
# preprocessing step that produced the CSV -- confirm against that step.
final_preds = np.expm1(final_log_preds)
final_y_test = np.expm1(y_test)

final_dollar_mae = mean_absolute_error(final_y_test, final_preds)
print(f"Final Test MAE (Unbiased): ${final_dollar_mae:,.2f}")

0:	learn: 0.5519436	test: 0.5493496	best: 0.5493496 (0)	total: 656ms	remaining: 43m 42s
200:	learn: 0.1746493	test: 0.1739192	best: 0.1739192 (200)	total: 2m 1s	remaining: 38m 11s
400:	learn: 0.1606432	test: 0.1627949	best: 0.1627949 (400)	total: 4m 47s	remaining: 43m 3s


In [None]:

# Pair each training column with the importance score reported by the fitted
# model, ordered from most to least important.
feature_names = X_train.columns
importances = model.get_feature_importance()
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# --- Horizontal bar chart of the 25 highest-ranked features ---
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(
    data=importance_df.head(25),  # top 25 rows after sorting
    x='Importance',
    y='Feature',
    ax=ax,
)
ax.set_title('Top 25 Feature Importances (CatBoost)')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.tight_layout()  # keep long feature labels from being clipped

# Persist the chart next to the notebook.
fig.savefig('catboost_feature_importance.png')
print("Successfully saved 'catboost_feature_importance.png'")