In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib
import os

# Ensure the output directory for the serialized models exists.
os.makedirs("../models", exist_ok=True)

# 1. Load Data
# Adjust the path if your feature-engineered CSV lives elsewhere.
# NOTE(review): the models below are fit on a file named "*_holdout.csv" --
# confirm this is the intended training set and not the evaluation split.
df = pd.read_csv("../data/processed/feature_engineered_holdout.csv")

# Rows without a price cannot serve as training targets: zero-filling them
# would teach the regressor a price of 0 and place them in the lowest tier.
# Drop those rows first, then zero-fill the remaining gaps in the features.
df = df.dropna(subset=["price"]).fillna(0)

print(f"Data Loaded: {df.shape}")

# 2. Preparation
# Every column is used as a feature except the target ("price") and the
# date / city-name / id columns listed here. Note that the exclusion is by
# name only; no dtype check is performed.
excluded_cols = {"price", "date", "city_full", "id"}
feature_cols = [col for col in df.columns if col not in excluded_cols]

y_reg = df["price"]
X = df[feature_cols]

# 3. Train Regression (Exact Price)
# Fit a random forest on the feature matrix to predict the numeric price,
# then persist the fitted estimator with joblib.
print("Training Regression Model...")
reg_model = RandomForestRegressor(
    n_estimators=50,
    max_depth=10,
    random_state=42,
).fit(X, y_reg)  # fit() returns the estimator itself
joblib.dump(reg_model, "../models/regression.joblib")
print("✅ Regression Model Saved")

# 4. Train Classification (Affordability Tier)
print("Training Classification Model...")
# Tier boundaries are the 33rd and 66th percentiles of price:
# below q33 -> low tier, q33 up to q66 -> middle tier, q66 and above -> high tier.
price_column = df['price']
q33, q66 = (price_column.quantile(fraction) for fraction in (0.33, 0.66))

def get_tier(price, low=None, high=None):
    """Map a price to its affordability tier label.

    Args:
        price: The price to classify.
        low: Upper bound (exclusive) of the "Budget" tier. Defaults to the
            module-level ``q33`` threshold when omitted.
        high: Upper bound (exclusive) of the "Standard" tier. Defaults to the
            module-level ``q66`` threshold when omitted.

    Returns:
        "Budget" if ``price < low``, "Standard" if ``low <= price < high``,
        otherwise "Premium". A NaN price fails both comparisons and therefore
        also yields "Premium".
    """
    # Resolve the defaults at call time so the thresholds computed by the
    # surrounding script are used when the caller passes only the price.
    if low is None:
        low = q33
    if high is None:
        high = q66

    if price < low:
        return "Budget"
    if price < high:
        return "Standard"
    return "Premium"

# Derive the tier label for every row from its price.
price_values = df['price']
y_class = price_values.apply(get_tier)

# Fit a random forest on the same feature matrix to predict the tier label,
# then persist the fitted estimator with joblib.
class_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    random_state=42,
).fit(X, y_class)  # fit() returns the estimator itself
joblib.dump(class_model, "../models/classification.joblib")
print("✅ Classification Model Saved")

Data Loaded: (149424, 49)
Training Regression Model...
✅ Regression Model Saved
Training Classification Model...
✅ Classification Model Saved
