In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import joblib

# Load the feature-engineered dataset; every NaN is replaced with 0 before modelling.
# NOTE(review): this also zero-fills any missing lat/lon/price values, which would
# put such rows at coordinate (0, 0) / price 0 in the clustering below — confirm
# the CSV has no gaps in those columns.
df = pd.read_csv("../data/processed/feature_engineered_holdout.csv").fillna(0)

# 1. Clustering (Neighborhood Segmentation)
# We use Lat, Lon, and Price to find "Expensive Downtowns" vs "Cheap Suburbs"
cluster_cols = ['lat', 'lon', 'price']
# All three columns are indexed below, so all three must be present; checking
# only lat/lon would let a missing 'price' column surface as a KeyError.
missing_cluster_cols = [c for c in cluster_cols if c not in df.columns]
if not missing_cluster_cols:
    print("Training Clustering Model...")
    cluster_features = df[cluster_cols].copy()

    # Scale data so Price (millions) doesn't dominate Lat (decimals)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(cluster_features)

    # n_init is set explicitly because its default differs between scikit-learn
    # releases; pinning it keeps the fitted clusters comparable across
    # environments and avoids the FutureWarning some versions emit.
    kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
    kmeans.fit(X_scaled)

    # Save both model and scaler (needed for inference): new points must be
    # transformed with the same fitted scaler before calling kmeans.predict.
    joblib.dump(kmeans, "../models/clustering.joblib")
    joblib.dump(scaler, "../models/cluster_scaler.joblib")
    print("✅ Clustering Model Saved")
else:
    print(f"⚠️ Skipped Clustering: Missing columns {missing_cluster_cols}")

# 2. Recommendation (Nearest Neighbors)
print("Training Recommender...")
# Use numeric features to find "similar" houses.
# The target (price) and identifier/label columns are excluded by name. Any
# remaining non-numeric column is dropped too, because NearestNeighbors cannot
# fit on text data. Bool columns are kept: they convert cleanly to 0/1.
excluded_cols = {"price", "date", "city_full", "id"}
feature_cols = [
    c
    for c in df.select_dtypes(include=["number", "bool"]).columns
    if c not in excluded_cols
]
if not feature_cols:
    raise ValueError("No numeric feature columns available to train the recommender")
X_rec = df[feature_cols]

# NOTE(review): X_rec is not scaled here, so with a euclidean metric the
# columns with the largest magnitudes dominate the distance — confirm the
# engineered features are already on comparable scales.
knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
knn.fit(X_rec)

joblib.dump(knn, "../models/recommender.joblib")
# We MUST save the training data too, so we can look up the neighbors later:
# kneighbors() returns row positions into the fitted matrix, and this frame is
# what maps those positions back to actual houses.
joblib.dump(df, "../models/rec_data.joblib")
print("✅ Recommender Model Saved")