# 🔬 Model Comparison & Submission – House Prices

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb

# Read the three CSV inputs in one pass: the cleaned training set, the test
# set, and the sample submission file.
# NOTE(review): train is read from train_clean.csv while test is read from
# test.csv — confirm that whatever produced train_clean.csv is also applied
# to the test rows downstream.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train_clean.csv',
        '../data/test.csv',
        '../data/sample_submission.csv',
    )
)

In [None]:
# Custom preprocessing
# preprocess() is a project-local helper whose internals are not visible in
# this notebook; the notes below describe only how it is called here.
from src.preprocess import preprocess

# Preprocess training data
# With is_train=True the call is unpacked into two values: the processed
# frame and `label_encoders` (presumably fitted on the training data —
# verify in src/preprocess.py).
train, label_encoders = preprocess(train, is_train=True)

# Preprocess test data using train encoders
# With is_train=False the encoders from the training call are passed in and
# a single value is bound back to `test`.
# NOTE(review): `train` and `test` are rebound in place, so re-running this
# cell feeds already-processed frames back into preprocess(); reload the
# CSVs first when re-executing.
test = preprocess(test, is_train=False, label_encoders=label_encoders)

In [None]:
# Prepare train and test datasets: the features are every training column
# except the target, and the target is SalePrice.
X = train.drop(['SalePrice'], axis=1)
y = train['SalePrice']

# Align the test features to the training feature columns, in the same order.
# Selecting by X.columns already leaves SalePrice out (it is not a feature),
# so no special case is needed. The previous `else` branch only dropped
# SalePrice and skipped the alignment, which could leave X_test with extra or
# reordered columns relative to X. A feature column missing from `test` now
# raises KeyError here instead of surfacing later inside predict().
X_test = test[X.columns]

## 📊 Cross-Validation Comparison

In [None]:
# Shuffled 5-fold splitter with a fixed seed, shared by every model evaluation
kf = KFold(n_splits=5, shuffle=True, random_state=42)


def rmse_cv(model):
    """Return the mean cross-validated RMSE of ``model``.

    The model is scored with ``cross_val_score`` on the notebook-level ``X``
    and ``y`` using the ``kf`` splitter. The scorer reports negated MSE per
    fold, so each fold score is negated and square-rooted before averaging.
    """
    neg_mse_per_fold = cross_val_score(
        model, X, y, scoring="neg_mean_squared_error", cv=kf
    )
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean()

# Candidate regressors, keyed by the label used in the printout below.
# Insertion order determines the order in which they are evaluated.
models = {}
models["Ridge"] = Ridge(alpha=1.0)
models["RandomForest"] = RandomForestRegressor(n_estimators=100, random_state=42)
models["LightGBM"] = lgb.LGBMRegressor(random_state=42)
models["XGBoost"] = xgb.XGBRegressor(random_state=42)

# Score each candidate with rmse_cv, store the result under its label, and
# report it as soon as it is available.
results = {}
for name, model in models.items():
    results[name] = score = rmse_cv(model)
    print(f"{name}: RMSE = {score:.2f}")

## 🏁 Train Final Model & Predict

## 📈 Model Performance Comparison

In [None]:
# Bar chart of the cross-validated RMSE stored in `results`, one bar per model
import matplotlib.pyplot as plt

model_names = list(results)
rmse_scores = [results[label] for label in model_names]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(model_names, rmse_scores, color='skyblue')
ax.set_title('Model Comparison (Lower RMSE is Better)')
ax.set_ylabel('RMSE')
ax.grid(axis='y')

# Write each score above its bar, centred horizontally; the 500 offset is
# expressed in y-axis (RMSE) units.
for bar in bars:
    yval = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2.0
    ax.text(x_center, yval + 500, f'{yval:.0f}', ha='center', va='bottom')

fig.tight_layout()
plt.show()

In [None]:
# Fit the final model on the full training set and predict the test set.
# NOTE(review): the choice is hard-coded to LightGBM rather than taken from
# the cross-validation `results`; change this line if another model scores
# better.
final_model = lgb.LGBMRegressor(random_state=42).fit(X, y)
preds = final_model.predict(X_test)

## 💾 Save Submission

In [None]:
# Build the submission from the sample-submission frame with SalePrice set to
# the predictions. assign() returns a new frame, so sample_submission itself
# is left unmodified.
# NOTE(review): assumes the rows of sample_submission are in the same order
# as the rows of X_test — TODO confirm.
submission = sample_submission.assign(SalePrice=preds)
submission.to_csv('../data/submission.csv', index=False)
print('✅ Submission saved to ../data/submission.csv')