# In [14]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.model_selection import cross_val_score


# Step 2: Load the feature tables and the training target from CSV
X = pd.read_csv("X_train_cleaned.csv")
X_test = pd.read_csv("X_test_cleaned.csv")
# Flatten the target file's values into a 1-D array.
# NOTE(review): assumes y_train.csv holds a single column — confirm.
y = np.ravel(pd.read_csv("y_train.csv").values)

# Step 3: Define CV RMSE function
def rmse_cv(model):
    """Return the mean 5-fold cross-validated RMSE of ``model``.

    The estimator is scored on the module-level ``X`` and ``y`` with
    ``cross_val_score``; the per-fold errors are averaged into one number.
    """
    neg_fold_scores = cross_val_score(
        model, X, y, scoring="neg_root_mean_squared_error", cv=5
    )
    # The scorer yields negated RMSE values; flip the sign to get plain RMSE.
    fold_rmse = -neg_fold_scores
    return fold_rmse.mean()

# Step 4: Configure the XGBoost regressor (it is only fitted in Step 6)
_xgb_hyperparams = {
    "n_estimators": 500,       # number of boosting rounds
    "learning_rate": 0.05,     # step-size shrinkage per round
    "max_depth": 3,            # maximum depth of each tree
    "subsample": 0.8,          # row subsampling ratio
    "colsample_bytree": 0.8,   # column subsampling ratio per tree
    "random_state": 42,        # fixed seed
}
xgb_model = xgb.XGBRegressor(**_xgb_hyperparams)

# Step 5: Report the cross-validated RMSE for this configuration
score = rmse_cv(xgb_model)
print(f"XGBoost CV RMSE: {score:.4f}")

# Step 6: Fit on the full training data, persist the model, and predict
xgb_model.fit(X, y)
joblib.dump(xgb_model, "xgboost_model.pkl")

xgb_preds_log = xgb_model.predict(X_test)
# expm1 is the inverse of log1p.
# NOTE(review): presumably y was stored as log1p(SalePrice) — confirm upstream.
final_preds = np.expm1(xgb_preds_log)

# Step 7: Build the submission table and write it to CSV
submission = pd.DataFrame({"SalePrice": final_preds})
# Ids are generated, not read from the data: consecutive integers from 1461.
# NOTE(review): assumes the test rows are in Id order starting at 1461 — confirm.
submission.insert(0, "Id", np.arange(1461, 1461 + len(final_preds)))
submission.to_csv("xgb_submission.csv", index=False)
print("Saved as xgb_submission.csv")

# Output:
# XGBoost CV RMSE: 0.1208
# Saved as xgb_submission.csv
