In [14]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Step 2: Load the preprocessed feature matrices and the training target
X = pd.read_csv("X_train_cleaned.csv")
X_test = pd.read_csv("X_test_cleaned.csv")
# The target is read as a DataFrame and flattened to a 1-D array
# (presumably y_train.csv holds a single target column -- confirm).
y = np.ravel(pd.read_csv("y_train.csv").values)

# Step 3: Define a RMSE cross-validation function
def rmse_cv(model, cv=5):
    """Return the mean cross-validated RMSE of *model*.

    The model is scored on the module-level ``X`` and ``y`` using
    scikit-learn's ``neg_root_mean_squared_error`` scorer; the sign is
    flipped so the returned value is a plain, non-negative RMSE.

    Args:
        model: Estimator to evaluate; passed to ``cross_val_score``.
        cv: Cross-validation setting forwarded unchanged to
            ``cross_val_score``. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        The RMSE averaged over all folds.
    """
    fold_scores = cross_val_score(
        model, X, y, scoring="neg_root_mean_squared_error", cv=cv
    )
    # The scorer reports negated RMSE (higher is better), so negate it back.
    return -fold_scores.mean()

# Step 4: Ridge baseline -- report its cross-validated RMSE
ridge = Ridge(alpha=10)
score = rmse_cv(ridge)
print(f"Ridge CV RMSE: {score:.4f}")

# Step 5: Refit on the full training data and predict the test set
ridge_preds = ridge.fit(X, y).predict(X_test)

# Step 6: Map predictions back to the original price scale.
# NOTE: this inverse is only correct when y_train holds log1p(SalePrice);
# skip it if the target was not log1p-transformed.
final_preds = np.expm1(ridge_preds)

# Step 7: Build and save the submission file.
# Ids are generated rather than read from disk: this relies on test.csv
# Ids starting at 1461 (per the original note) and on X_test keeping the
# row order of test.csv -- confirm both before submitting.
first_test_id = 1461
test_ids = np.arange(first_test_id, first_test_id + len(final_preds))
submission = pd.DataFrame({"Id": test_ids, "SalePrice": final_preds})
submission.to_csv("ridge_submission.csv", index=False)
print("Submission file saved as ridge_submission.csv")


Ridge CV RMSE: 0.1273
Submission file saved as ridge_submission.csv
