In [18]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error,make_scorer


df_train= pd.read_csv("train.csv")

y= df_train["SalePrice"]
x=df_train.drop(['SalePrice','Id'],axis=1)

y_logged= np.log1p(y)
x_train,x_test,y_train_log,y_test_log = train_test_split(x,y_logged,test_size=0.2,random_state=42)


In [19]:
numeric_features=x_train.select_dtypes(include=np.number).columns
categorical_features=x_train.select_dtypes(include=['object']).columns

numeric_pipeline=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])

categorical_pipeline=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='constant',fill_value='None')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])
preprocessor=ColumnTransformer(
    transformers=[
        ('num',numeric_pipeline,numeric_features),
        ('cat',categorical_pipeline,categorical_features),
    ],
    remainder='passthrough'
)


In [26]:
def log_rmse_scorer(y_true_log, y_pred_log):
    return np.sqrt(mean_squared_error(y_true_log, y_pred_log))

linear_pipeline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('regressor',LinearRegression())])

knn_pipeline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('regressor',KNeighborsRegressor())])


lin_scores=cross_val_score(linear_pipeline,x_train,y_train_log,
                           scoring=make_scorer(log_rmse_scorer,greater_is_better=False),cv=5,n_jobs=-1)
knn_scores=cross_val_score(knn_pipeline,x_train,y_train_log,
                           scoring=make_scorer(log_rmse_scorer,greater_is_better=False),cv=5,n_jobs=-1)


print(f"Linear Regression CV RMSLE: {-lin_scores.mean():.4f} (+/- {lin_scores.std():.4f})")
print(f"K-Neighbors CV RMSLE: {-knn_scores.mean():.4f} (+/- {knn_scores.std():.4f})")


Linear Regression CV RMSLE: 0.1600 (+/- 0.0118)
K-Neighbors CV RMSLE: 0.1739 (+/- 0.0208)


In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

lasso_pipeline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('regressor',Lasso(alpha=0.0005,max_iter=1000,random_state=42))
])
ridge_pipeline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('regressor',Ridge(alpha=10.0,random_state=42))
])

rf_pipeline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('regressor',RandomForestRegressor(n_estimators=100,random_state=42))
])

In [27]:
kfold=KFold(n_splits=5,shuffle=True,random_state=42)
lasso_scores=cross_val_score(lasso_pipeline,x_train,y_train_log,
                            scoring=make_scorer(log_rmse_scorer,greater_is_better=False),cv=kfold,n_jobs=-1)
ridge_scores=cross_val_score(ridge_pipeline,x_train,y_train_log,
                             scoring=make_scorer(log_rmse_scorer,greater_is_better=False),cv=kfold,n_jobs=-1)
rf_scores=cross_val_score(rf_pipeline,x_train,y_train_log,
                            scoring=make_scorer(log_rmse_scorer,greater_is_better=False),cv=kfold,n_jobs=-1)
print(f"Lasso Regression CV RMSLE: {-lasso_scores.mean():.4f} (+/- {lasso_scores.std():.4f})")
print(f"Ridge Regression CV RMSLE: {-ridge_scores.mean():.4f} (+/- {ridge_scores.std():.4f})")
print(f"Random Forest CV RMSLE: {-rf_scores.mean():.4f} (+/- {rf_scores.std():.4f})")

Lasso Regression CV RMSLE: 0.1408 (+/- 0.0334)
Ridge Regression CV RMSLE: 0.1426 (+/- 0.0292)
Random Forest CV RMSLE: 0.1481 (+/- 0.0159)


In [28]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [10, 20, None],
    'regressor__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(estimator=rf_pipeline, param_grid=param_grid, 
                           scoring=make_scorer(log_rmse_scorer, greater_is_better=False), 
                           cv=kfold, n_jobs=-1, verbose=1)

grid_search.fit(x_train, y_train_log)


print("--- Hyperparameter Tuning Complete ---")
print(f"Best Parameters Found: {grid_search.best_params_}")

best_tuned_model = grid_search.best_estimator_
y_pred_log = best_tuned_model.predict(x_test)
final_test_rmsle = log_rmse_scorer(y_test_log, y_pred_log)
print(f"Final Tuned Model RMSLE on Held-out Test Data: {final_test_rmsle:.4f}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits
--- Hyperparameter Tuning Complete ---
Best Parameters Found: {'regressor__max_depth': 20, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 50}
Final Tuned Model RMSLE on Held-out Test Data: 0.1471
--- Hyperparameter Tuning Complete ---
Best Parameters Found: {'regressor__max_depth': 20, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 50}
Final Tuned Model RMSLE on Held-out Test Data: 0.1471
