In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("zafarali27/house-price-prediction-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/house-price-prediction-dataset


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel



def load_data(path):
    df = pd.read_csv(path + "/House Price Prediction Dataset.csv")
    return df

def preprocess_data(df):
    df.drop(columns=["Id"], inplace=True)

    locations = {"Downtown":0, "Urban":1, "Suburban":2, "Rural":3}
    conditions = {"Excellent":0, "Fair":1, "Good":2, "Poor":3}
    garage = {"No":0, "Yes":1}

    df["Location"] = df["Location"].map(locations)
    df["Condition"] = df["Condition"].map(conditions)
    df["Garage"] = df["Garage"].map(garage)

    return df

def feature_engineering(df):
    df['TotalArea'] = df['Area'] + df['Bedrooms']*200 + df['Bathrooms']*150
    df['RoomRatio'] = df['Bedrooms'] / df['Bathrooms']
    df['PricePerSqFt'] = df['Price'] / df['Area']

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(df.median(), inplace=True)

    return df

def prepare_data(df):
    X = df.drop(columns=["Price"])
    y = df["Price"]

    y = np.log1p(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

    return X_train, X_test, y_train, y_test

def build_models(X_train, X_test, y_train, y_test):
    models = {
        "Linear Regression": {
            "model": Pipeline([
                ('scaler', StandardScaler()),
                ('regressor', LinearRegression())
            ]),
            "params": {
                'regressor__fit_intercept': [True, False],
                'regressor__positive': [True, False]
            }
        },
        "Ridge Regression": {
            "model": Pipeline([
                ('scaler', StandardScaler()),
                ('regressor', Ridge())
            ]),
            "params": {
                'regressor__alpha': [0.1, 1.0, 10.0],
                'regressor__solver': ['auto', 'svd', 'cholesky']
            }
        },
        "Decision Tree": {
            "model": DecisionTreeRegressor(random_state=42),
            "params": {
                'max_depth': [None, 10, 20, 30],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': [1.0, 'sqrt', 'log2']
            }
        },
        "Random Forest": {
            "model": RandomForestRegressor(random_state=42),
            "params": {
                'n_estimators': [100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2],
                'max_features': [1.0, 'sqrt']
        }
            }}
    best_models = {}

    for name, config in models.items():

        grid = GridSearchCV(
            estimator=config["model"],
            param_grid=config["params"],
            cv=5,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=0
        )#In such case RandomizedSearchCV can be used since it will be faster and just as effective for large hyperparameter spaces

        grid.fit(X_train, y_train)

        best_model = grid.best_estimator_
        best_models[name] = best_model

        y_pred = best_model.predict(X_test)

        y_test_exp = np.expm1(y_test)
        y_pred_exp = np.expm1(y_pred)

        mse = mean_squared_error(y_test_exp, y_pred_exp)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test_exp, y_pred_exp)

        cv_scores = cross_val_score(best_model, X_train, y_train,
                                   cv=5, scoring='neg_mean_squared_error')
        cv_rmse = np.mean(np.sqrt(-cv_scores))

        print(f"\nModel: {name}")
        print(f"\nBest parameters: {grid.best_params_}")
        print(f"Test RMSE: {rmse:.2f}")
        print(f"Test R2: {r2:.4f}")
        print(f"Cross-Validation RMSE: {cv_rmse:.2f}")

def main():
    df = load_data(path)

    df = preprocess_data(df)

    df = feature_engineering(df)

    X_train, X_test, y_train, y_test = prepare_data(df)

    best_models = build_models(X_train, X_test, y_train, y_test)



if __name__ == "__main__":
    main()


Model: Linear Regression

Best parameters: {'regressor__fit_intercept': True, 'regressor__positive': True}
Test RMSE: 238581.31
Test R2: 0.2684
Cross-Validation RMSE: 0.55

Model: Ridge Regression

Best parameters: {'regressor__alpha': 10.0, 'regressor__solver': 'auto'}
Test RMSE: 237543.88
Test R2: 0.2747
Cross-Validation RMSE: 0.56

Model: Decision Tree

Best parameters: {'max_depth': None, 'max_features': 1.0, 'min_samples_leaf': 1, 'min_samples_split': 2}
Test RMSE: 34127.46
Test R2: 0.9850
Cross-Validation RMSE: 0.09

Model: Random Forest

Best parameters: {'max_depth': None, 'max_features': 1.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Test RMSE: 20009.00
Test R2: 0.9949
Cross-Validation RMSE: 0.06
