# House Price Prediction

This notebook covers preprocessing, model training, evaluation, and inference.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
import warnings
warnings.filterwarnings("ignore")


In [None]:

# Read the dataset from "data.csv" (path is relative to the current
# working directory) into the notebook-wide frame `df`.
df = pd.read_csv(filepath_or_buffer="data.csv")

# Preview the first five rows as the cell output.
df.head(n=5)


In [None]:

# Pick the prediction target: the last column (in frame order) whose dtype
# is numeric. This is positional, so it depends on the CSV column order.
numeric_cols = (
    df.select_dtypes(include="number")
    .columns
    .tolist()
)
target_col = numeric_cols[-1]

# Display the chosen column name as the cell output.
target_col


In [None]:

# Handle missing values
#
# Every fill below is written back with an explicit assignment
# (`df[col] = df[col].fillna(...)`). The chained in-place form
# `df[col].fillna(..., inplace=True)` acts on a temporary object under
# pandas Copy-on-Write (the default from pandas 3.0) and would leave `df`
# unchanged -- and the notebook silences the warning that says so.

# Coerce the target to numeric (unparseable entries become NaN), then
# impute its gaps with the median.
# NOTE(review): imputing the *target* fabricates labels; dropping rows with
# a missing target may be preferable -- confirm intent.
df[target_col] = pd.to_numeric(df[target_col], errors="coerce")
df[target_col] = df[target_col].fillna(df[target_col].median())

# Numeric features: fill gaps with each column's own median.
for col in df.select_dtypes(include=np.number).columns:
    df[col] = df[col].fillna(df[col].median())

# Object (categorical) features: mark gaps with the literal "Missing", then
# replace every category with an integer code. The cast to str keeps
# LabelEncoder from failing on columns that mix strings with other Python
# objects once "Missing" has been inserted.
for col in df.select_dtypes(include="object").columns:
    filled = df[col].fillna("Missing").astype(str)
    df[col] = LabelEncoder().fit_transform(filled)


In [None]:

# Replace the target column with log(1 + value). The column is overwritten
# in place, so running this cell a second time applies the transform again.
# The inference cell undoes it with np.expm1.
df[target_col] = df[target_col].pipe(np.log1p)


In [None]:

# Separate the (log-transformed) target from the feature matrix.
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for evaluation; the fixed seed makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:

# Standardize features for Linear Regression. The scaler learns its
# statistics from the training split only and is then applied to both
# splits, so no test-set information enters the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)


In [None]:

# Collects one RMSE per model name; filled in by the training loop.
results = {}

# Candidate regressors keyed by display name. Insertion order is the order
# in which they are trained and reported. The tree ensembles are seeded so
# their fits are reproducible.
models = {}
models["Linear Regression"] = LinearRegression()
models["Random Forest"] = RandomForestRegressor(random_state=42)
models["Gradient Boosting"] = GradientBoostingRegressor(random_state=42)


In [None]:

# Fit every candidate and score it on the held-out split. Metrics are
# computed on the log1p-transformed target, not on raw values.
for name, model in models.items():
    # Only the linear model consumes the standardized matrices; the tree
    # ensembles are trained on the unscaled features.
    wants_scaled = name == "Linear Regression"
    train_features = X_train_scaled if wants_scaled else X_train
    test_features = X_test_scaled if wants_scaled else X_test

    preds = model.fit(train_features, y_train).predict(test_features)

    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))

    # RMSE is the figure later used to pick the best model.
    results[name] = rmse

    print(name, "RMSE:", rmse, "MAE:", mae)


In [None]:

# The winner is the model with the smallest held-out RMSE; on a tie the
# one trained first is kept.
best_model_name = min(results, key=lambda model_name: results[model_name])
best_model = models[best_model_name]

# Display the winner's name as the cell output.
best_model_name


In [None]:

# Persist the winning model. Linear Regression was trained on standardized
# features, so its scaler is stored alongside it as a (model, scaler)
# tuple; any other model is stored on its own.
artifact = (
    (best_model, scaler)
    if best_model_name == "Linear Regression"
    else best_model
)
# Trailing semicolon suppresses the returned filename list in the cell output.
joblib.dump(artifact, "house_price_model.pkl");


In [None]:

# Inference example: reload the saved artifact and predict for the first
# held-out row (kept as a one-row frame via the double brackets).
sample = X_test.iloc[[0]]
loaded = joblib.load("house_price_model.pkl")

if best_model_name != "Linear Regression":
    # Tree ensembles were saved alone and take unscaled features.
    model = loaded
    pred = model.predict(sample)
else:
    # Linear Regression was saved together with its scaler.
    model, sc = loaded
    pred = model.predict(sc.transform(sample))

# Undo the log1p applied to the target to report the prediction in the
# target's original units.
print("Predicted price:", np.expm1(pred[0]))
