# 🏠 House Price Regression Project
This notebook follows the full rubric for Project 3: Predicting House Prices using various regression models.

In [None]:
# 📦 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# 📂 Load Dataset
# Read the training data (path is relative to the current working directory)
# into a DataFrame, then preview the first five rows to sanity-check the parse.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

## 📊 Data Understanding

In [None]:
# Structural overview: column dtypes and non-null counts.
# info() prints its report directly, so it needs no wrapper.
df.info()

# Summary statistics. Printed explicitly because a notebook cell only renders
# its final expression, so a bare `df.describe()` in the middle of the cell
# would be computed and then silently discarded.
print(df.describe())

# The 20 columns with the most missing values, largest count first.
# Left as the cell's last expression so the notebook renders it.
df.isnull().sum().sort_values(ascending=False).head(20)

In [None]:
# Correlation heatmap
# Pairwise correlations, restricted to the numeric columns of df.
corr = df.corr(numeric_only=True)

# Draw the matrix on a 12x8 inch figure with a diverging palette and
# no per-cell annotations.
plt.figure(figsize=(12, 8))
sns.heatmap(data=corr, annot=False, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

## 🧼 Preprocessing for Experiment 1

In [None]:
# Experiment 1 predicts SalePrice from six numeric columns.
features = [
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'TotalBsmtSF',
    'FullBath',
    'YearBuilt',
]
y = df['SalePrice']
X = df.loc[:, features]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both splits so no test information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 🤖 Linear Regression

In [None]:
# Fit ordinary least squares on the scaled training features and predict
# on the scaled held-out split.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# RMSE is the square root of the MSE. It is computed with np.sqrt rather than
# `mean_squared_error(..., squared=False)`, because the `squared` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6 (where it raises TypeError).
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
print(f"Linear Regression RMSE: {rmse_lr:.2f}")

## 🧪 Ridge Regression with Categorical Features

In [None]:
# Experiment 2: add two categorical columns to a subset of numeric predictors.
features = ['OverallQual', 'GrLivArea', 'GarageCars', 'Neighborhood', 'BldgType', 'YearBuilt']
X = df[features]
y = df['SalePrice']

categorical = ['Neighborhood', 'BldgType']
numerical = ['OverallQual', 'GrLivArea', 'GarageCars', 'YearBuilt']

# Standardize the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' keeps prediction from failing when the test split
# contains a category that was absent from the training split.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
])

# Bundling preprocessing with the model means the transformers are fitted on
# the training rows only when the pipeline is fitted.
ridge_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('ridge', Ridge(alpha=1.0))
])

# Same test fraction and seed as the other experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
ridge_pipeline.fit(X_train, y_train)
y_pred_ridge = ridge_pipeline.predict(X_test)

# RMSE is the square root of the MSE. It is computed with np.sqrt rather than
# `mean_squared_error(..., squared=False)`, because the `squared` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6 (where it raises TypeError).
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
print(f"Ridge Regression RMSE: {rmse_ridge:.2f}")

## 🧪 Lasso Regression with Feature Engineering

In [None]:
# Experiment 3: engineered feature. TotalSF sums the basement, first-floor and
# second-floor square footage; the new column is added to df in place.
df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
features = ['TotalSF', 'OverallQual', 'Neighborhood', 'YearBuilt']

categorical = ['Neighborhood']
numerical = ['TotalSF', 'OverallQual', 'YearBuilt']

# Standardize the numeric columns and one-hot encode the categorical one.
# handle_unknown='ignore' keeps prediction from failing when the test split
# contains a category that was absent from the training split.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
])

# Bundling preprocessing with the model means the transformers are fitted on
# the training rows only when the pipeline is fitted.
lasso_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('lasso', Lasso(alpha=0.1))
])

# Same test fraction and seed as the other experiments.
X = df[features]
y = df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lasso_pipeline.fit(X_train, y_train)
y_pred_lasso = lasso_pipeline.predict(X_test)

# RMSE is the square root of the MSE. It is computed with np.sqrt rather than
# `mean_squared_error(..., squared=False)`, because the `squared` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6 (where it raises TypeError).
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
print(f"Lasso Regression RMSE: {rmse_lasso:.2f}")

## 📊 Model Comparison

In [None]:
# One row per model, pairing its display name with its held-out RMSE.
results = pd.DataFrame([
    {'Model': 'Linear Regression', 'RMSE': rmse_lr},
    {'Model': 'Ridge Regression', 'RMSE': rmse_ridge},
    {'Model': 'Lasso Regression', 'RMSE': rmse_lasso},
])
# Left as the cell's last expression so the notebook renders the table.
results