<a href="https://colab.research.google.com/github/Jainrani/House-Price-Prediction/blob/main/House_Price_Prediction_%E2%80%93_Complete_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ================================
# STEP 1: Import Libraries
# ================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ================================
# STEP 2: Load Dataset
# ================================
# Read the raw CSV (path is relative to the current working directory)
# into a DataFrame, then report its shape and preview the leading rows.
csv_path = "house_data.csv"
data = pd.read_csv(csv_path)

dataset_shape = data.shape
print("Dataset Shape:", dataset_shape)
print(data.head())

# ================================
# STEP 3: Exploratory Data Analysis (EDA)
# ================================
# Scatter two predictors against the target price, one figure each.
# Each entry is (column in `data`, x-axis label, figure title).
scatter_specs = (
    ("area", "Area (sq ft)", "Area vs Price"),
    ("bedrooms", "Bedrooms", "Bedrooms vs Price"),
)
for column, x_label, title in scatter_specs:
    plt.figure(figsize=(6, 4))
    plt.scatter(data[column], data["price"])
    plt.xlabel(x_label)
    plt.ylabel("Price")
    plt.title(title)
    plt.show()

# Pairwise correlation heatmap.
# Restrict to numeric/bool columns before calling corr(): since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# if the frame contains e.g. string columns. For an all-numeric frame the
# result is unchanged.
numeric_data = data.select_dtypes(include=["number", "bool"])
plt.figure(figsize=(6, 4))
sns.heatmap(numeric_data.corr(), annot=True, cmap="coolwarm")
plt.title("Feature Correlation Heatmap")
plt.show()

# ================================
# STEP 4: Feature & Target Split
# ================================
# Every column except "price" is a predictor; "price" is the target.
X = data.drop("price", axis=1)
y = data["price"]

# ================================
# STEP 5: Split Train/Test
# ================================
# Split BEFORE fitting any preprocessing so that the held-out rows cannot
# influence the statistics learned by the scaler.
X_train_lr, X_test_lr, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features for Polynomial Regression.
# The scaler is fitted on the training rows only; the test rows (and the
# full matrix) are transformed with those training-set statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_lr)
X_test_scaled = scaler.transform(X_test_lr)

# Full scaled matrix, row-aligned with X / y, kept under its original name
# for later sections that split it themselves with the same
# test_size/random_state.
X_scaled = scaler.transform(X)

# ================================
# STEP 6: Linear Regression
# ================================
# Fit an ordinary least-squares model on the training features and
# predict prices for the held-out rows.
lr_model = LinearRegression()
lr_model.fit(X_train_lr, y_train)
y_pred_lr = lr_model.predict(X_test_lr)

# Held-out error metrics; MSE is computed once and reused for RMSE.
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)

print("\n--- Linear Regression ---")
print("MAE:", lr_mae)
print("MSE:", lr_mse)
print("RMSE:", np.sqrt(lr_mse))
print("R2 Score:", lr_r2)

# Feature Importance
# One row per input column, ordered by absolute coefficient size
# (largest first).
lr_coef = pd.DataFrame({"Feature": X.columns, "Coefficient": lr_model.coef_})
lr_coef = lr_coef.sort_values(by="Coefficient", key=abs, ascending=False)
print("Feature Importance:\n", lr_coef)

# Plot Actual vs Predicted
# Both series are plotted against the positional index of the test rows.
lr_sample_idx = range(len(y_test))
plt.figure(figsize=(6, 4))
plt.scatter(lr_sample_idx, y_test, color="blue", label="Actual")
plt.scatter(lr_sample_idx, y_pred_lr, color="red", label="Predicted", alpha=0.7)
plt.title("Linear Regression: Actual vs Predicted")
plt.xlabel("Sample Index")
plt.ylabel("Price")
plt.legend()
plt.show()

# ================================
# STEP 7: Random Forest Regressor
# ================================
# Train a 100-tree random forest (seeded for reproducibility) on the same
# training split used for linear regression, then predict the test rows.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_lr, y_train)
y_pred_rf = rf_model.predict(X_test_lr)

# Held-out error metrics; MSE is computed once and reused for RMSE.
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)

print("\n--- Random Forest Regressor ---")
print("MAE:", rf_mae)
print("MSE:", rf_mse)
print("RMSE:", np.sqrt(rf_mse))
print("R2 Score:", rf_r2)

# Feature Importance
# One row per input column, ordered from most to least important
# according to the fitted forest's feature_importances_.
rf_importance = pd.DataFrame(
    {"Feature": X.columns, "Importance": rf_model.feature_importances_}
)
rf_importance = rf_importance.sort_values(by="Importance", ascending=False)
print("Feature Importance:\n", rf_importance)

# Bar chart of the importances in the sorted order above.
plt.figure(figsize=(6, 4))
plt.bar(rf_importance["Feature"], rf_importance["Importance"])
plt.title("Random Forest Feature Importance")
plt.show()

# ================================
# STEP 8: Polynomial Regression (degree=2)
# ================================
# Split the raw features first so the scaler below is fitted on training
# rows only; fitting it on the full dataset would leak test-set statistics
# into preprocessing. test_size/random_state match the earlier split of
# (X, y).
X_train_raw, X_test_raw, y_train_poly, y_test_poly = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize with training-set mean/variance, then reuse those statistics
# to transform the test rows.
poly_scaler = StandardScaler()
X_train_std = poly_scaler.fit_transform(X_train_raw)
X_test_std = poly_scaler.transform(X_test_raw)

# Degree-2 expansion: original terms, squares and pairwise products
# (no constant column; LinearRegression fits its own intercept).
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_std)
X_test_poly = poly.transform(X_test_std)

# Expanded matrix for every row, kept under its original name in case
# later cells reference it.
X_poly = poly.transform(poly_scaler.transform(X))

poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train_poly)
y_pred_poly = poly_model.predict(X_test_poly)

# Held-out error metrics; MSE is computed once and reused for RMSE.
poly_mse = mean_squared_error(y_test_poly, y_pred_poly)

print("\n--- Polynomial Regression ---")
print("MAE:", mean_absolute_error(y_test_poly, y_pred_poly))
print("MSE:", poly_mse)
print("RMSE:", np.sqrt(poly_mse))
print("R2 Score:", r2_score(y_test_poly, y_pred_poly))

# Feature Importance (Approximate)
# Coefficients of the expanded terms, ordered by absolute size. Inputs were
# standardized, so magnitudes are on a comparable scale, but interaction
# and squared terms still make this only a rough ranking.
poly_coef = pd.DataFrame({
    "Feature": poly.get_feature_names_out(X.columns),
    "Coefficient": poly_model.coef_,
}).sort_values(by="Coefficient", key=abs, ascending=False)
print("Top 10 Feature Importance:\n", poly_coef.head(10))

# Plot Actual vs Predicted
# Both series are plotted against the positional index of the test rows.
poly_sample_idx = range(len(y_test_poly))
plt.figure(figsize=(6, 4))
plt.scatter(poly_sample_idx, y_test_poly, color="blue", label="Actual")
plt.scatter(poly_sample_idx, y_pred_poly, color="red", label="Predicted", alpha=0.7)
plt.title("Polynomial Regression: Actual vs Predicted")
plt.xlabel("Sample Index")
plt.ylabel("Price")
plt.legend()
plt.show()

# ================================
# STEP 9: Summary Table for Interview
# ================================
# Side-by-side comparison of the three fitted models on their test rows.
# Each entry is (display name, actual targets, predicted targets).
evaluated_models = [
    ("Linear Regression", y_test, y_pred_lr),
    ("Random Forest", y_test, y_pred_rf),
    ("Polynomial Regression", y_test_poly, y_pred_poly),
]

summary = pd.DataFrame({
    "Model": [label for label, _, _ in evaluated_models],
    "R2 Score": [
        r2_score(actual, predicted)
        for _, actual, predicted in evaluated_models
    ],
    "RMSE": [
        np.sqrt(mean_squared_error(actual, predicted))
        for _, actual, predicted in evaluated_models
    ],
})

print("\n--- Model Comparison ---")
print(summary)
