# 🏠 Boston Housing Price Predictor

**Project**: Regression - Predicting House Prices  
**Level**: Beginner  
**Dataset**: Boston Housing Dataset (shipped with scikit-learn up to 1.1; the `load_boston` loader was removed in scikit-learn 1.2)  

## 📋 Project Overview

In this project, we'll predict house prices in Boston using regression techniques. This is perfect for learning:

- Regression analysis fundamentals
- Feature importance and relationships
- Multiple regression algorithms
- Regression evaluation metrics
- Residual analysis

Let's build our first regression model! 🏡

## 1. Import Libraries

In [None]:
# Utilities
import warnings

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# `load_boston` was removed in scikit-learn 1.2. Guard the import so that a
# newer scikit-learn does not abort this cell before the imports above are
# usable; the name is left as None when the loader is unavailable.
try:
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None

warnings.filterwarnings('ignore')

# Set style
# NOTE(review): the 'seaborn-v0_8' style-sheet name is only available in newer
# Matplotlib releases (3.6+ according to the Matplotlib docs) — confirm the
# minimum supported version.
# NOTE(review): presumably the palette is set after the style sheet so the
# style does not override it — keep these two calls in this order.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ All libraries imported successfully!")

## 2. Data Loading and Exploration

In [None]:
# Load Boston Housing dataset
#
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so the
# loader is imported here and, when it is unavailable, the same table is
# rebuilt from the original StatLib file using the replacement recipe given in
# scikit-learn's deprecation notice. The fallback requires network access.
# NOTE: scikit-learn dropped this dataset because of ethical concerns about
# the "B" feature; prefer `fetch_california_housing` for new work.
try:
    from sklearn.datasets import load_boston
except ImportError:
    from sklearn.utils import Bunch

    BOSTON_URL = "http://lib.stat.cmu.edu/datasets/boston"
    BOSTON_FEATURE_NAMES = [
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
    ]

    # The file starts with a 22-line text header; after that every record is
    # wrapped over two physical lines (11 values, then 2 more features + MEDV).
    raw = pd.read_csv(BOSTON_URL, sep=r"\s+", skiprows=22, header=None).values

    # Expose the same attributes the rest of the notebook reads from the
    # object returned by `load_boston()`.
    boston = Bunch(
        data=np.hstack([raw[::2, :], raw[1::2, :2]]),
        target=raw[1::2, 2],
        feature_names=np.array(BOSTON_FEATURE_NAMES),
    )
else:
    boston = load_boston()

# Create DataFrame: one column per feature plus the MEDV target
df = pd.DataFrame(data=boston.data, columns=boston.feature_names)
df['MEDV'] = boston.target

print("🏠 Boston Housing dataset loaded!")
print(f"Dataset shape: {df.shape}")
print(f"Features: {list(boston.feature_names)}")
print(f"Target: MEDV (Median home value in $1000s)")

In [None]:
# Peek at the top of the table; the trailing expression is the cell's output
print("🔍 First 5 rows:")
df.head(n=5)

In [None]:
# Headline facts about the table: size, target price range and missing cells
medv_column = df['MEDV']
house_count, column_count = df.shape
missing_cells = df.isnull().sum().sum()

print("📊 Dataset Information:")
print(f"Total houses: {house_count}")
# Every column except the MEDV target is a feature
print(f"Features: {column_count - 1}")
print(f"Price range: ${medv_column.min():.1f}k - ${medv_column.max():.1f}k")
print(f"Average price: ${medv_column.mean():.1f}k")
print(f"Missing values: {missing_cells}")

print("\n📈 Statistical Summary:")
# Left as the last expression so the summary table is the cell's output
df.describe()

## 3. Exploratory Data Analysis

In [None]:
# Price distribution: histogram with a mean marker (left) and box plot (right)
medv_values = df['MEDV']
medv_mean = medv_values.mean()

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
hist_ax, box_ax = axes

# Left panel — histogram, dashed red line at the mean
hist_ax.hist(medv_values, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
hist_ax.set_title('🏠 Distribution of House Prices', fontweight='bold', fontsize=14)
hist_ax.set(xlabel='Price ($1000s)', ylabel='Frequency')
hist_ax.axvline(medv_mean, color='red', linestyle='--',
                label=f'Mean: ${medv_mean:.1f}k')
hist_ax.legend()

# Right panel — box plot of the same values
box_ax.boxplot(medv_values)
box_ax.set_title('📦 House Price Box Plot', fontweight='bold', fontsize=14)
box_ax.set_ylabel('Price ($1000s)')

plt.tight_layout()
plt.show()

# Text summary of the same distribution
print("📊 Price Statistics:")
for stat_label, stat_value in (
    ("Mean", medv_mean),
    ("Median", medv_values.median()),
    ("Std Dev", medv_values.std()),
):
    print(f"• {stat_label}: ${stat_value:.1f}k")
print(f"• Skewness: {medv_values.skew():.2f}")

In [None]:
# Feature correlations with price
plt.figure(figsize=(12, 8))

# Remove the target's self-correlation by label (not by assuming it sorts
# first), then order the features from strongest to weakest |correlation|.
correlations = (
    df.corr()['MEDV']
    .drop('MEDV')
    .sort_values(key=abs, ascending=False)
)

# Create color map: red for negative, green for non-negative correlations
colors = ['red' if x < 0 else 'green' for x in correlations.values]

bars = plt.barh(range(len(correlations)), correlations.values, color=colors, alpha=0.7)
plt.yticks(range(len(correlations)), correlations.index)
plt.xlabel('Correlation with House Price')
plt.title('🔗 Feature Correlations with House Prices', fontweight='bold', fontsize=16)
plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)

# Add value labels just beyond the end of each bar, on the side it points to
for bar in bars:
    width = bar.get_width()
    points_right = width > 0
    plt.text(width + (0.01 if points_right else -0.01), bar.get_y() + bar.get_height()/2,
             f'{width:.3f}', ha='left' if points_right else 'right', va='center',
             fontweight='bold')

plt.tight_layout()
plt.show()

print("🔍 Top Positive Correlations:")
positive_corr = correlations[correlations > 0].head(3)
for feature, corr in positive_corr.items():
    print(f"• {feature}: {corr:.3f}")

print("\n🔍 Top Negative Correlations:")
# The series is sorted by descending |correlation|, so the strongest negative
# features are at the head of the filtered series (the tail holds the weakest).
negative_corr = correlations[correlations < 0].head(3)
for feature, corr in negative_corr.items():
    print(f"• {feature}: {corr:.3f}")

## 4. Data Preprocessing

In [None]:
# Separate the MEDV target from the predictor columns
y = df['MEDV']
X = df.drop(columns='MEDV')

print("🔧 Data preprocessing...")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Feature names: {list(X.columns)}")

# Flag target values outside the 1.5 * IQR fences around the quartiles
Q1, Q3 = (y.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
fence_width = 1.5 * IQR
lower_bound = Q1 - fence_width
upper_bound = Q3 + fence_width

below_fence = y.lt(lower_bound)
above_fence = y.gt(upper_bound)
outliers = y[below_fence | above_fence]
outlier_share = len(outliers) / len(y)

print("\n📊 Outlier Analysis:")
print(f"• Total outliers: {len(outliers)} ({outlier_share:.1%})")
print(f"• Price range: ${y.min():.1f}k - ${y.max():.1f}k")
print(f"• IQR bounds: ${lower_bound:.1f}k - ${upper_bound:.1f}k")

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("✂️ Data split completed:")
for split_name, split_features in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_name} set: {split_features.shape[0]} samples")
for split_name, split_target in (("Training", y_train), ("Testing", y_test)):
    print(f"{split_name} target mean: ${split_target.mean():.1f}k")

# Standardise the features: statistics are learned from the training rows only
# and then applied unchanged to the test rows
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n⚖️ Feature scaling completed!")
original_means = X_train.mean().round(2).tolist()
scaled_means = X_train_scaled.mean(axis=0).round(2).tolist()
print(f"Original feature means: {original_means[:3]}...")
print(f"Scaled feature means: {scaled_means[:3]}...")