# 🏠 House Price Prediction - Complete Project
## End-to-End Machine Learning Project

### Project Overview
- **Objective**: Predict house prices based on property features
- **Dataset**: Housing data with various features
- **Approach**: Regression modeling with multiple algorithms
- **Deployment**: Streamlit web application

---
## Part 1: Setup and Data Loading

In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import warnings
warnings.filterwarnings('ignore')

# Global plotting defaults: seaborn grid theme and a 12x6 default canvas
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("‚úÖ All libraries imported successfully!")

In [None]:
# Read the housing table from CSV, report its dimensions, preview the first rows
df = pd.read_csv('house_data.csv')
row_count, column_count = df.shape
print(f"Dataset Shape: {df.shape}")
print(f"Loaded {row_count} rows and {column_count} columns")
df.head()

---
## Part 2: Exploratory Data Analysis (EDA)

### 2.1 Basic Data Information

In [None]:
# Structural overview of the frame via DataFrame.info()
print("Dataset Information:", "=" * 60, sep="\n")
df.info()

In [None]:
# Summary statistics via DataFrame.describe()
print("Statistical Summary:", "=" * 60, sep="\n")
df.describe()

In [None]:
# Null counts per column, their share of all rows, and the grand total
print("Missing Values:", "=" * 60, sep="\n")
missing = df.isna().sum()
missing_pct = missing / len(df) * 100
missing_df = pd.DataFrame({'Missing': missing, 'Percentage': missing_pct})

# Only list the columns that actually contain nulls
has_missing = missing_df['Missing'] > 0
print(missing_df.loc[has_missing])
print(f"\nTotal missing values: {missing.sum()}")

### 2.2 Target Variable Analysis

In [None]:
# Target variable (Price): histogram, box plot and a text panel of key statistics
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
hist_ax, box_ax, stats_ax = axes
price_series = df['Price']

# Panel 1: histogram of prices
hist_ax.hist(price_series, bins=30, color='skyblue', edgecolor='black')
hist_ax.set_title('Price Distribution', fontsize=14, fontweight='bold')
hist_ax.set_xlabel('Price ($)', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)
hist_ax.grid(True, alpha=0.3)

# Panel 2: vertical box plot of prices
box_ax.boxplot(price_series, vert=True)
box_ax.set_title('Price Box Plot', fontsize=14, fontweight='bold')
box_ax.set_ylabel('Price ($)', fontsize=12)
box_ax.grid(True, alpha=0.3)

# Panel 3: summary statistics rendered as monospace text on a blank axis
price_stats = price_series.describe()
stats_text = f"""
Price Statistics:

Mean:     ${price_stats['mean']:,.2f}
Median:   ${price_stats['50%']:,.2f}
Std Dev:  ${price_stats['std']:,.2f}
Min:      ${price_stats['min']:,.2f}
Max:      ${price_stats['max']:,.2f}
"""
stats_ax.axis('off')
stats_ax.text(0.1, 0.5, stats_text, fontsize=12, family='monospace',
              verticalalignment='center')
stats_ax.set_title('Price Statistics', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

### 2.3 Feature Distributions

In [None]:
# Histograms of the numerical features on a 2x3 grid
numerical_cols = ['Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# zip stops at the shorter sequence, so the sixth axis is left untouched
for ax, col in zip(axes, numerical_cols):
    ax.hist(df[col], bins=20, color='lightcoral', edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col, fontsize=11)
    ax.set_ylabel('Frequency', fontsize=11)
    ax.grid(True, alpha=0.3)

# Five features on six axes: blank out the spare one
axes[-1].axis('off')
plt.tight_layout()
plt.show()

In [None]:
# Bar charts of category frequencies, one panel per categorical feature
categorical_cols = ['Location', 'Condition', 'Garage']

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, col in zip(axes, categorical_cols):
    counts = df[col].value_counts()
    ax.bar(counts.index, counts.values, color='lightgreen', edgecolor='black')
    ax.set_title(f'Distribution of {col}', fontsize=13, fontweight='bold')
    ax.set_xlabel(col, fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

### 2.4 Correlation Analysis

In [None]:
# Pairwise correlation of every numeric column, drawn as an annotated heatmap
numerical_features = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numerical_features].corr()

heatmap_options = {
    'annot': True,
    'cmap': 'RdYlGn',
    'center': 0,
    'square': True,
    'linewidths': 2,
    'cbar_kws': {"shrink": 0.8},
    'fmt': '.2f',
}

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

In [None]:
# Rank the numeric features by their correlation with Price (strongest positive first)
price_corr = df[numerical_features].corr()['Price'].sort_values(ascending=False)
print("Correlation with Price:", "=" * 60, sep="\n")
print(price_corr)

# Horizontal bars for every feature except Price itself:
# green for positive correlation, red otherwise
feature_corr = price_corr.drop('Price')
bar_colors = ['green' if coefficient > 0 else 'red' for coefficient in feature_corr]

plt.figure(figsize=(10, 6))
feature_corr.plot(kind='barh', color=bar_colors)
plt.title('Feature Correlation with Price', fontsize=14, fontweight='bold')
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.axvline(x=0, color='black', linestyle='--', linewidth=1)
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

### 2.5 Feature Relationships

In [None]:
# Scatter plots: each numerical feature against Price, with a linear trend line
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for idx, col in enumerate(numerical_cols):
    axes[idx].scatter(df[col], df['Price'], alpha=0.5, s=30)
    axes[idx].set_xlabel(col, fontsize=11)
    axes[idx].set_ylabel('Price ($)', fontsize=11)
    axes[idx].set_title(f'{col} vs Price', fontsize=12, fontweight='bold')
    axes[idx].grid(True, alpha=0.3)

    # Fit the trend line on complete (feature, Price) pairs only: this cell runs
    # on the raw frame, before missing values are imputed, and np.polyfit cannot
    # handle NaNs.
    valid = df[[col, 'Price']].dropna()

    # A line needs at least two distinct x values; otherwise skip the overlay.
    if valid[col].nunique() > 1:
        z = np.polyfit(valid[col], valid['Price'], 1)
        p = np.poly1d(z)
        # Draw between the x extremes so the dashed style stays visible instead
        # of being overdrawn by back-and-forth segments through unsorted points.
        x_ends = np.array([valid[col].min(), valid[col].max()])
        axes[idx].plot(x_ends, p(x_ends), "r--", alpha=0.8, linewidth=2)

# Blank out every grid cell that did not receive a feature
for spare_ax in axes[len(numerical_cols):]:
    spare_ax.axis('off')
plt.tight_layout()
plt.show()

In [None]:
# Price distribution per category level, one box-plot panel per categorical feature
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, col in zip(axes, categorical_cols):
    df.boxplot(column='Price', by=col, ax=ax)
    ax.set_title(f'{col} vs Price', fontsize=13, fontweight='bold')
    ax.set_xlabel(col, fontsize=12)
    ax.set_ylabel('Price ($)', fontsize=12)
    # Clear the figure-level title after each grouped boxplot call
    ax.get_figure().suptitle('')

plt.tight_layout()
plt.show()

---
## Part 3: Data Preprocessing

### 3.1 Data Cleaning

In [None]:
# Work on a copy so the raw frame used by the EDA cells stays untouched
df_clean = df.copy()

# Drop Id column if exists
if 'Id' in df_clean.columns:
    df_clean = df_clean.drop('Id', axis=1)
    print("‚úÖ Dropped 'Id' column")

# Check for missing values
print(f"\nMissing values: {df_clean.isnull().sum().sum()}")

# Impute missing values: median for numeric columns, most frequent value otherwise.
# The filled column is assigned back explicitly. `df[col].fillna(..., inplace=True)`
# operates on an intermediate object, which pandas 2.x warns about and which
# leaves the frame unchanged under Copy-on-Write.
for col in df_clean.columns[df_clean.isnull().any()]:
    column = df_clean[col]
    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.median()
    else:
        modes = column.mode()
        if modes.empty:
            # Entirely-null column: there is no value to impute from
            continue
        fill_value = modes[0]
    df_clean[col] = column.fillna(fill_value)

print(f"\n‚úÖ Dataset cleaned: {df_clean.shape}")

### 3.2 Feature Engineering

In [None]:
# Derived features.
# REFERENCE_YEAR is fixed rather than read from the clock so that 'Age' is
# reproducible; any code that builds inputs for the saved model has to use the
# same value.
REFERENCE_YEAR = 2025

df_clean['Age'] = REFERENCE_YEAR - df_clean['YearBuilt']
df_clean['TotalRooms'] = df_clean['Bedrooms'] + df_clean['Bathrooms']

# Guard the division: a row with zero rooms would otherwise produce inf, which
# the scaler and the regressors reject. Such a row is treated as a single room,
# so its Area_per_Room equals its Area; all other rows are unaffected.
rooms_divisor = df_clean['TotalRooms'].replace(0, 1)
df_clean['Area_per_Room'] = df_clean['Area'] / rooms_divisor

print("‚úÖ New features created:")
print("   - Age (house age in years)")
print("   - TotalRooms (bedrooms + bathrooms)")
print("   - Area_per_Room (area divided by total rooms)")

print(f"\nDataset shape after feature engineering: {df_clean.shape}")
df_clean.head()

### 3.3 Encode Categorical Variables

In [None]:
# Replace every object-dtype column with integer codes from its own LabelEncoder.
# The fitted encoders are kept by column name and written to disk.
categorical_cols = list(df_clean.select_dtypes(include=['object']).columns)
label_encoders = {}

print("Encoding categorical variables:", "=" * 60, sep="\n")

for col in categorical_cols:
    encoder = LabelEncoder()
    df_clean[col] = encoder.fit_transform(df_clean[col])
    label_encoders[col] = encoder

    # Show the class -> code mapping (a class's position in classes_ is its code)
    print(f"\n{col}:")
    for code, class_name in enumerate(encoder.classes_):
        print(f"  {class_name} ‚Üí {code}")

# Persist the fitted encoders
joblib.dump(label_encoders, 'label_encoders.pkl')
print("\n‚úÖ Label encoders saved")

---
## Part 4: Model Training and Evaluation

### 4.1 Prepare Data

In [None]:
# Target is the Price column; every remaining column is a model input
y = df_clean['Price']
X = df_clean.drop(columns='Price')

print(f"Features (X): {X.shape}")
print(f"Target (y): {y.shape}")
print(f"\nFeature columns: {list(X.columns)}")

In [None]:
# Hold out 20% of the rows for testing; the seed is fixed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train-Test Split:", "=" * 60, sep="\n")
# Labels are padded so the sample counts line up in the output
for label, subset in (("Training set:  ", X_train), ("Test set:      ", X_test)):
    print(f"{label} {len(subset)} samples ({len(subset) / len(X) * 100:.1f}%)")

In [None]:
# Standardize the features. The scaler is fitted on the training split only and
# then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler
joblib.dump(scaler, 'scaler.pkl')
print("‚úÖ Features scaled using StandardScaler")
print("‚úÖ Scaler saved")

### 4.2 Train Multiple Models

In [None]:
# Candidate regressors keyed by display name; insertion order is the order in
# which they are trained and reported
models = {}
models['Linear Regression'] = LinearRegression()
models['Random Forest'] = RandomForestRegressor(
    n_estimators=100, random_state=42, n_jobs=-1
)
models['Gradient Boosting'] = GradientBoostingRegressor(
    n_estimators=100, random_state=42
)

print("Training Models:", "=" * 60, sep="\n")

In [None]:
# Fit every candidate on the scaled training data and score it on both splits


def _regression_metrics(y_true, y_pred):
    """Return (MSE, RMSE, MAE, R²) for one set of predictions."""
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, mae, r2


def _print_metrics(split_label, mse, rmse, mae, r2):
    """Print the four metrics under a heading naming the split."""
    print(f"\nüìä {split_label} Metrics:")
    print(f"   MSE:  ${mse:,.2f}")
    print(f"   RMSE: ${rmse:,.2f}")
    print(f"   MAE:  ${mae:,.2f}")
    print(f"   R¬≤:   {r2:.4f}")


results = {}
divider = "=" * 60
metric_labels = ('MSE', 'RMSE', 'MAE', 'R¬≤')

for name, model in models.items():
    print(f"\n{divider}")
    print(f"Training: {name}")
    print(divider)

    model.fit(X_train_scaled, y_train)

    train_scores = _regression_metrics(y_train, model.predict(X_train_scaled))
    test_scores = _regression_metrics(y_test, model.predict(X_test_scaled))

    # Interleave train/test per metric so the column order of the results table
    # is: Train MSE, Test MSE, Train RMSE, Test RMSE, ...
    model_scores = {}
    for label, train_value, test_value in zip(metric_labels, train_scores, test_scores):
        model_scores[f'Train {label}'] = train_value
        model_scores[f'Test {label}'] = test_value
    results[name] = model_scores

    _print_metrics('Training', *train_scores)
    _print_metrics('Test', *test_scores)

print("\n" + divider)
print("‚úÖ All models trained successfully!")
print(divider)

### 4.3 Model Comparison

In [None]:
# One row per model, one column per metric
results_df = pd.DataFrame(results).transpose()
print("\nModel Performance Comparison:")
print("=" * 60)
print(results_df)

# The winner is the model with the highest R² on the test split.
# NOTE(review): choosing on the test split makes the reported test score
# optimistic; consider selecting on a validation split or cross-validation.
best_model_name = results_df['Test R¬≤'].idxmax()
best_test_r2 = results_df.loc[best_model_name, 'Test R¬≤']
print(f"\nüèÜ Best Model: {best_model_name}")
print(f"   Test R¬≤: {best_test_r2:.4f}")

In [None]:
# 2x2 dashboard: three single-metric bar charts plus a train-vs-test R² panel
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
bar_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

# (axis, results column, panel title, y-axis label) for the test-metric panels
metric_panels = [
    (axes[0, 0], 'Test R¬≤', 'R¬≤ Score Comparison', 'R¬≤ Score'),
    (axes[0, 1], 'Test RMSE', 'RMSE Comparison', 'RMSE ($)'),
    (axes[1, 0], 'Test MAE', 'MAE Comparison', 'MAE ($)'),
]
for panel_ax, column, title, y_label in metric_panels:
    panel_ax.bar(results_df.index, results_df[column], color=bar_colors)
    panel_ax.set_title(title, fontsize=13, fontweight='bold')
    panel_ax.set_ylabel(y_label)
    panel_ax.tick_params(axis='x', rotation=45)
    panel_ax.grid(True, alpha=0.3, axis='y')

# Grouped bars: train and test R² side by side for each model
x = np.arange(len(results_df))
width = 0.35
r2_ax = axes[1, 1]
r2_ax.bar(x - width/2, results_df['Train R¬≤'], width,
          label='Train', color='#95E1D3')
r2_ax.bar(x + width/2, results_df['Test R¬≤'], width,
          label='Test', color='#F38181')
r2_ax.set_title('Train vs Test R¬≤', fontsize=13, fontweight='bold')
r2_ax.set_ylabel('R¬≤ Score')
r2_ax.set_xticks(x)
r2_ax.set_xticklabels(results_df.index)
r2_ax.tick_params(axis='x', rotation=45)
r2_ax.legend()
r2_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

### 4.4 Save Best Model

In [None]:
# Persist the winning estimator together with the ordered list of input columns
best_model = models[best_model_name]
feature_names = X.columns.tolist()

for artifact, path in (
    (best_model, 'house_price_model.pkl'),
    (feature_names, 'feature_names.pkl'),
):
    joblib.dump(artifact, path)

print("\n‚úÖ Model and artifacts saved:")
for saved_file in (
    'house_price_model.pkl',
    'scaler.pkl',
    'label_encoders.pkl',
    'feature_names.pkl',
):
    print(f"   - {saved_file}")

### 4.5 Prediction Analysis

In [None]:
# Score the held-out rows with the selected model
y_pred = best_model.predict(X_test_scaled)

# Per-row error table: signed error, absolute error, and absolute error as a
# percentage of the actual price
actual = y_test.values
signed_error = actual - y_pred
pred_analysis = pd.DataFrame({
    'Actual': actual,
    'Predicted': y_pred,
    'Error': signed_error,
    'Abs Error': np.abs(signed_error),
    'Pct Error': np.abs(signed_error / actual) * 100,
})

print("Sample Predictions:", "=" * 60, sep="\n")
print(pred_analysis.head(10))
print(f"\nAverage Absolute Error: ${pred_analysis['Abs Error'].mean():,.2f}")
print(f"Average Percentage Error: {pred_analysis['Pct Error'].mean():.2f}%")

In [None]:
# Diagnostics for the selected model: fit quality, residuals, percentage errors
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fit_ax, residual_ax, error_ax = axes

# Panel 1: actual vs predicted, with the y = x line as the ideal
price_range = [y_test.min(), y_test.max()]
fit_ax.scatter(y_test, y_pred, alpha=0.6, s=50)
fit_ax.plot(price_range, price_range, 'r--', lw=3, label='Perfect Prediction')
fit_ax.set_title('Actual vs Predicted Prices', fontsize=13, fontweight='bold')
fit_ax.set_xlabel('Actual Price ($)', fontsize=12)
fit_ax.set_ylabel('Predicted Price ($)', fontsize=12)
fit_ax.legend()
fit_ax.grid(True, alpha=0.3)

# Panel 2: residuals (actual minus predicted) against the predicted price
residuals = y_test - y_pred
residual_ax.scatter(y_pred, residuals, alpha=0.6, s=50)
residual_ax.axhline(y=0, color='r', linestyle='--', lw=3)
residual_ax.set_title('Residual Plot', fontsize=13, fontweight='bold')
residual_ax.set_xlabel('Predicted Price ($)', fontsize=12)
residual_ax.set_ylabel('Residuals ($)', fontsize=12)
residual_ax.grid(True, alpha=0.3)

# Panel 3: histogram of absolute percentage errors with the mean marked
pct_errors = pred_analysis['Pct Error']
mean_pct_error = pct_errors.mean()
error_ax.hist(pct_errors, bins=30, color='skyblue', edgecolor='black')
error_ax.axvline(mean_pct_error, color='red', linestyle='--', lw=2,
                 label=f'Mean: {mean_pct_error:.2f}%')
error_ax.set_title('Distribution of Percentage Errors', fontsize=13, fontweight='bold')
error_ax.set_xlabel('Percentage Error (%)', fontsize=12)
error_ax.set_ylabel('Frequency', fontsize=12)
error_ax.legend()
error_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

---
## Part 5: Project Summary

In [None]:
# Final report. Prints dataset and split sizes, the feature list, the selected
# model with its test-set metrics, the artifact files written by earlier cells,
# and suggested follow-up steps. Reads df, X, X_train, X_test, best_model_name,
# results_df and pred_analysis from the cells above; nothing is computed or
# saved here.
print("\n" + "="*70)
print("üéâ PROJECT SUMMARY")
print("="*70)
print(f"\nüìä Dataset Information:")
print(f"   Total samples: {len(df)}")
print(f"   Training samples: {len(X_train)}")
print(f"   Test samples: {len(X_test)}")
print(f"   Number of features: {X.shape[1]}")
print(f"\nüîß Features Used:")
# Number the features starting from 1 for display
for i, feature in enumerate(X.columns, 1):
    print(f"   {i}. {feature}")
print(f"\nüèÜ Best Model: {best_model_name}")
print(f"\nüìà Performance Metrics (Test Set):")
print(f"   R¬≤ Score:  {results_df.loc[best_model_name, 'Test R¬≤']:.4f}")
print(f"   RMSE:      ${results_df.loc[best_model_name, 'Test RMSE']:,.2f}")
print(f"   MAE:       ${results_df.loc[best_model_name, 'Test MAE']:,.2f}")
# "Avg Error" is the mean absolute percentage error over the test rows
print(f"   Avg Error: {pred_analysis['Pct Error'].mean():.2f}%")
print(f"\nüíæ Files Saved:")
print(f"   ‚úì house_price_model.pkl - Trained model")
print(f"   ‚úì scaler.pkl - Feature scaler")
print(f"   ‚úì label_encoders.pkl - Categorical encoders")
print(f"   ‚úì feature_names.pkl - Feature names")
print(f"\nüöÄ Next Steps:")
print(f"   1. Deploy the model using Streamlit: streamlit run app.py")
print(f"   2. Test the model with new data")
print(f"   3. Monitor model performance over time")
print(f"   4. Consider hyperparameter tuning for improvement")
print("\n" + "="*70)
print("‚úÖ Project completed successfully!")
print("="*70)