# Real Estate Price Prediction - Ahmedabad Real Data Training 🏠🇮🇳

## Overview
This notebook trains machine learning models on **real Ahmedabad property data** loaded from `../data/ahmedabad.csv`. The dataset contains actual property listings with prices in Indian Rupees (₹).

### Dataset Information
- **Source**: Real Ahmedabad property listings
- **Size**: 6,855+ properties
- **Currency**: Indian Rupees (₹)
- **Areas**: Vastrapur, Bopal, Shela, Satellite, Maninagar, and more
- **Features**: BHK, area, location, price, furnishing, status

---

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib

# Import our custom data processor
import sys
sys.path.append('../src')
from real_data_processor import RealDataProcessor

# Plot styling: start from matplotlib's 'default' style sheet, then switch
# seaborn's colour palette to "husl".
plt.style.use("default")
sns.set_palette("husl")

# Startup banner, printed once the imports above have succeeded.
for banner_line in (
    "✅ All libraries imported successfully!",
    "🏠 Ready to process real Ahmedabad property data!",
):
    print(banner_line)

In [None]:
# Load the raw Ahmedabad listings and print a quick structural overview.
print("🏠 Loading Real Ahmedabad Property Data...")

# The CSV path is relative to the notebook's working directory.
raw_csv_path = '../data/ahmedabad.csv'
df_raw = pd.read_csv(raw_csv_path)
print(f"✅ Loaded {df_raw.shape[0]} real properties from CSV")

# Shape and column names of the untouched frame.
print("\n📊 Raw Dataset Info:")
print(f"Shape: {df_raw.shape}")
print(f"Columns: {list(df_raw.columns)}")

# First rows, exactly as read from disk.
print("\n🔍 Sample Raw Data:")
raw_preview = df_raw.head()
print(raw_preview)

# Per-column count of null cells.
print("\n❓ Missing Values:")
missing_per_column = df_raw.isnull().sum()
print(missing_per_column)

In [None]:
# Turn the raw listings into the processed frame `df` used by every later cell.
print("🔧 Processing real Ahmedabad property data...")

# The cleaning logic lives in the project's RealDataProcessor (../src); this
# cell only hands it the raw frame and keeps the processor for later cells.
processor = RealDataProcessor()
df = processor.process_real_ahmedabad_data(df_raw)

# Headline facts about the processed frame.
print(f"\n📊 Processed dataset shape: {df.shape}")
price_column = df['price']
print(f"💰 Price range: ₹{price_column.min():,.0f} - ₹{price_column.max():,.0f}")
first_areas = df['neighborhood'].unique()[:8]  # first eight distinct values, in order of appearance
print(f"🏘️ Areas covered: {', '.join(first_areas)}...")

# Ten-row preview of the main columns.
preview_columns = ['price', 'bedrooms', 'square_feet', 'neighborhood', 'property_type']
print("\n🔍 Processed Data Sample:")
print(df[preview_columns].head(10))

# Summary statistics for the numeric columns.
stats_columns = ['price', 'bedrooms', 'bathrooms', 'square_feet', 'property_age']
print("\n📈 Basic Statistics:")
print(df[stats_columns].describe())

In [None]:
# Exploratory Data Analysis
#
# Draws a 2x3 grid of overview charts for the processed frame `df`, then
# prints a handful of headline numbers. Reads the columns 'price', 'bedrooms',
# 'square_feet', 'neighborhood' and 'property_type'.
print("📊 Performing Exploratory Data Analysis...")

# Prices are plotted and summarised in lac (1 lac = 100,000 ₹).
RUPEES_PER_LAC = 100000

# Create subplots for EDA
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Ahmedabad Real Estate - Exploratory Data Analysis', fontsize=16, fontweight='bold')

# Price distribution
ax = axes[0, 0]
ax.hist(df['price'] / RUPEES_PER_LAC, bins=50, alpha=0.7, color='skyblue')
ax.set_title('Price Distribution (in Lac ₹)')
ax.set_xlabel('Price (Lac ₹)')
ax.set_ylabel('Frequency')

# Bedrooms distribution (sorted by bedroom count, not by frequency)
bedroom_counts = df['bedrooms'].value_counts().sort_index()
ax = axes[0, 1]
ax.bar(bedroom_counts.index, bedroom_counts.values, alpha=0.7, color='lightgreen')
ax.set_title('Bedrooms Distribution')
ax.set_xlabel('Number of Bedrooms')
ax.set_ylabel('Count')

# Square feet distribution
ax = axes[0, 2]
ax.hist(df['square_feet'], bins=50, alpha=0.7, color='salmon')
ax.set_title('Square Feet Distribution')
ax.set_xlabel('Square Feet')
ax.set_ylabel('Frequency')

# Average price for the ten neighbourhoods with the most listings, ordered
# from most to least expensive.
top_neighborhoods = df['neighborhood'].value_counts().head(10).index
in_top_neighborhoods = df['neighborhood'].isin(top_neighborhoods)
neighborhood_prices = (
    df[in_top_neighborhoods]
    .groupby('neighborhood')['price']
    .mean()
    .sort_values(ascending=False)
)
bar_positions = range(len(neighborhood_prices))
ax = axes[1, 0]
ax.bar(bar_positions, neighborhood_prices.values / RUPEES_PER_LAC, alpha=0.7, color='gold')
ax.set_title('Average Price by Neighborhood (Top 10)')
ax.set_xlabel('Neighborhood')
ax.set_ylabel('Average Price (Lac ₹)')
ax.set_xticks(bar_positions)
ax.set_xticklabels(neighborhood_prices.index, rotation=45, ha='right')

# Property type distribution
prop_type_counts = df['property_type'].value_counts()
ax = axes[1, 1]
ax.pie(prop_type_counts.values, labels=prop_type_counts.index, autopct='%1.1f%%', startangle=90)
ax.set_title('Property Type Distribution')

# Price vs Square Feet scatter
ax = axes[1, 2]
ax.scatter(df['square_feet'], df['price'] / RUPEES_PER_LAC, alpha=0.5, color='purple')
ax.set_title('Price vs Square Feet')
ax.set_xlabel('Square Feet')
ax.set_ylabel('Price (Lac ₹)')

plt.tight_layout()
plt.show()

# Print key insights
average_price = df['price'].mean()
print("\n🔍 Key Insights:")
print(f"   💰 Average price: ₹{average_price:,.0f} ({average_price / RUPEES_PER_LAC:.1f} Lac)")
print(f"   🏠 Most common: {df['bedrooms'].mode()[0]:.0f} BHK properties")
print(f"   📐 Average size: {df['square_feet'].mean():.0f} sq ft")
# neighborhood_prices only contains the ten most-listed neighbourhoods, so the
# label says so instead of claiming a city-wide maximum.
print(f"   🏘️ Most expensive area (among top 10 by listings): {neighborhood_prices.index[0]}")
print(f"   🏗️ Most common type: {df['property_type'].mode()[0]}")

In [None]:
# Build the feature matrix / target, split them, and standardise the features.
print("🤖 Preparing data for machine learning...")

# Feature construction is delegated to the project's processor.
X, y = processor.prepare_features_for_ml(df)
print(f"✅ Features prepared: {X.shape[1]} features, {len(X)} samples")
print(f"📊 Feature columns: {list(X.columns)}")

# 80/20 split with a fixed seed, also delegated to the processor.
X_train, X_test, y_train, y_test = processor.create_train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("\n📊 Data split:")
for split_label, split_features in (("🎯 Training set", X_train), ("🧪 Test set", X_test)):
    print(f"   {split_label}: {len(split_features)} samples")

# Standardise features for the algorithms that need it. The scaler learns its
# statistics from the training split only and is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n✅ Data preprocessing complete!")
for range_label, target in (("💰 Training", y_train), ("🎯 Test", y_test)):
    print(f"   {range_label} price range: ₹{target.min():,.0f} - ₹{target.max():,.0f}")

In [None]:
# Train Multiple ML Models
#
# Fits six regressors on the training split, scores each one on the held-out
# test split and with 5-fold cross-validation, and stores the metrics together
# with the fitted estimator in `results` (keyed by model name).
from sklearn.pipeline import make_pipeline

print("🤖 Training Multiple ML Models on Real Ahmedabad Data...")
print("="*60)

# Define models to compare
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'Support Vector Regression': SVR(kernel='rbf'),
}

# Models that are fitted on the standardised feature matrix. Plain
# LinearRegression and the tree ensembles are fitted on the raw features.
scaled_model_names = {'Support Vector Regression', 'Ridge Regression', 'Lasso Regression'}

# Store results
results = {}

# Train and evaluate all models
for name, model in models.items():
    print(f"\n🤖 Training {name}...")

    if name in scaled_model_names:
        X_fit, X_eval = X_train_scaled, X_test_scaled
        # Cross-validate a scaler+model pipeline on the RAW training features so
        # the scaler is re-fitted inside every fold. Scoring on X_train_scaled
        # would let each validation fold influence the scaling statistics.
        cv_estimator = make_pipeline(StandardScaler(), model)
    else:
        X_fit, X_eval = X_train, X_test
        cv_estimator = model

    # The estimator stored in `results` is fitted on the full training split,
    # using the same (scaled or raw) matrix that later cells predict with.
    model.fit(X_fit, y_train)
    y_pred_train = model.predict(X_fit)
    y_pred_test = model.predict(X_eval)

    # cross_val_score works on clones, so the fitted `model` above is untouched.
    cv_scores = cross_val_score(cv_estimator, X_train, y_train, cv=5, scoring='r2')

    # Calculate metrics
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

    # Calculate MAPE (Mean Absolute Percentage Error)
    test_mape = np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100

    results[name] = {
        'train_r2': train_r2,
        'test_r2': test_r2,
        'test_mae': test_mae,
        'test_rmse': test_rmse,
        'test_mape': test_mape,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'model': model,
    }

    print(f"   📊 Train R²: {train_r2:.4f}")
    print(f"   📈 Test R²: {test_r2:.4f}")
    print(f"   💰 Test MAE: ₹{test_mae:,.0f} ({test_mae/100000:.1f} Lac)")
    print(f"   📊 Test MAPE: {test_mape:.1f}%")
    print(f"   🎯 CV Score: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

print("\n✅ All models trained successfully!")

In [None]:
# Model Performance Comparison and Analysis
#
# Tabulates the metrics collected in `results`, picks the best model, and
# plots the test-set R², MAE and MAPE of every model side by side.
print("📊 Model Performance Analysis...")

# Create performance comparison DataFrame (one row per trained model)
performance_df = pd.DataFrame([
    {
        'Model': name,
        'Test_R2': metrics['test_r2'],
        'Test_MAE': metrics['test_mae'],
        'Test_RMSE': metrics['test_rmse'],
        'Test_MAPE': metrics['test_mape'],
        'CV_Mean': metrics['cv_mean'],
        'CV_Std': metrics['cv_std'],
    }
    for name, metrics in results.items()
])

# Rank by mean cross-validated R², which is computed on the training split
# only. Ranking by test R² would select the model on the very data used to
# report its final performance, making the reported test metrics optimistic.
performance_df = performance_df.sort_values('CV_Mean', ascending=False)
print("\n🏆 Model Performance Ranking:")
print(performance_df.round(4))

# Find best model (first row after sorting)
best_model_name = performance_df.iloc[0]['Model']
best_metrics = results[best_model_name]
best_model = best_metrics['model']

print(f"\n🥇 Best Model: {best_model_name}")
print(f"   🎯 CV R² (selection criterion): {best_metrics['cv_mean']:.4f}")
print(f"   📈 Test R²: {best_metrics['test_r2']:.4f}")
print(f"   💰 Test MAE: ₹{best_metrics['test_mae']:,.0f}")
print(f"   📊 Test MAPE: {best_metrics['test_mape']:.1f}%")

# Visualize model performance
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (column, divisor, title, y-label, colour) for each panel; MAE is shown in
# lac (1 lac = 100,000 ₹), the other metrics are plotted as-is.
chart_specs = [
    ('Test_R2', 1, 'Model R² Score Comparison', 'R² Score', 'skyblue'),
    ('Test_MAE', 100000, 'Model MAE Comparison', 'MAE (Lac ₹)', 'lightgreen'),
    ('Test_MAPE', 1, 'Model MAPE Comparison', 'MAPE (%)', 'salmon'),
]
for ax, (column, divisor, title, ylabel, color) in zip(axes, chart_specs):
    ax.bar(performance_df['Model'], performance_df[column] / divisor, alpha=0.7, color=color)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Feature Importance Analysis
#
# Reports which input features matter most to `best_model`: impurity-based
# importances when the estimator exposes `feature_importances_`, otherwise
# coefficient magnitudes when it exposes `coef_`.
print("🔍 Feature Importance Analysis...")

# Feature importance for tree-based models
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    top_features = feature_importance.head(15)
    print(f"\n🔍 Top 15 Most Important Features ({best_model_name}):")
    print(top_features)

    # Plot feature importance (largest bar at the top)
    plt.figure(figsize=(12, 8))
    plt.barh(top_features['feature'], top_features['importance'], alpha=0.7, color='gold')
    plt.title(f'Top 15 Feature Importance - {best_model_name}', fontsize=14, fontweight='bold')
    plt.xlabel('Importance')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

    # Feature importance insights. zip() stops at the shorter input, so a
    # model with fewer than three features prints fewer lines instead of
    # raising IndexError.
    podium_labels = ("🥇 Most important", "🥈 Second most", "🥉 Third most")
    print("\n💡 Key Insights:")
    for podium_label, ranked in zip(podium_labels, feature_importance.head(3).itertuples(index=False)):
        print(f"   {podium_label}: {ranked.feature} ({ranked.importance:.3f})")

else:
    print(f"\n⚠️ Feature importance not available for {best_model_name}")

    # For linear models, show coefficients
    if hasattr(best_model, 'coef_'):
        coef_importance = pd.DataFrame({
            'feature': X.columns,
            'coefficient': best_model.coef_
        })
        # Rank by magnitude so large negative effects are not buried.
        coef_importance['abs_coefficient'] = np.abs(coef_importance['coefficient'])
        coef_importance = coef_importance.sort_values('abs_coefficient', ascending=False)

        print(f"\n📊 Top 10 Features by Coefficient Magnitude ({best_model_name}):")
        print(coef_importance.head(10))

In [None]:
# Diagnostic plots for the selected model's predictions on the test split.
print("📊 Visualizing Model Predictions...")

# The models in this list were fitted on standardised features, so they must
# also predict on the standardised test matrix.
needs_scaled_input = best_model_name in ['Support Vector Regression', 'Ridge Regression', 'Lasso Regression']
y_pred_best = best_model.predict(X_test_scaled if needs_scaled_input else X_test)

best_metrics = results[best_model_name]

# 2x2 grid: actual-vs-predicted, residuals, absolute error, percentage error.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle(f'Model Prediction Analysis - {best_model_name}', fontsize=16, fontweight='bold')
(ax_actual, ax_resid), (ax_err, ax_pct) = axes

# Actual vs predicted, with the y = x reference line spanning the actual range.
ax_actual.scatter(y_test/100000, y_pred_best/100000, alpha=0.6, color='blue')
price_span = [y_test.min()/100000, y_test.max()/100000]
ax_actual.plot(price_span, price_span, 'r--', lw=2)
ax_actual.set(
    xlabel='Actual Price (Lac ₹)',
    ylabel='Predicted Price (Lac ₹)',
    title='Actual vs Predicted Prices',
)
ax_actual.text(
    0.05, 0.95, f'R² = {best_metrics["test_r2"]:.4f}',
    transform=ax_actual.transAxes, fontsize=12, verticalalignment='top',
    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
)

# Residuals (actual minus predicted) against the predicted price.
residuals = y_test - y_pred_best
ax_resid.scatter(y_pred_best/100000, residuals/100000, alpha=0.6, color='green')
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set(
    xlabel='Predicted Price (Lac ₹)',
    ylabel='Residuals (Lac ₹)',
    title='Residuals Plot',
)

# Histogram of the residuals, in lac.
ax_err.hist(residuals/100000, bins=30, alpha=0.7, color='orange')
ax_err.set(
    xlabel='Prediction Error (Lac ₹)',
    ylabel='Frequency',
    title='Prediction Error Distribution',
)
ax_err.axvline(x=0, color='r', linestyle='--')

# Histogram of the signed error as a percentage of the actual price.
percentage_errors = ((y_test - y_pred_best) / y_test) * 100
ax_pct.hist(percentage_errors, bins=30, alpha=0.7, color='purple')
ax_pct.set(
    xlabel='Percentage Error (%)',
    ylabel='Frequency',
    title='Percentage Error Distribution',
)
ax_pct.axvline(x=0, color='r', linestyle='--')

plt.tight_layout()
plt.show()

# Headline metrics for the selected model, as recorded during training.
print("\n📊 Prediction Statistics:")
print(f"   📈 Mean Absolute Error: ₹{best_metrics['test_mae']:,.0f}")
print(f"   📊 Root Mean Square Error: ₹{best_metrics['test_rmse']:,.0f}")
print(f"   🎯 Mean Absolute Percentage Error: {best_metrics['test_mape']:.1f}%")
print(f"   📈 R² Score: {best_metrics['test_r2']:.4f}")

In [None]:
# Sample Predictions with Real Property Details
#
# Draws up to ten random test rows, predicts their price with `best_model`,
# and prints each prediction next to the property's details and actual price.
print("🏠 Testing Model with Sample Predictions...")

# Select random samples for testing. The size is clamped to the test-set
# length because sampling without replacement raises ValueError when asked
# for more rows than exist.
np.random.seed(42)
sample_size = min(10, len(X_test))
sample_indices = np.random.choice(X_test.index, size=sample_size, replace=False)
sample_X = X_test.loc[sample_indices]
sample_y_actual = y_test.loc[sample_indices]

# Models fitted on standardised features need the sample standardised too.
if best_model_name in ['Support Vector Regression', 'Ridge Regression', 'Lasso Regression']:
    sample_y_pred = best_model.predict(scaler.transform(sample_X))
else:
    sample_y_pred = best_model.predict(sample_X)

print("\n🏠 Sample Predictions vs Actual (Real Ahmedabad Properties):")
print("="*80)

# Upper bound on the percentage error (exclusive) -> verdict line.
accuracy_bands = (
    (10, "   ✅ Excellent prediction!"),
    (20, "   👍 Good prediction"),
    (30, "   ⚠️ Fair prediction"),
)

for i, (idx, predicted) in enumerate(zip(sample_indices, sample_y_pred)):
    actual = sample_y_actual.loc[idx]
    error = abs(actual - predicted)
    error_pct = (error / actual) * 100

    # Get property details from processed dataframe.
    # NOTE(review): assumes X_test keeps the index labels of `df` - confirm
    # against RealDataProcessor.prepare_features_for_ml / create_train_test_split.
    property_details = df.loc[idx]

    print(f"\n🏠 Property {i+1}:")
    print(f"   📍 Location: {property_details['neighborhood']}")
    print(f"   🏠 Type: {property_details['property_type']}")
    print(f"   🛏️ Bedrooms: {property_details['bedrooms']:.0f} BHK")
    print(f"   📐 Size: {property_details['square_feet']:.0f} sq ft")
    print(f"   🏗️ Age: {property_details['property_age']:.0f} years")
    print(f"   💰 Actual Price: ₹{actual:,.0f} ({actual/100000:.1f} Lac)")
    print(f"   🤖 Predicted Price: ₹{predicted:,.0f} ({predicted/100000:.1f} Lac)")
    print(f"   📊 Error: ₹{error:,.0f} ({error_pct:.1f}%)")

    # Add accuracy indicator: first band whose limit exceeds the error wins.
    verdict = next(
        (message for limit, message in accuracy_bands if error_pct < limit),
        "   ❌ Poor prediction",
    )
    print(verdict)

# Calculate sample statistics
sample_mae = mean_absolute_error(sample_y_actual, sample_y_pred)
sample_mape = np.mean(np.abs((sample_y_actual - sample_y_pred) / sample_y_actual)) * 100
sample_r2 = r2_score(sample_y_actual, sample_y_pred)

print("\n📊 Sample Performance:")
print(f"   💰 Sample MAE: ₹{sample_mae:,.0f} ({sample_mae/100000:.1f} Lac)")
print(f"   📊 Sample MAPE: {sample_mape:.1f}%")
print(f"   📈 Sample R²: {sample_r2:.4f}")

In [None]:
# Save the Best Model and Results
#
# Persists the selected estimator, the fitted scaler, the feature-name list,
# the comparison table and a JSON summary under ../models/.
import json
import os

print("💾 Saving Model and Results...")

# Create models directory if it doesn't exist
os.makedirs('../models', exist_ok=True)

# Save the best model
model_filename = f'../models/best_ahmedabad_model_{best_model_name.lower().replace(" ", "_")}.pkl'
joblib.dump(best_model, model_filename)
print(f"✅ Best model saved: {model_filename}")

# Save the scaler
scaler_filename = '../models/feature_scaler.pkl'
joblib.dump(scaler, scaler_filename)
print(f"✅ Feature scaler saved: {scaler_filename}")

# Save feature names
feature_names_filename = '../models/feature_names.pkl'
joblib.dump(list(X.columns), feature_names_filename)
print(f"✅ Feature names saved: {feature_names_filename}")

# Save model performance results
results_filename = '../models/model_performance_results.csv'
performance_df.to_csv(results_filename, index=False)
print(f"✅ Performance results saved: {results_filename}")

# These models were fitted on standardised features, so whoever loads the
# saved estimator must run inputs through the saved scaler first. The flag is
# written to the summary because the pickle alone does not reveal it.
requires_scaling = best_model_name in ['Support Vector Regression', 'Ridge Regression', 'Lasso Regression']

# Create a model summary. Metrics are cast to built-in numbers so the JSON
# encoder never has to deal with NumPy scalar types.
best_metrics = results[best_model_name]
model_summary = {
    'best_model': best_model_name,
    'test_r2': float(best_metrics['test_r2']),
    'test_mae': float(best_metrics['test_mae']),
    'test_mape': float(best_metrics['test_mape']),
    'training_samples': int(len(X_train)),
    'test_samples': int(len(X_test)),
    'features_count': int(X.shape[1]),
    'dataset_size': int(len(df)),
    'price_range_min': float(df['price'].min()),
    'price_range_max': float(df['price'].max()),
    'requires_scaling': requires_scaling,
}

summary_filename = '../models/model_summary.json'
with open(summary_filename, 'w', encoding='utf-8') as f:
    json.dump(model_summary, f, indent=2)
print(f"✅ Model summary saved: {summary_filename}")

print("\n🎉 Model Training Complete!")
print("\n📊 Final Results Summary:")
print(f"   🥇 Best Model: {best_model_name}")
print(f"   📈 R² Score: {best_metrics['test_r2']:.4f}")
print(f"   💰 Mean Absolute Error: ₹{best_metrics['test_mae']:,.0f}")
print(f"   📊 Mean Absolute Percentage Error: {best_metrics['test_mape']:.1f}%")
print(f"   🏠 Trained on {len(df)} real Ahmedabad properties")
print(f"   🔍 Using {X.shape[1]} features")
print("\n💾 All model artifacts saved in '../models/' directory")
print("🚀 Ready for deployment and predictions!")
print(f"🚀 Ready for deployment and predictions!")