In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
import xgboost as XGBRegressor
import pickle
import warnings
warnings.filterwarnings('ignore')



# Load the dataset
df = pd.read_csv('/content/Bengaluru_House_Data.csv')

# Display basic information
print("Dataset Information:")
print(f"Shape: {df.shape}")
df.info()
df.head()

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Data Cleaning and Preprocessing
print("\n---- Data Cleaning ----")

# Handle missing values
df.dropna(inplace=True)
print(f"Shape after dropping nulls: {df.shape}")

# Convert size to numeric (assuming it's in format like '2 BHK', extracting the number)
df['bhk'] = df['size'].apply(lambda x: int(x.split()[0]) if isinstance(x, str) and len(x.split()) > 0 and x.split()[0].isdigit() else np.nan)
df.dropna(subset=['bhk'], inplace=True)
df['bhk'] = df['bhk'].astype(int)

# Extract total_sqft as numeric
def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0]) + float(tokens[1]))/2
    try:
        return float(x)
    except:
        return np.nan

df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_num)
df.dropna(subset=['total_sqft'], inplace=True)

# Create price per square feet
df['price_per_sqft'] = df['price'] * 100000 / df['total_sqft']

# Handle bath
df.dropna(subset=['bath'], inplace=True)
df['bath'] = df['bath'].astype(int)

# Remove outliers
print("\n---- Removing Outliers ----")

# Function to remove outliers based on standard deviation
def remove_outliers_std(df, column, n_std):
    mean = df[column].mean()
    std = df[column].std()
    df_out = df[(df[column] <= mean + (n_std * std)) & (df[column] >= mean - (n_std * std))]
    print(f"Removed {df.shape[0] - df_out.shape[0]} outliers from {column}")
    return df_out

# Remove entries where number of bathrooms > number of bedrooms + 2
df = df[df['bath'] <= df['bhk'] + 2]
print(f"Removed entries where bath > bhk + 2: {df.shape}")

# Remove outliers based on price_per_sqft
df = remove_outliers_std(df, 'price_per_sqft', 3)

# Filter out properties with too small area per bedroom
df = df[df['total_sqft']/df['bhk'] >= 300]
print(f"Removed entries with total_sqft/bhk < 300: {df.shape}")

# Feature Engineering
print("\n---- Feature Engineering ----")

# Create a new feature for price per bedroom
df['price_per_bedroom'] = df['price'] / df['bhk']

# Convert balcony to numeric
df['balcony'] = pd.to_numeric(df['balcony'], errors='coerce')
df.dropna(subset=['balcony'], inplace=True)
df['balcony'] = df['balcony'].astype(int)

# Exploratory Data Analysis
print("\n---- Exploratory Data Analysis ----")



# Distribution of prices
plt.figure(figsize=(10, 6))
sns.histplot(df['price'], kde=True)
plt.title('Price Distribution')
plt.xlabel('Price (Lakhs)')
plt.ylabel('Frequency')
plt.savefig('price_distribution.png')
plt.close()

# Location vs Price
plt.figure(figsize=(16, 8))
locations = df.groupby('location')['price'].mean().sort_values(ascending=False).head(15).index
location_price = df[df['location'].isin(locations)].groupby('location')['price'].mean().sort_values(ascending=False)
sns.barplot(x=location_price.index, y=location_price.values)
plt.title('Top 15 Locations by Average Price')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('location_vs_price.png')
plt.close()

# Price vs Area type
plt.figure(figsize=(10, 6))
sns.boxplot(x='area_type', y='price', data=df)
plt.title('Price vs Area Type')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('area_type_vs_price.png')
plt.close()

# Prepare data for modeling
print("\n---- Preparing Data for Modeling ----")

# Select features to use
X = df[['area_type', 'location', 'bhk', 'total_sqft', 'bath', 'balcony']]
y = df['price']

# Count unique locations
location_counts = X['location'].value_counts()
# Group less frequent locations into 'Other'
threshold = 10  # Minimum number of properties to be a distinct location
X['location'] = X['location'].apply(lambda x: 'Other' if location_counts[x] < threshold else x)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

# Feature preprocessing
numeric_features = ['bhk', 'total_sqft', 'bath', 'balcony']
categorical_features = ['area_type', 'location']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Model Training and Evaluation
print("\n---- Model Training and Evaluation ----")

# Create pipelines for different models
pipeline_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

pipeline_gb = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor.XGBRegressor(random_state=42))
])

# Function to evaluate models
def evaluate_model(model, name, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\nModel: {name}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R²: {r2:.4f}")

    return {
        'model': model,
        'name': name,
        'rmse': rmse,
        'mae': mae,
        'r2': r2
    }

# Train and evaluate each model
models = [
    (pipeline_lr, "Linear Regression"),
    (pipeline_rf, "Random Forest"),
    (pipeline_gb, "Gradient Boosting"),
    (pipeline_xgb, "XGBoost")
]

results = []
for model, name in models:
    result = evaluate_model(model, name, X_train, X_test, y_train, y_test)
    results.append(result)

# Sort results by R²
results.sort(key=lambda x: x['r2'], reverse=True)
best_model = results[0]['model']
best_model_name = results[0]['name']

print(f"\nBest Model: {best_model_name}")
print(f"R² Score: {results[0]['r2']:.4f}")

# Hyperparameter tuning for the best model
print("\n---- Hyperparameter Tuning ----")

if best_model_name == "Random Forest":
    param_grid = {
        'regressor__n_estimators': [100, 200, 300],
        'regressor__max_depth': [None, 10, 20, 30],
        'regressor__min_samples_split': [2, 5, 10]
    }
    model_to_tune = pipeline_rf
elif best_model_name == "Gradient Boosting":
    param_grid = {
        'regressor__n_estimators': [100, 200, 300],
        'regressor__learning_rate': [0.01, 0.1, 0.2],
        'regressor__max_depth': [3, 5, 7]
    }
    model_to_tune = pipeline_gb
elif best_model_name == "XGBoost":
    param_grid = {
        'regressor__n_estimators': [100, 200, 300],
        'regressor__learning_rate': [0.01, 0.1, 0.2],
        'regressor__max_depth': [3, 5, 7]
    }
    model_to_tune = pipeline_xgb
else:  # Linear Regression doesn't have much to tune
    param_grid = {}
    model_to_tune = pipeline_lr

if param_grid:
    grid_search = GridSearchCV(model_to_tune, param_grid, cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)

    print(f"Best parameters: {grid_search.best_params_}")
    best_model = grid_search.best_estimator_

    # Evaluate the tuned model
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Tuned Model Performance:")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R²: {r2:.4f}")
else:
    print("No hyperparameter tuning for Linear Regression.")

# Visualize predicted vs actual values
plt.figure(figsize=(10, 6))
y_pred = best_model.predict(X_test)
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
plt.xlabel('Actual Price (Lakhs)')
plt.ylabel('Predicted Price (Lakhs)')
plt.title('Actual vs Predicted Prices')
plt.savefig('actual_vs_predicted.png')
plt.close()

# Feature importance for tree-based models
if best_model_name in ["Random Forest", "Gradient Boosting", "XGBoost"]:
    # Extract feature names from the preprocessor
    cat_features = best_model.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(categorical_features)
    feature_names = np.concatenate([numeric_features, cat_features])

    # Extract feature importances
    importances = best_model.named_steps['regressor'].feature_importances_

    # Plot the feature importances
    plt.figure(figsize=(12, 8))
    indices = np.argsort(importances)[-20:]  # Get indices of top 20 features
    plt.barh(range(len(indices)), importances[indices])
    plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
    plt.xlabel('Feature Importance')
    plt.title('Top 20 Feature Importances')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()

# Residual plot
plt.figure(figsize=(10, 6))
residuals = y_test - y_pred
plt.scatter(y_pred, residuals, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Price (Lakhs)')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.savefig('residual_plot.png')
plt.close()

# Deep Learning Model (Simple Neural Network)
print("\n---- Training Deep Learning Model ----")

# Only proceed if TensorFlow is available
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers

    # Preprocess the data
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    # Define the model
    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=[X_train_preprocessed.shape[1]]),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(16, activation='relu'),
        layers.Dense(1)
    ])

    # Compile the model
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae']
    )

    # Early stopping
    early_stopping = keras.callbacks.EarlyStopping(
        patience=20,
        min_delta=0.001,
        restore_best_weights=True
    )

    # Train the model
    history = model.fit(
        X_train_preprocessed, y_train,
        validation_split=0.2,
        epochs=100,
        batch_size=32,
        callbacks=[early_stopping],
        verbose=1
    )

    # Evaluate the model
    y_pred_dl = model.predict(X_test_preprocessed).flatten()
    dl_mse = mean_squared_error(y_test, y_pred_dl)
    dl_rmse = np.sqrt(dl_mse)
    dl_mae = mean_absolute_error(y_test, y_pred_dl)
    dl_r2 = r2_score(y_test, y_pred_dl)

    print(f"\nDeep Learning Model Performance:")
    print(f"RMSE: {dl_rmse:.2f}")
    print(f"MAE: {dl_mae:.2f}")
    print(f"R²: {dl_r2:.4f}")

    # Plot learning curves
    plt.figure(figsize=(12, 4))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model Loss')
    plt.ylabel('Loss (MSE)')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper right')

    plt.subplot(1, 2, 2)
    plt.plot(history.history['mae'])
    plt.plot(history.history['val_mae'])
    plt.title('Model MAE')
    plt.ylabel('MAE')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper right')

    plt.tight_layout()
    plt.savefig('deep_learning_curves.png')
    plt.close()

    # Compare DL with best traditional ML model
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred_dl, alpha=0.5, label='Deep Learning')
    plt.scatter(y_test, y_pred, alpha=0.5, label=best_model_name)
    plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
    plt.xlabel('Actual Price (Lakhs)')
    plt.ylabel('Predicted Price (Lakhs)')
    plt.title('Traditional ML vs Deep Learning')
    plt.legend()
    plt.savefig('ml_vs_dl_comparison.png')
    plt.close()

    # Save DL model if it's the best
    if dl_r2 > results[0]['r2']:
        print("Deep Learning model outperforms traditional ML models.")
        # Save the TF model
        model.save('house_price_model_dl')
        # Save the preprocessor
        with open('preprocessor.pkl', 'wb') as f:
            pickle.dump(preprocessor, f)
        final_model_type = "Deep Learning"
    else:
        print(f"Traditional ML model ({best_model_name}) outperforms Deep Learning.")
        final_model_type = best_model_name

except ImportError:
    print("TensorFlow not available. Skipping deep learning model.")
    final_model_type = best_model_name

# Save the best model
print("\n---- Saving the Best Model ----")

# Save the best model (traditional ML)
with open('model.pkl', 'wb') as f:
    pickle.dump(best_model, f)
print("Model saved as 'model.pkl'")

# Save the preprocessor separately (useful for the API)
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)
print("Preprocessor saved as 'preprocessor.pkl'")

# Create a metadata file with model information
metadata = {
    'model_type': final_model_type,
    'features': list(X.columns),
    'performance': {
        'rmse': float(results[0]['rmse']),
        'mae': float(results[0]['mae']),
        'r2': float(results[0]['r2'])
    }
}

import json
with open('model_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=4)
print("Model metadata saved as 'model_metadata.json'")

print("\n---- Model Building Complete ----")


Dataset Information:
Shape: (13320, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_type   13320 non-null  object 
 1   location    13319 non-null  object 
 2   size        13304 non-null  object 
 3   society     7818 non-null   object 
 4   total_sqft  13320 non-null  object 
 5   bath        13247 non-null  float64
 6   balcony     12711 non-null  float64
 7   price       13320 non-null  float64
dtypes: float64(3), object(5)
memory usage: 832.6+ KB

Missing Values:
area_type        0
location         1
size            16
society       5502
total_sqft       0
bath            73
balcony        609
price            0
dtype: int64

---- Data Cleaning ----
Shape after dropping nulls: (7496, 8)

---- Removing Outliers ----
Removed entries where bath > bhk + 2: (7479, 10)
Removed 6 outliers from price_per_sqft
Removed entries with total_