In [1]:
# -----------------------------
# FINAL REAL ESTATE PREDICTION + ENRICHMENT PROJECT (A–Z)
# -----------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
import shap
import plotly.express as px
import gradio as gr
import xgboost as xgb
from geopy.distance import geodesic
from google.colab import files
import io

In [2]:
# -----------------------------
# STEP 1: FILE UPLOAD + DATA LOADING
# -----------------------------
print("\n📤 Please upload property.csv, uber.csv, restaurant.csv, crime.csv, and entertainment.csv")
uploaded = files.upload()
external_data = {}

for filename in uploaded.keys():
    try:
        if filename.lower().endswith('.xlsx'):
            content = pd.read_excel(io.BytesIO(uploaded[filename]))
        else:
            content = pd.read_csv(io.BytesIO(uploaded[filename]))
    except UnicodeDecodeError:
        content = pd.read_csv(io.BytesIO(uploaded[filename]), encoding='latin1')
    except pd.errors.ParserError:
        print(f"❌ Parser error in {filename}. Skipping.")
        continue

    if 'uber' in filename.lower():
        df_uber = content
    elif 'crime' in filename.lower():
        external_data['crime'] = content
    elif 'entertain' in filename.lower():
        external_data['entertainment'] = content
    elif 'restaurant' in filename.lower():
        external_data['restaurant'] = content
    elif 'property' in filename.lower() or 'house' in filename.lower():
        df = content



📤 Please upload property.csv, uber.csv, restaurant.csv, crime.csv, and entertainment.csv


Saving Bengaluru_Restaurants.csv to Bengaluru_Restaurants.csv
Saving South Crime Details.xlsx to South Crime Details.xlsx
Saving indian-movie-theatres.txt to indian-movie-theatres.txt
Saving uber.csv to uber.csv
Saving Bengaluru_House_Data.csv to Bengaluru_House_Data.csv


  warn(msg)


In [None]:

# -----------------------------
# STEP 2: CLEAN PROPERTY DATA
# -----------------------------
df.dropna(subset=['location', 'size', 'total_sqft', 'price'], inplace=True)
df['location'] = df['location'].astype(str).str.strip()
df['bhk'] = df['size'].str.extract(r'(\d+)').astype(float)
df['total_sqft_clean'] = df['total_sqft'].apply(lambda x: np.mean([float(i) for i in str(x).split('-')]) if '-' in str(x) else pd.to_numeric(x, errors='coerce'))
df.dropna(subset=['total_sqft_clean'], inplace=True)
df['price_per_sqft'] = (df['price'] * 1e5) / df['total_sqft_clean']
df['bath'] = df['bath'].fillna(df['bath'].median())
df['balcony'] = df['balcony'].fillna(0)

In [None]:
# -----------------------------
# STEP 3: MERGE EXTERNAL FEATURES
# -----------------------------
df_features = df.copy()
df_features['location'] = df_features['location'].astype(str).str.strip()

if 'crime' in external_data and 'location' in external_data['crime'].columns:
    external_data['crime']['location'] = external_data['crime']['location'].astype(str).str.strip()
    df_features = df_features.merge(external_data['crime'], on='location', how='left')
else:
    df_features['crime_rate'] = np.random.uniform(2, 10, len(df_features))

if 'entertainment' in external_data and 'location' in external_data['entertainment'].columns:
    external_data['entertainment']['location'] = external_data['entertainment']['location'].astype(str).str.strip()
    df_features = df_features.merge(external_data['entertainment'], on='location', how='left')
else:
    df_features['entertainment_centers'] = np.random.randint(1, 10, len(df_features))

if 'restaurant' in external_data and 'location' in external_data['restaurant'].columns:
    external_data['restaurant']['location'] = external_data['restaurant']['location'].astype(str).str.strip()
    rest_density = external_data['restaurant'].groupby('location').size().reset_index(name='restaurant_count')
    df_features = df_features.merge(rest_density, on='location', how='left')
else:
    df_features['restaurant_count'] = np.random.randint(5, 50, len(df_features))

crime_max = df_features['crime_rate'].max()
df_features['crime_score'] = (crime_max - df_features['crime_rate']) / crime_max * 10
df_features['entertainment_score'] = df_features['entertainment_centers'] / df_features['entertainment_centers'].max() * 10
rest_max = df_features['restaurant_count'].max()
df_features['restaurant_score'] = df_features['restaurant_count'] / rest_max * 10


In [None]:
# -----------------------------
# STEP 4: GEO-PROXIMITY SCORE
# -----------------------------
ref_point = (12.9716, 77.5946)
def dummy_coordinates(location):
    return (12.9 + hash(location)%10/100, 77.5 + hash(location)%10/100)

df_features['distance_to_center'] = df_features['location'].apply(lambda loc: geodesic(ref_point, dummy_coordinates(loc)).km)
dist_max = df_features['distance_to_center'].max()
df_features['proximity_score'] = (dist_max - df_features['distance_to_center']) / dist_max * 10

In [None]:
# -----------------------------
# STEP 5: ADVANCED FEATURES
# -----------------------------
df_features['room_density'] = df_features['total_sqft_clean'] / (df_features['bhk'] + df_features['bath'])
df_features['popularity_score'] = df_features.groupby('location')['price'].transform('count')
df_features['popularity_score'] = df_features['popularity_score'] / df_features['popularity_score'].max() * 10
df_features['volatility_score'] = df_features.groupby('location')['price'].transform('std')
df_features['volatility_score'] = df_features['volatility_score'].fillna(0)

In [None]:
# -----------------------------
# STEP 6: MODEL TRAINING
# -----------------------------
features = ['total_sqft_clean', 'bhk', 'bath', 'balcony', 'price_per_sqft',
            'crime_score', 'entertainment_score', 'proximity_score', 'restaurant_score',
            'room_density', 'popularity_score', 'volatility_score']
X = df_features[features]
y = df_features['price'] * 1e5

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
base_models = [
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('xgb', xgb.XGBRegressor(n_estimators=100, random_state=42, verbosity=0))
]
stack_model = StackingRegressor(estimators=base_models, final_estimator=LinearRegression())
stack_model.fit(X_train, y_train)

preds = stack_model.predict(X_test)
mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)

print(f"\n📊 Model Performance:")
print(f"MAE: ₹{mae:,.0f}")
print(f"R² Score: {r2:.3f}")


📊 Model Performance:
MAE: ₹506,501
R² Score: 0.960


In [None]:
# -----------------------------
# STEP 7: SHAP EXPLAINABILITY
# -----------------------------
explainer = shap.Explainer(stack_model.predict, X_train)
shap_values = explainer(X_test[:100])
shap.summary_plot(shap_values, X_test[:100], show=False)
plt.title("SHAP Summary Plot")
plt.tight_layout()
plt.savefig("shap_summary_plot.png")
plt.close()

PermutationExplainer explainer: 101it [02:23,  1.45s/it]
  shap.summary_plot(shap_values, X_test[:100], show=False)


In [None]:
# -----------------------------
# STEP 8: TIME SERIES (PHASE 6)
# -----------------------------
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df['year'] = df['Date'].dt.year
    yearly = df.groupby('year')['price'].mean().reset_index()
    fig = px.line(yearly, x='year', y='price', title='Average Price Trend Over Years')
    fig.write_html("price_trend.html")

In [None]:
# -----------------------------
# STEP 9: EXPORT ENRICHED DATA
# -----------------------------
df_features.to_csv("real_estate_enriched_v2.csv", index=False)
files.download("real_estate_enriched_v2.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# -----------------------------
# STEP 10: GRADIO DASHBOARD
# -----------------------------
def predict_all(location, sqft, bhk, bath, balcony):
    try:
        df_loc = df_features[df_features['location'] == location]
        if df_loc.empty:
            avg_pps = df_features['price_per_sqft'].mean()
            avg_crime = df_features['crime_score'].mean()
            avg_ent = df_features['entertainment_score'].mean()
            avg_prox = df_features['proximity_score'].mean()
            avg_rest = df_features['restaurant_score'].mean()
            avg_density = df_features['room_density'].mean()
            avg_pop = df_features['popularity_score'].mean()
            avg_vol = df_features['volatility_score'].mean()
            location_avg_price = df_features['price'].mean() * 1e5
        else:
            avg_pps = df_loc['price_per_sqft'].mean()
            avg_crime = df_loc['crime_score'].mean()
            avg_ent = df_loc['entertainment_score'].mean()
            avg_prox = df_loc['proximity_score'].mean()
            avg_rest = df_loc['restaurant_score'].mean()
            avg_density = df_loc['room_density'].mean()
            avg_pop = df_loc['popularity_score'].mean()
            avg_vol = df_loc['volatility_score'].mean()
            location_avg_price = df_loc['price'].mean() * 1e5

        input_df = pd.DataFrame([[sqft, bhk, bath, balcony, avg_pps, avg_crime, avg_ent, avg_prox, avg_rest,
                                  sqft / (bhk + bath), avg_pop, avg_vol]], columns=features)
        predicted_price = stack_model.predict(input_df)[0]
        status = "✅ Fairly Priced" if abs(predicted_price - location_avg_price) < 0.2 * location_avg_price else "⚠️ Overpriced"

        fig = px.bar(
            x=['Crime', 'Entertainment', 'Proximity', 'Restaurant', 'Room Density', 'Popularity', 'Volatility'],
            y=[avg_crime, avg_ent, avg_prox, avg_rest, avg_density, avg_pop, avg_vol],
            labels={'x': 'Factor', 'y': 'Score'},
            title=f"Area Score Breakdown: {location}"
        )

        return (
            f"₹{int(predicted_price):,}",
            round(avg_crime, 2),
            round(avg_ent, 2),
            round(avg_prox, 2),
            round(avg_rest, 2),
            round(avg_density, 2),
            round(avg_pop, 2),
            round(avg_vol, 2),
            status,
            fig
        )
    except Exception as e:
        return ("Error", "Error", "Error", "Error", "Error", "Error", "Error", "Error", str(e), None)

unique_locations = sorted(df_features['location'].unique())
demo = gr.Interface(
    fn=predict_all,
    inputs=[
        gr.Dropdown(choices=unique_locations, label="Location / Area"),
        gr.Number(label="Total Square Feet"),
        gr.Number(label="BHK"),
        gr.Number(label="Bathrooms"),
        gr.Number(label="Balconies")
    ],
    outputs=[
        gr.Textbox(label="Estimated Price (₹)"),
        gr.Textbox(label="Crime Score"),
        gr.Textbox(label="Entertainment Score"),
        gr.Textbox(label="Proximity Score"),
        gr.Textbox(label="Restaurant Score"),
        gr.Textbox(label="Room Density"),
        gr.Textbox(label="Popularity Score"),
        gr.Textbox(label="Volatility Score"),
        gr.Textbox(label="Pricing Status"),
        gr.Plot(label="Score Breakdown")
    ],
    title="🏡 PropVista: 360° Real Estate Analytics",
    description="Enter your location and property specs to get price, safety, and livability insights."
)
demo.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e682f52ae52ad71cf5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


