# Flat Price Estimation for UrbanNest Realtors
**Date:** 2025-07-05

This notebook walks through the complete machine‑learning pipeline to predict urban flat prices based on structured features.

## 1. Setup & Data Loading

In [None]:

import pandas as pd
import numpy as np
from pathlib import Path

# Location of the raw dataset, relative to the notebook's working directory.
# Edit this if the CSV lives somewhere else.
CSV_PATH = Path('Flat Price Estimation for UrbanNest Realtors (1).csv')
df = pd.read_csv(CSV_PATH)

# Report (rows, columns), then preview the first five records.
print(f"Dataset shape: {df.shape}")
df.head(5)


## 2. Data Exploration

In [None]:

# Column dtypes and non-null counts (printed directly by pandas).
df.info()

# Summary statistics, transposed so each described column becomes a row.
df.describe().transpose()


### 2.1 Correlation Heatmap

In [None]:

import matplotlib.pyplot as plt

# Pairwise correlations, restricted to the numeric columns.
corr = df.corr(numeric_only=True)

# The matrix is square, so one set of tick positions serves both axes.
tick_positions = range(len(corr.columns))

fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(corr, cmap='viridis')
ax.set_xticks(tick_positions, corr.columns, rotation=45, ha='right')
ax.set_yticks(tick_positions, corr.index)
fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
ax.set_title('Feature Correlation')
fig.tight_layout()
plt.show()


## 3. Train‑Test Split & Preprocessing

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Separate the predictors from the target column.
X = df.drop(columns='flat_price')
y = df['flat_price']

# Scale only the columns that really are numeric (bool included, since
# StandardScaler accepts it). Taking every column would make StandardScaler
# raise at fit time as soon as the CSV contains a text/categorical column.
numeric_features = X.select_dtypes(include=['number', 'bool']).columns.tolist()

# ColumnTransformer drops columns it is not told about (remainder='drop'),
# so make any exclusion visible instead of silent.
skipped_features = [col for col in X.columns if col not in numeric_features]
if skipped_features:
    print(f"Non-numeric columns excluded from the model: {skipped_features}")

preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numeric_features)]
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


## 4. Model Training & Evaluation

In [None]:

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import math, pandas as pd

from sklearn.base import clone

# Candidate regressors, keyed by the display name used in the results table.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(
        n_estimators=300, random_state=42
    ),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# One record per model: the hold-out metrics plus the fitted pipeline itself,
# so the best one can be retrieved later by name.
results = []

for name, model in models.items():
    # Pipeline does not copy its steps, so give each pipeline its own
    # unfitted copy of the preprocessor. Sharing one instance would make every
    # fitted pipeline (including the one persisted later) alias the same
    # transformer object.
    pipe = Pipeline(steps=[('prep', clone(preprocessor)), ('model', model)])
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    results.append({
        'Model': name,
        'R2': r2_score(y_test, pred),
        'MAE': mean_absolute_error(y_test, pred),
        'RMSE': math.sqrt(mean_squared_error(y_test, pred)),
        'Pipeline': pipe
    })

# Metrics table, best R2 first. Built as one chained expression (no in-place
# mutation) so re-running the cell always yields the same frame.
results_df = (
    pd.DataFrame(results)
    .drop(columns=['Pipeline'])
    .sort_values('R2', ascending=False)
)
results_df


### 4.1 Select Best Model

In [None]:

# results_df is ordered by R2 (descending), so its first row is the winner.
best_row = results_df.iloc[0]
best_name = best_row['Model']

# Look up the fitted pipeline stored alongside that model's metrics.
best_pipeline = next(
    record['Pipeline'] for record in results if record['Model'] == best_name
)
print(f"Best model: {best_name} with R2 = {best_row.R2:.4f}")


## 5. Feature Importance (Tree‑based models)

In [None]:

from IPython.display import display

# Only tree-based estimators expose feature_importances_; linear models do not.
fitted_model = best_pipeline.named_steps['model']
if hasattr(fitted_model, 'feature_importances_'):
    importances = fitted_model.feature_importances_
    # The preprocessor passes exactly the numeric_features columns, in order,
    # to the model, so they line up one-to-one with the importances.
    feature_names = numeric_features
    imp_df = (
        pd.DataFrame({'Feature': feature_names, 'Importance': importances})
        .sort_values('Importance', ascending=False)
    )
    # Jupyter only auto-renders the last *top-level* expression of a cell.
    # An expression inside an `if` body is never shown, so display explicitly.
    display(imp_df.head(10))
else:
    print(f"{type(fitted_model).__name__} does not expose feature_importances_.")


## 6. Persist Model

In [None]:

import joblib
# Serialise the selected pipeline (preprocessing + estimator) to disk,
# relative to the current working directory.
MODEL_PATH = Path('flat_price_model.pkl')
joblib.dump(value=best_pipeline, filename=MODEL_PATH)

# Echo the absolute location so the artefact is easy to find.
print(f'Model saved to {MODEL_PATH.resolve()}')


## 7. Quick Prediction Example

In [None]:

# Score the first five hold-out rows and put predictions next to the true prices.
sample = X_test.head(5)
preds = best_pipeline.predict(sample)

comparison = pd.DataFrame({'Actual': y_test.head(5), 'Predicted': preds})
comparison


## 8. (Optional) Streamlit App Snippet


```python
# Save this as app.py
import streamlit as st
import pandas as pd
import joblib


@st.cache_resource
def load_model(path='flat_price_model.pkl'):
    """Load the persisted pipeline once and reuse it across reruns.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, the model file would be read from disk each time.
    """
    # joblib.load unpickles the file: only load model files you created yourself.
    return joblib.load(path)


model = load_model()

st.title('Urban Flat Price Estimator')

# Create input widgets
area = st.number_input('Area (sqft)', 300, 5000, step=50)
bedrooms = st.slider('Bedrooms', 1, 6, 2)
distance = st.number_input('Distance to Metro (km)', 0.0, 20.0, step=0.1)
age = st.slider('Age of Flat (years)', 0, 50, 5)
amenities = st.slider('Amenities Score (0-10)', 0.0, 10.0, step=0.1)

if st.button('Estimate Price'):
    # NOTE(review): the keys below must match the feature column names of the
    # training CSV exactly; confirm against the dataset.
    X_new = pd.DataFrame([{
        'area_sqft': area,
        'bedrooms': bedrooms,
        'distance_to_metro_km': distance,
        'age_of_flat_years': age,
        'amenities_score': amenities
    }])
    price = model.predict(X_new)[0]
    st.success(f'Estimated Price: ₹{price:,.0f}')
```
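Run it from the directory that contains `flat_price_model.pkl` (the app loads the model via a relative path):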
```bash
streamlit run app.py
```
