#**Real Estate Intelligent Assistant**
# ==========================================
# Author: Syeda Madiha Mahvash
# Purpose:
# - Estimate property price
# - Classify Overpriced / Fair / Underpriced
# - Explain top influential price factors

#**Import Libraries**
# We import all the necessary Python libraries for:
# - Data handling (pandas, numpy)
# - Machine learning (sklearn)
# - Model evaluation
# - Optional: feature importance explanation
# Why: This prepares the environment for all tasks in the project.

# Load Dataset
# =========================================================
> Load the main property dataset from the unzipped folder.
> Why: This dataset contains property features and prices that will be used to train the ML model.
> Replace filename with your actual file inside real_estate_dataset folder.
#=======================================


In [20]:
import io
import os
import zipfile

from google.colab import files

# files.upload() returns a dict of the uploaded files (file name -> contents).
# When a file with the same name already exists, Colab stores the new upload
# under a different name (e.g. "archive (3).zip"), so opening the fixed path
# "archive.zip" would extract an older copy. Extract directly from the
# uploaded bytes so the archive that was just uploaded is the one used.
uploaded = files.upload()
zip_payloads = [
    content for name, content in uploaded.items() if name.lower().endswith(".zip")
]

if zip_payloads:
    for content in zip_payloads:
        with zipfile.ZipFile(io.BytesIO(content)) as zip_ref:
            zip_ref.extractall("real_estate_dataset")
else:
    # No zip was uploaded in this run: fall back to an archive already on disk.
    with zipfile.ZipFile("archive.zip", "r") as zip_ref:
        zip_ref.extractall("real_estate_dataset")

# Show what was extracted so the data file names can be checked.
print(os.listdir("real_estate_dataset"))


Saving archive.zip to archive (3).zip
['property_unseen.json', 'property_meta.json', 'property_level.json']


In [21]:
import os

# Confirm which files are available in the extracted dataset folder.
extracted_files = os.listdir("real_estate_dataset")
print(extracted_files)


['property_unseen.json', 'property_meta.json', 'property_level.json']


# ======================================================
# Data Preprocessing
# =========================================================
# We need to convert all categorical (string/object) columns to numerical values.
# Why: ML models like RandomForest cannot work with strings directly.

> Copy dataset to avoid overwriting


In [22]:
import pandas as pd

# Read the property-level listings and take a first look at them.
df = pd.read_json("real_estate_dataset/property_level.json")

preview = df.head()
print(preview)
print(df.shape)


   id  type   location  carpet_area  bed_room  kitchen  living_room  \
0   1  Flat  Bangalore         2000         3        1            1   
1   2  Flat      Delhi         1500         2        1            1   
2   3  Flat    Chennai         3400         6        1            1   
3   4  Flat      Delhi         2400         4        1            1   
4   5  Flat      Delhi         1800         3        1            1   

   dining_room  toilet  balcony parking_area  floor  window_no  entrance_no  \
0            1       2        1       public      7          6            1   
1            0       1        0       public      3          5            1   
2            1       5        4       public     14         12            1   
3            1       3        2       public      9          8            1   
4            1       2        1       public      6          6            1   

  supplience home_loan  wall_thick    price  
0        Yes       Yes           6  1300000  
1     

#============Load Dataset=======

In [23]:
# pandas provides the DataFrame used throughout the notebook
import pandas as pd

# Read the main dataset (property features plus the price column)
df = pd.read_json("real_estate_dataset/property_level.json")

# Quick overview: leading rows, (rows, columns) size, and the column labels
first_rows = df.head()
print(first_rows)
print("Dataset Shape:", df.shape)
print("Columns:", df.columns)


   id  type   location  carpet_area  bed_room  kitchen  living_room  \
0   1  Flat  Bangalore         2000         3        1            1   
1   2  Flat      Delhi         1500         2        1            1   
2   3  Flat    Chennai         3400         6        1            1   
3   4  Flat      Delhi         2400         4        1            1   
4   5  Flat      Delhi         1800         3        1            1   

   dining_room  toilet  balcony parking_area  floor  window_no  entrance_no  \
0            1       2        1       public      7          6            1   
1            0       1        0       public      3          5            1   
2            1       5        4       public     14         12            1   
3            1       3        2       public      9          8            1   
4            1       2        1       public      6          6            1   

  supplience home_loan  wall_thick    price  
0        Yes       Yes           6  1300000  
1     

#======Check Missing Values======

In [24]:

# Count the missing entries per column (isna is the same check as isnull)
missing_per_column = df.isna().sum()
print(missing_per_column)


id              0
type            0
location        0
carpet_area     0
bed_room        0
kitchen         0
living_room     0
dining_room     0
toilet          0
balcony         0
parking_area    0
floor           0
window_no       0
entrance_no     0
supplience      0
home_loan       0
wall_thick      0
price           0
dtype: int64


#=====Separate Features and Target=====

In [25]:
# Split the frame into model inputs (X) and the value to predict (y)
target_column = "price"

y = df[target_column]                  # target column
X = df.drop(columns=[target_column])   # every column except the target

print("Features Shape:", X.shape)
print("Target Shape:", y.shape)


Features Shape: (100, 17)
Target Shape: (100,)


#=======Encode Categorical Columns==========

In [26]:
from sklearn.preprocessing import LabelEncoder


# Columns encoded in this step. The list is explicit (not auto-detected);
# any remaining object columns are handled separately.
categorical_cols = ["location", "parking_area"]

# Keep every fitted encoder. df is overwritten in place below, so without the
# encoder the original label -> code mapping could not be recovered later
# (e.g. to encode new data consistently or to translate codes back to names).
label_encoders = {}

# Encode the listed categorical columns into integer codes
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [27]:
# Show each column's dtype to see which columns are still non-numeric (object).
print(df.dtypes)


id               int64
type            object
location         int64
carpet_area      int64
bed_room         int64
kitchen          int64
living_room      int64
dining_room      int64
toilet           int64
balcony          int64
parking_area     int64
floor            int64
window_no        int64
entrance_no      int64
supplience      object
home_loan       object
wall_thick       int64
price            int64
dtype: object


In [28]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the frame loaded earlier is not modified by this step
df_encoded = df.copy()

# Every column still stored as object dtype is treated as categorical
cat_cols = df_encoded.select_dtypes(include=["object"]).columns
print("Categorical Columns:", cat_cols)

# Replace each categorical column with its integer label codes
for col in cat_cols:
    le = LabelEncoder()
    le.fit(df_encoded[col])
    df_encoded[col] = le.transform(df_encoded[col])

print(" All categorical columns encoded!")


Categorical Columns: Index(['type', 'supplience', 'home_loan'], dtype='object')
✅ All categorical columns encoded!


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Model inputs and target, taken from the fully encoded frame.
# NOTE(review): `id` remains in X; it looks like a row identifier rather than
# a property attribute — confirm whether it should be a feature.
y = df_encoded["price"]
X = df_encoded.drop(columns=["price"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 200-tree random forest on the training portion
model = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)


print(" Model trained successfully!")


✅ Model trained successfully!


#========Evaluate Model Performance======

In [30]:
from sklearn.metrics import mean_absolute_error, r2_score

# Price predictions for the held-out rows
y_pred = model.predict(X_test)

# Compare predictions with the true prices
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("R2 Score:", r2)


MAE: 339137.5
R2 Score: 0.9960750634205229


#=========Price Prediction Function======

In [31]:
def predict_price(property_data):
    """
    Predict the price of a single property with the trained model.

    property_data = dictionary of property features. It must contain every
    column the model was trained on (the columns of X_train). Categorical
    values may be given as their original strings (e.g. a city name for
    "location"); values that are already numeric are passed through unchanged.

    Returns the predicted price for that one property.

    Raises:
        KeyError: if a training feature is missing from property_data.
        ValueError: if a categorical value never occurred in the training data.
    """
    input_df = pd.DataFrame([property_data])

    # Fitting a new LabelEncoder on this one-row frame would map every category
    # to 0 regardless of its value. The training frame was encoded in place, so
    # the raw training file is re-read to recover the original labels;
    # LabelEncoder assigns codes by sorted label order, so fitting on the same
    # raw column reproduces the mapping used during training.
    raw_train = pd.read_json("real_estate_dataset/property_level.json")

    for col in raw_train.select_dtypes(include=["object"]).columns:
        if col not in input_df.columns:
            continue
        if pd.api.types.is_numeric_dtype(input_df[col]):
            continue  # caller already supplied an encoded value
        encoder = LabelEncoder().fit(raw_train[col])
        input_df[col] = encoder.transform(input_df[col])

    # Present the features in exactly the order used for training.
    input_df = input_df[list(X_train.columns)]

    predicted = model.predict(input_df)[0]
    return predicted


#========Overpriced / Fair / Underpriced Classifier=====



In [48]:
def price_status(listed_price, predicted_price, *, tolerance_percent=10):
    """
    Compare listed price vs predicted price.

    Parameters:
        listed_price: asking price of the property.
        predicted_price: price estimated by the model; must be non-zero.
        tolerance_percent: width of the "fair" band, in percent of the
            predicted price (default 10). A listing is only flagged when it
            deviates from the prediction by more than this amount.

    Output: Status string (Overpriced / Fair / Underpriced)

    Raises:
        ZeroDivisionError: if predicted_price is 0, because a percentage
            difference relative to zero is undefined.
    """
    # Checked explicitly because numpy floats do not raise on division by
    # zero; they would yield inf/nan and produce a meaningless status.
    if predicted_price == 0:
        raise ZeroDivisionError(
            "predicted_price must be non-zero to compute a percentage difference"
        )

    # Signed deviation of the listing from the prediction, in percent.
    diff_percent = (listed_price - predicted_price) / predicted_price * 100

    if diff_percent > tolerance_percent:
        return f"Overpriced by {diff_percent:.2f}%"
    if diff_percent < -tolerance_percent:
        return f"Underpriced by {abs(diff_percent):.2f}%"
    return "Fairly Priced"

#=======Feature Importance (Influential Price Factors)======



In [49]:
# Create DataFrame showing importance of each feature
import pandas as pd

importance_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": model.feature_importances_
}).sort_values(by="Importance", ascending=False)

print("Top Influential Factors:")
print(importance_df.head(10))


Top Influential Factors:
         Feature  Importance
12     window_no    0.285180
3    carpet_area    0.184933
9        balcony    0.182736
8         toilet    0.166897
4       bed_room    0.165287
11         floor    0.008528
2       location    0.001848
0             id    0.001366
1           type    0.000766
10  parking_area    0.000763


#======Neighborhood-Level Explanation====

**What affects price most in Jaipur?**

*   List item
*   List item



In [52]:
def neighborhood_factors(city_name):
    """
    Provide insights about a specific neighborhood.
    Prints average price, listing count, and top price drivers.

    city_name: the location to summarise, given as the city name string
    (or as the value stored in df["location"]).

    The price drivers printed are the model's overall feature importances,
    not importances computed for this city only.
    """
    location_key = city_name

    # df["location"] is label-encoded in place earlier in the notebook, so a
    # city name no longer matches any row directly. Translate the name to its
    # integer code: LabelEncoder assigns codes by sorted label order, so
    # fitting on the raw training column reproduces the same mapping.
    if isinstance(city_name, str) and pd.api.types.is_numeric_dtype(df["location"]):
        raw_locations = pd.read_json(
            "real_estate_dataset/property_level.json"
        )["location"]
        encoder = LabelEncoder().fit(raw_locations)
        if city_name in encoder.classes_:
            location_key = encoder.transform([city_name])[0]

    subset = df[df["location"] == location_key]
    count = subset.shape[0]

    print(f" Neighborhood: {city_name}")
    print(f"Total Listings: {count}")
    if count:
        avg_price = subset["price"].mean()
        print(f"Average Price: ₹{avg_price:,.0f}")
    else:
        # The mean of an empty selection is NaN; report the absence explicitly.
        print("Average Price: N/A (no listings found for this location)")

    print("\nTop Price Drivers Overall:")
    print(importance_df.head(5))

#========Load Unseen Properties and Predict======



In [54]:
# Load new unseen properties
unseen = pd.read_json("real_estate_dataset/property_unseen.json")
print(unseen.head())


# Remove price column if present (the model must not see the target)
if "price" in unseen.columns:
    unseen = unseen.drop("price", axis=1)


# Encode categorical columns with the SAME label -> code mapping as training.
# Fitting a new LabelEncoder on the unseen data would assign codes from the
# unseen labels alone, which need not agree with the codes the model learned.
from sklearn.preprocessing import LabelEncoder

# The training frame was encoded in place, so re-read the raw training file to
# recover the original labels. LabelEncoder assigns codes by sorted label
# order, so fitting on the raw column reproduces the training mapping.
raw_train = pd.read_json("real_estate_dataset/property_level.json")

# All columns treated as categorical during training: 'location' and
# 'parking_area' (encoded first) plus the columns listed in cat_cols.
all_cols_to_encode = ["location", "parking_area"] + list(cat_cols)
for col in all_cols_to_encode:
    # Skip columns that are absent or already numeric in the unseen data
    if col not in unseen.columns or pd.api.types.is_numeric_dtype(unseen[col]):
        continue
    le = LabelEncoder().fit(raw_train[col])
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    encoded = unseen[col].map(code_by_label)

    # Labels that never occurred in training have no code; mark them with -1
    # so they stay distinguishable from every known category.
    unknown = encoded.isna()
    if unknown.any():
        print(
            f"Warning: {int(unknown.sum())} row(s) have '{col}' values not seen "
            "in training; encoded as -1"
        )
    unseen[col] = encoded.fillna(-1).astype(int)


# Predict prices, passing the features in exactly the order used for training
predicted_prices = model.predict(unseen[X_train.columns])

unseen["predicted_price"] = predicted_prices

print(unseen.head())

   id  type location  carpet_area  bed_room  kitchen  living_room  \
0   1     1   Jaipur        15600        37        1            1   
1   2     0  Lucknow         8400        19        1            1   
2   3     1   Nagpur        16000        38        1            1   
3   4     0   Indore         8600        19        1            1   
4   5     1    Thane        16400        39        1            1   

   dining_room  toilet  balcony parking_area  floor  window_no  entrance_no  \
0            1      35       34      private     36         74            2   
1            1      18       17       public     38         36            1   
2            1      36       35      private     37         76            2   
3            1      18       17       public     39         37            1   
4            1      37       36      private     38         78            2   

   supplience  home_loan  wall_thick     price  
0           1          1           8  29000000  
1           