In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from
# pandas/scikit-learn; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Global plot appearance used by any figure drawn later in the notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [None]:
# Load the raw listings; the path is relative to the notebook's working directory.
df = pd.read_csv('mumbai_flats.csv')
# Report the raw dimensions before any cleaning is applied.
print(f"Raw Data: {df.shape[0]} rows × {df.shape[1]} columns")
# Preview the first rows (last expression, so it is rendered as the cell output).
df.head()

Raw Data: 9955 rows × 18 columns


Unnamed: 0,flat_type,price,address,buildup_area,avg_price_sqft,age_of_property,possesion_status,which_floor,facing,furnicing,project_name,brokrage,carpet_area,bedrooms,bathrooms,balcony,parking,about_property
0,3 BHK Flat,3.95 Cr,"Runwal Forest Tower 5 To 8, Kanjurmarg West, M...",1550 sq.ft,₹25.48 K/sq.ft,3 Year Old,Ready to move,Higherof 50 floors,Fully Furnished,,Runwal Forest Tower 5 To 8,4.0 LacsAccess Zero Brokerage Properties,1130 sq.ft,3,2,1,2 Covered Parking,Exclusive 3 bhk with 3 bathroom apartment ava...
1,1 BHK Flat,19.0 L,"Ananta Vardhman Nagar, Tembhode, Palghar, Mumbai",630 sq.ft,₹3.02 K/sq.ft,3 Year Old,Ready to move,Middleof 4 floors,Unfurnished,,Ananta Vardhman Nagar,No ChargeAccess Zero Brokerage Properties,390 sq.ft,1,1,2,1 Open Parking,"1 BHK Flat for sale in Palghar, Mumbai - conta..."
2,1 BHK Flat,73.0 L,"JP North Barcelona, Kashimira, Mira Road East,...",580 sq.ft,₹12.59 K/sq.ft,1 Years Old,Ready to move,Lowerof 25 floors,Semi Furnished,,JP North Barcelona,1.5 LacsAccess Zero Brokerage Properties,475 sq.ft,1,2,2,No Parking,1 BHK Flat for sale in Mumbai. This property i...
3,1 BHK Flat,1.1 Cr,"Chandak Nischay, Ambawadi, Dahisar East, Mumbai",410 sq.ft,₹26.83 K/sq.ft,1 Years Old,Ready to move,Higherof 35 floors,Semi Furnished,,Chandak Nischay,1.1 LacsAccess Zero Brokerage Properties,368 sq.ft,1,2,1,2 Covered Parking,Check out this 1 BHK Flat for sale in Dahisar ...
4,3 BHK Flat,2.25 Cr,"Neelam Solstice Phase I, Nalanda Nagar, Ramaba...",1052 sq.ft,₹21.39 K/sq.ft,1 Years Old,Ready to move,Lowerof 26 floors,Semi Furnished,,Neelam Solstice Phase I,2.1 LacsAccess Zero Brokerage Properties,915 sq.ft,3,2,3,1 Covered Parking,"Situated in ghatkopar east, central mumbai sub..."


In [26]:
# Print column dtypes and non-null counts.
df.info()
# Missing values per column; as the last expression this is the displayed result.
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9955 entries, 0 to 9954
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   flat_type         9955 non-null   object
 1   price             9954 non-null   object
 2   address           9955 non-null   object
 3   buildup_area      9955 non-null   object
 4   avg_price_sqft    9955 non-null   object
 5   age_of_property   9955 non-null   object
 6   possesion_status  9954 non-null   object
 7   which_floor       9907 non-null   object
 8   facing            7972 non-null   object
 9   furnicing         1 non-null      object
 10  project_name      9954 non-null   object
 11  brokrage          9954 non-null   object
 12  carpet_area       9954 non-null   object
 13  bedrooms          9954 non-null   object
 14  bathrooms         9819 non-null   object
 15  balcony           9740 non-null   object
 16  parking           9954 non-null   object
 17  about_property    9954 non-null   object

flat_type              0
price                  1
address                0
buildup_area           0
avg_price_sqft         0
age_of_property        0
possesion_status       1
which_floor           48
facing              1983
furnicing           9954
project_name           1
brokrage               1
carpet_area            1
bedrooms               1
bathrooms            136
balcony              215
parking                1
about_property         1
dtype: int64

In [None]:
# Remove columns that are not carried into the modelling frame.
# errors='ignore' means labels that are already absent are skipped, so the
# cell can be re-run without raising.
df = df.drop(
    columns=['furnicing', 'about_property', 'project_name', 'brokrage'],
    errors='ignore',
)


def convert_price(p):
    """Convert a listing price such as '3.95 Cr' or '19.0 L' into rupees.

    Parameters
    ----------
    p : str or number or missing
        Raw price. A leading '₹', thousands separators and surrounding
        whitespace are ignored. A trailing unit of Cr/Crore(s) multiplies by
        10,000,000; L/Lac(s)/Lakh(s) multiplies by 100,000; no unit means the
        value is already in rupees.

    Returns
    -------
    float
        Price in rupees, or NaN when the input is missing or not parseable
        (for example 'Call for price').
    """
    if pd.isna(p):
        return np.nan
    text = str(p).upper().replace('₹', '').replace(',', '').strip()

    # Longest suffixes first, so 'LACS' is removed whole instead of being
    # half-stripped by the bare 'L' rule (which used to leave '4.0 ACS' and
    # crash float()).
    unit_multipliers = (
        ('CRORES', 10000000), ('CRORE', 10000000), ('CR', 10000000),
        ('LAKHS', 100000), ('LAKH', 100000),
        ('LACS', 100000), ('LAC', 100000), ('L', 100000),
    )
    multiplier = 1
    for suffix, factor in unit_multipliers:
        if text.endswith(suffix):
            text = text[:-len(suffix)].strip()
            multiplier = factor
            break

    try:
        return float(text) * multiplier
    except ValueError:
        # Free text or an unknown unit: treat as missing rather than abort
        # the whole column conversion.
        return np.nan
# Replace the raw price strings with numeric rupee values (NaN where unparseable).
df['price'] = df['price'].apply(convert_price)


def clean_area(x):
    """Parse an area string such as '1,550 sq.ft' into a float (square feet).

    Parameters
    ----------
    x : str or number or missing
        Raw area. The unit suffix is matched case-insensitively ('sq.ft',
        'Sq.Ft', 'sqft', 'sq ft') and thousands separators are removed.

    Returns
    -------
    float
        The numeric area, or NaN for missing values, placeholders
        ('-', 'NA', 'N/A', empty) and anything else that is not a number.
    """
    if pd.isna(x):
        return np.nan
    text = str(x).lower().replace(',', '')
    # Strip the unit in any of its common spellings; longer forms first so a
    # trailing dot is not left behind.
    for unit in ('sq.ft.', 'sq.ft', 'sqft', 'sq ft'):
        text = text.replace(unit, '')
    text = text.strip()
    if text in ('-', 'na', 'n/a', ''):
        return np.nan
    try:
        return float(text)
    except ValueError:
        # Only a failed numeric parse is expected here; anything else should surface.
        return np.nan
# Numeric carpet area; the original string column is left untouched.
df['carpet_area_sqft'] = df['carpet_area'].apply(clean_area)


# The first run of digits in flat_type (e.g. the 3 in '3 BHK Flat') becomes the
# bedroom count; rows without digits get NaN.
df['bhk'] = df['flat_type'].str.extract(r'(\d+)').astype(float)
# Non-numeric bathroom entries are coerced to NaN instead of raising.
df['bathrooms'] = pd.to_numeric(df['bathrooms'], errors='coerce')


def safe_extract_age(age):
    """Extract a property's age in whole years from a free-text field.

    Parameters
    ----------
    age : str or number or missing
        Raw value, e.g. '3 Year Old', '1 Years Old', '10+ Year Old'.

    Returns
    -------
    int or float
        * NaN for missing values, for text containing a price-like marker
          (the column sometimes holds misplaced price text), and for anything
          that cannot be read as a number.
        * 0 when the text signals a new property (NEW, READY, or a
          2024-2026 year).
        * Otherwise the integer number of years.
    """
    if pd.isna(age):
        return np.nan
    text = str(age).upper()

    # Values that belong to another column (price text etc.) carry no age.
    junk_markers = ('PRICE', 'ON REQUEST', 'CALL', 'NEGOTIABLE', 'LUXURY', 'CRORE', 'LACS')
    if any(marker in text for marker in junk_markers):
        return np.nan

    # New properties count as age zero.
    new_markers = ('NEW', 'READY', '2024', '2025', '2026')
    if any(marker in text for marker in new_markers):
        return 0

    if 'YEAR' in text:
        first_token = text.split()[0]
        try:
            return int(first_token)
        except ValueError:
            # Tolerate decorated numbers such as '10+' or '1.5' by keeping the
            # leading digits only.
            digits = ''
            for ch in first_token:
                if ch not in '0123456789':
                    break
                digits += ch
            return int(digits) if digits else np.nan

    try:
        return int(text)
    except ValueError:
        pass
    try:
        # Numeric columns read as floats arrive here as '5.0'.
        return int(float(text))
    except (ValueError, OverflowError):
        return np.nan
# Age in years (NaN where the raw text could not be interpreted).
df['age_years'] = df['age_of_property'].apply(safe_extract_age)


# 1 only when the status is exactly 'Ready to move'; any other value, including
# a missing one, becomes 0.
df['ready_to_move'] = (df['possesion_status'] == 'Ready to move').astype(int)
# Locality = second-to-last comma-separated part of the address
# (e.g. 'Palghar' in '..., Palghar, Mumbai'); 'Unknown' when the address is
# missing or has no comma.
df['locality'] = df['address'].apply(lambda x: str(x).split(',')[-2].strip() if pd.notna(x) and ',' in str(x) else 'Unknown')

def parking_score(p):
    """Rank a parking description: 2 = covered, 1 = open only, 0 = none or missing.

    Matching is case-insensitive, and 'covered' wins when both words appear.
    """
    if pd.isna(p):
        return 0
    description = str(p).lower()
    if 'covered' in description:
        return 2
    if 'open' in description:
        return 1
    return 0
# Ordinal parking feature (0 / 1 / 2).
df['parking_score'] = df['parking'].apply(parking_score)

# Missing facing values become their own 'Unknown' category before encoding.
df['facing'] = df['facing'].fillna('Unknown')
# One integer code per distinct facing value.
# NOTE(review): the codes impose an arbitrary order on a nominal category, which
# a linear model will treat as numeric — consider one-hot encoding instead.
df['facing_encoded'] = LabelEncoder().fit_transform(df['facing'])


# Keep the 25 most frequent localities and pool every other one into 'Other'.
# NOTE(review): the frequency ranking is computed on the full frame, i.e. before
# the train/test split further down.
top_loc = df['locality'].value_counts().head(25).index
df['locality_grouped'] = df['locality'].where(df['locality'].isin(top_loc), 'Other')


# Modelling frame: the target plus the engineered features. Rows missing the
# price, the carpet area or the bedroom count are discarded; the final copy()
# detaches the result from df.
clean_df = (
    df.loc[:, [
        'price', 'carpet_area_sqft', 'bhk', 'bathrooms', 'age_years',
        'ready_to_move', 'parking_score', 'facing_encoded', 'locality_grouped',
    ]]
    .dropna(subset=['price', 'carpet_area_sqft', 'bhk'])
    .copy()
)
print(f"Clean rows: {clean_df.shape[0]}")

Clean rows: 9951


In [None]:
# Model the log of the price; log1p pairs with the expm1 used when reporting errors.
clean_df['log_price'] = np.log1p(clean_df['price'])

# One-hot encode the grouped locality; drop_first removes one level so the
# dummy columns are not perfectly collinear.
final_df = pd.get_dummies(clean_df, columns=['locality_grouped'], drop_first=True)

# Drop every row that still has a missing value in any remaining column.
# NOTE(review): per the recorded outputs this shrinks the data from 9951 to 7733
# rows; consider imputing bathrooms / age_years instead of discarding the rows.
final_df = final_df.dropna()

print(f"Final modeling data: {final_df.shape[0]} rows × {final_df.shape[1]} columns")
final_df.head()

Final modeling data: 7733 rows × 34 columns


Unnamed: 0,price,carpet_area_sqft,bhk,bathrooms,age_years,ready_to_move,parking_score,facing_encoded,log_price,locality_grouped_Andheri West,...,locality_grouped_Mulund East,locality_grouped_Mulund West,locality_grouped_Naigaon East,locality_grouped_Nalasopara West,locality_grouped_Other,locality_grouped_Powai,locality_grouped_Santacruz East,locality_grouped_Sion,locality_grouped_Vile Parle East,locality_grouped_Virar West
0,39500000.0,1130.0,3.0,2.0,3.0,1,2,1,17.491811,False,...,False,False,False,False,True,False,False,False,False,False
1,1900000.0,390.0,1.0,1.0,3.0,1,1,3,14.457365,False,...,False,False,False,False,True,False,False,False,False,False
2,7300000.0,475.0,1.0,2.0,1.0,1,0,2,15.803385,False,...,False,False,False,False,False,False,False,False,False,False
3,11000000.0,368.0,1.0,2.0,1.0,1,2,2,16.213406,False,...,False,False,False,False,True,False,False,False,False,False
4,22500000.0,915.0,3.0,2.0,1.0,1,2,2,16.929026,False,...,False,False,False,False,True,False,False,False,False,False


In [None]:
# Target is the log-transformed price; the features are every other column,
# with the price removed in both its raw and log form.
y = final_df['log_price']
X = final_df.drop(columns=['price', 'log_price'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Train: {X_train.shape[0]} | Test: {X_test.shape[0]}")

Train: 6186 | Test: 1547


In [None]:
def evaluate_model(name, y_true_log, y_pred_log):
    """Summarise one model's hold-out performance as a row for the comparison table.

    Parameters
    ----------
    name : str
        Label stored under the 'Model' key.
    y_true_log, y_pred_log : array-like
        Actual and predicted targets on the log1p(price) scale.

    Returns
    -------
    dict
        'Model', 'R²', 'RMSE (₹ Lakh)' and 'MAE (₹ Lakh)'. R² is measured on
        the log scale the models were trained on; RMSE and MAE are measured
        after converting back to rupees with expm1 and are expressed in lakh
        (1 lakh = 100,000), so the error columns and R² are not on the same
        scale.
    """
    actual_rupees = np.expm1(y_true_log)
    predicted_rupees = np.expm1(y_pred_log)
    return {
        'Model': name,
        'R²': round(r2_score(y_true_log, y_pred_log), 4),
        'RMSE (₹ Lakh)': round(np.sqrt(mean_squared_error(actual_rupees, predicted_rupees))/100000, 2),
        'MAE (₹ Lakh)': round(mean_absolute_error(actual_rupees, predicted_rupees)/100000, 2)
    }


# r2_score, mean_absolute_error and mean_squared_error come from the import
# cell at the top of the notebook.

# Baseline: ordinary least squares on the log price.
lr = LinearRegression()
lr.fit(X_train, y_train)
pred_lr = lr.predict(X_test)

# Random forest; the fixed seed makes the fit reproducible and n_jobs=-1 uses all cores.
rf = RandomForestRegressor(n_estimators=300, max_depth=25, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

results = [
    evaluate_model('Linear Regression', y_test, pred_lr),
    evaluate_model('Random Forest', y_test, pred_rf),
]

comparison = pd.DataFrame(results).set_index('Model')
comparison

Unnamed: 0_level_0,R²,RMSE (₹ Lakh),MAE (₹ Lakh)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Linear Regression,0.861,281.24,94.07
Random Forest,0.9187,242.67,65.65


In [None]:
# Headline metrics for the random forest (the same figures as its row in the
# comparison table). R² is on the log-price scale; RMSE and MAE are converted
# back to rupees with expm1 and shown in lakh.
print(f"R² Score            : {r2_score(y_test, pred_rf):.4f}")
print(f"RMSE                : ₹{np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(pred_rf)))/100000:.2f} Lakh")
print(f"Mean Absolute Error : ₹{mean_absolute_error(np.expm1(y_test), np.expm1(pred_rf))/100000:.2f} Lakh")

R² Score            : 0.9187
RMSE                : ₹242.67 Lakh
Mean Absolute Error : ₹65.65 Lakh


In [None]:
# NOTE(review): this import sits outside the import cell at the top of the notebook.
import joblib
# Persist the fitted random forest.
joblib.dump(rf, 'mumbai_price_model.pkl')
# Persist the training feature names in order — presumably so the serving code
# can rebuild an input frame with the same columns; confirm against the consumer.
# NOTE(review): the recorded "Model saved" output is not printed by this cell.
joblib.dump(X.columns.tolist(), 'model_columns.pkl')

Model saved → Ready for Streamlit deployment!
