In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw listings CSV; the file is decoded as ISO-8859-1 instead of the default UTF-8.
df = pd.read_csv("Makaan_Properties_Buy.csv", encoding='ISO-8859-1')

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# List the column labels available in the raw frame.
df.columns

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# (rows, columns) of the raw frame before any cleaning.
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Descriptive statistics for the frame's columns (pandas defaults).
df.describe()

In [None]:
# Same missing-value counts, ordered from most to fewest missing.
df.isnull().sum().sort_values(ascending=False)

In [None]:
# Drop three columns that the later cells never reference.
# Reassigning (rather than inplace=True) with errors='ignore' keeps this cell
# re-runnable: executing it a second time no longer raises KeyError for
# columns that have already been removed.
df = df.drop(columns=['Listing_Category', 'builder_id', 'Property_Name'], errors='ignore')

In [None]:
# Fill gaps column by column: the most frequent status for Property_status and
# fixed placeholder strings for the text columns.
# The original `df[col].fillna(..., inplace=True)` form mutates a temporary
# Series (chained assignment): pandas 2.x warns about it and with Copy-on-Write
# it leaves `df` untouched. Filling through the frame and reassigning always
# takes effect.
df = df.fillna({
    'Property_status': df['Property_status'].mode()[0],
    'Locality_Name': 'Unknown',
    'description': 'No Description',
    'Builder_name': 'Unknown',
})

In [None]:
# Shape check after the column drop and the missing-value fill.
df.shape

In [None]:
# Sanity check: how many Builder_name values are still missing.
df['Builder_name'].isnull().sum()

In [None]:
# Convert Price from text with thousands separators (e.g. "1,234,567") to float.
# The dtype guard makes the cell idempotent: once Price is numeric, the `.str`
# accessor would raise AttributeError on a second run, so the text clean-up is
# only applied while the column is still non-numeric.
price = df['Price']
if not pd.api.types.is_numeric_dtype(price):
    # ',' is a literal separator, not a regular expression.
    price = price.str.replace(',', '', regex=False)
df['Price'] = price.astype(float)

In [None]:
# Inspect the Price column after conversion.
df['Price']

In [None]:
# Keep only listings priced strictly below the 99th percentile of Price.
# .copy() makes filtered_df an independent frame: later cells drop columns from
# it and assign new columns to it, which on a bare boolean-mask slice of `df`
# triggers SettingWithCopyWarning and may not behave as intended.
price_cap = df['Price'].quantile(0.99)
filtered_df = df[df['Price'] < price_cap].copy()

# Histogram (50 bins) with a KDE overlay of the remaining prices.
plt.figure(figsize=(10, 6))
sns.histplot(filtered_df['Price'], kde=True, bins=50)
plt.title("Distribution of Property Prices (Filtered)")
plt.xlabel("Price")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Box plot of the filtered prices, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=filtered_df['Price'], ax=ax)
ax.set_title('Boxplot of Property Prices (After Outlier Removal)')
ax.set_xlabel('Price')
plt.show()

In [None]:
# Collect the labels of all object- and bool-typed columns and show them.
categorical_cols = filtered_df.select_dtypes(include=['object', 'bool']).columns
print(categorical_cols)

In [None]:
# Print the frequency table of every categorical column, one after another.
for col, values in filtered_df[categorical_cols].items():
    print(f"\nColumn: {col}")
    print(values.value_counts())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bars are ordered from the most to the least frequent property type.
type_order = filtered_df['Property_type'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=filtered_df, x='Property_type', order=type_order, ax=ax)
plt.xticks(rotation=45)
ax.set_title('Distribution of Property Type')
plt.show()

In [None]:
# Price distribution per furnishing category.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=filtered_df, x='is_furnished', y='Price', ax=ax)
ax.set_title('Furnishing vs. Price')
plt.show()

In [None]:
# Remove the Project_URL and description columns from the working frame.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable and avoids an in-place mutation of what may be a slice of `df`.
filtered_df = filtered_df.drop(columns=['Project_URL', 'description'], errors='ignore')

In [None]:
# Number of distinct values per object column, highest cardinality first.
filtered_df.select_dtypes(include='object').nunique().sort_values(ascending=False)

In [None]:
# Inspect the raw Price_per_unit_area values before conversion.
filtered_df['Price_per_unit_area']

In [None]:
# Convert Price_per_unit_area from comma-separated text to float.
# The dtype guard makes the cell idempotent: `.str` raises AttributeError on a
# numeric column, which is what a second execution of the original line hit.
unit_price = filtered_df['Price_per_unit_area']
if not pd.api.types.is_numeric_dtype(unit_price):
    # ',' is a literal separator, not a regular expression.
    unit_price = unit_price.str.replace(',', '', regex=False)
filtered_df['Price_per_unit_area'] = unit_price.astype(float)

In [None]:
# The ten most frequent Builder_name values and their listing counts.
filtered_df['Builder_name'].value_counts().head(10)

In [None]:
# Horizontal count plot of property types, most frequent first.
sns.countplot(y=filtered_df['Property_type'], order=filtered_df['Property_type'].value_counts().index)

In [None]:
# Annotated correlation matrix of every numeric column in the working frame.
plt.figure(figsize=(12,8))
corr = filtered_df.corr(numeric_only=True)
sns.heatmap(corr, cmap='coolwarm', annot=True, fmt=".2f")

In [None]:
# Plot listings by coordinates, coloured by Price, to eyeball the geographic spread.
sns.scatterplot(data=filtered_df, x='Longitude', y='Latitude', hue='Price', palette='viridis', alpha=0.6)

In [None]:
# Keep only rows whose coordinates fall inside a rough bounding box for India
# (longitude 68–97°E, latitude 6–37°N; both bounds inclusive), then renumber rows.
lon_in_range = filtered_df['Longitude'].between(68, 97)
lat_in_range = filtered_df['Latitude'].between(6, 37)
filtered_df = filtered_df[lon_in_range & lat_in_range].reset_index(drop=True)

In [None]:
# Same coordinate scatter, redrawn after the bounding-box filter.
sns.scatterplot(data=filtered_df, x='Longitude', y='Latitude', hue='Price', palette='viridis', alpha=0.6)

In [None]:
# Histograms for Size, Price and Price_per_unit_area.
# NOTE(review): Size is only converted from text to float in a later cell, so it is
# presumably still non-numeric here and left out of the figure — confirm, or move
# this cell below the Size conversion.
filtered_df[['Size', 'Price', 'Price_per_unit_area']].hist(figsize=(12, 5))

In [None]:
# Inspect the raw No_of_BHK values before extracting the number.
filtered_df['No_of_BHK']

In [None]:
# Pull the first run of digits out of No_of_BHK and store it as float
# (rows without any digit become NaN).
# Fixes: the pattern is now a raw string ('\d' in a normal string literal is an
# invalid escape sequence and warns on current Python), and the dtype guard lets
# the cell be re-run after the column is already numeric.
bhk = filtered_df['No_of_BHK']
if not pd.api.types.is_numeric_dtype(bhk):
    # expand=False returns a Series instead of a one-column DataFrame.
    bhk = bhk.str.extract(r'(\d+)', expand=False)
filtered_df['No_of_BHK'] = bhk.astype(float)

In [None]:
# Inspect the raw Size values before conversion.
filtered_df['Size']

In [None]:
# Strip thousands separators from Size, keep the first run of digits and store
# it as float (rows without any digit become NaN).
# Fixes: the pattern is now a raw string ('\d' in a normal string literal is an
# invalid escape sequence and warns on current Python), and the dtype guard lets
# the cell be re-run after the column is already numeric.
size = filtered_df['Size']
if not pd.api.types.is_numeric_dtype(size):
    size = (
        size.str.replace(',', '', regex=False)   # literal comma, not a regex
        .str.extract(r'(\d+)', expand=False)    # Series, not a one-column frame
    )
filtered_df['Size'] = size.astype(float)

In [None]:
# Inspect the raw Posted_On text before converting it to a day count.
filtered_df['Posted_On']

In [None]:
def convert_posted_on(value):
    """Convert a relative age such as "3 weeks ago" into an approximate day count.

    The first whitespace-separated token is the quantity and the second is the
    unit; anything after that is ignored. Units are matched by substring, so
    singular and plural forms both work. A month counts as 30 days and a year
    as 365 days.

    Parameters
    ----------
    value : str
        Text of the form "<number> <unit> ...". The articles "a" / "an" are
        accepted as a quantity of 1 (e.g. "a month ago").

    Returns
    -------
    int or float
        Number of days, or ``np.nan`` when the value is not a string, has fewer
        than two tokens, has a non-integer quantity, or uses a unit other than
        day / week / month / year.
    """
    days_per_unit = (('day', 1), ('week', 7), ('month', 30), ('year', 365))

    try:
        quantity, unit, *_ = value.split()
    except (AttributeError, ValueError):
        # AttributeError: not a string (e.g. NaN / None).
        # ValueError: fewer than two tokens to unpack.
        return np.nan

    if quantity.lower() in ('a', 'an'):
        count = 1
    else:
        try:
            count = int(quantity)
        except ValueError:
            return np.nan

    unit = unit.lower()
    for keyword, days in days_per_unit:
        if keyword in unit:
            return count * days

    # Unknown unit: return NaN explicitly rather than falling through to None.
    return np.nan

# Derive a numeric Posted_days_ago column from the Posted_On text, one value per row.
filtered_df['Posted_days_ago'] = filtered_df['Posted_On'].apply(convert_posted_on)


In [None]:
# The text column is no longer needed once Posted_days_ago exists.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable: a second execution no longer raises KeyError.
filtered_df = filtered_df.drop(columns='Posted_On', errors='ignore')

In [None]:
# Inspect the derived day counts.
filtered_df['Posted_days_ago']

In [None]:
# Recompute the object/bool column list now that several columns were converted or dropped.
categorical_cols = filtered_df.select_dtypes(include=['object', 'bool']).columns
print(categorical_cols)

In [None]:
# Distinct-value counts of the remaining object columns, highest cardinality first.
filtered_df.select_dtypes(include='object').nunique().sort_values(ascending=False)

In [None]:
# Model input columns (this order becomes the column order of X) and the target.
# NOTE(review): Price_per_unit_area is fed to the model together with Size while
# the target is Price; if the per-area price is derived from the listing price
# this is target leakage — confirm it is intentional.
features = ['City_name', 'Property_type', 'Property_status', 'Sub_urban_name',
            'is_furnished', 'is_ready_to_move', 'No_of_BHK', 'Size', 'Latitude',
                'Longitude','Price_per_unit_area']
target = 'Price'

In [None]:
# Store the BHK count as an integer-looking string ("2", "3", ...): cast to int
# first so the text has no trailing ".0", then to str.
filtered_df['No_of_BHK'] = filtered_df['No_of_BHK'].astype(int).astype(str)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Columns the model treats as categorical.
cat_features = [
    'City_name',
    'Property_type',
    'Property_status',
    'Sub_urban_name',
    'is_furnished',
    'is_ready_to_move',
    'No_of_BHK',
]

# Design matrix and target vector.
X = filtered_df[features]
y = filtered_df[target]

# 80/20 split with a fixed seed so the partition is reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Map each training column to its position, then show where the categorical
# features sit in that ordering.
column_indices = dict(zip(X_train.columns, range(len(X_train.columns))))
print("Column Index Map:", column_indices)

resolved_positions = [column_indices[name] for name in cat_features]
print("cat_features resolved indices:", resolved_positions)

In [None]:
from catboost import CatBoostRegressor

# Hyper-parameters gathered in one place; verbose=100 logs every 100 iterations.
catboost_params = dict(
    iterations=1000,
    depth=8,
    learning_rate=0.1,
    loss_function='RMSE',
    cat_features=cat_features,
    verbose=100,
    random_state=42,
)
model = CatBoostRegressor(**catboost_params)

model.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out test split.
y_pred = model.predict(X_test)

In [None]:
# Standard regression metrics on the test split; RMSE is the square root of the MSE.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

for report_line in (
    f"MAE: ₹{mae:,.2f}",
    f"RMSE: ₹{rmse:,.2f}",
    f"R² Score: {r2:.4f}",
):
    print(report_line)

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, expressed in percent (0–100 scale).

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order and of the same length as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the observations whose
        true value is non-zero, or ``np.nan`` when there is no such observation.

    Notes
    -----
    Inputs are converted to plain NumPy arrays so the comparison is positional:
    a pandas Series with a shuffled index (as produced by a train/test split) is
    never re-aligned by label against the predictions. Observations with a true
    value of zero are skipped because their percentage error is undefined and
    would otherwise turn the whole mean into ``inf``.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    defined = actual != 0
    if not defined.any():
        return np.nan

    relative_error = np.abs((actual[defined] - predicted[defined]) / actual[defined])
    return np.mean(relative_error) * 100

# Percentage error of the test-set predictions, printed with two decimals.
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"MAPE: {mape:.2f}%")

In [None]:
from sklearn.metrics import median_absolute_error

# Median of the absolute errors on the test split.
medae = median_absolute_error(y_test, y_pred)
print(f"Median AE: ₹{medae:,.2f}")

In [None]:
# Share of test predictions whose relative error is at most the tolerance (20%).
tolerance = 0.2
relative_error = np.abs((y_test - y_pred) / y_test)
within_range = np.mean(relative_error <= tolerance)
print(f"Within 20% range: {within_range*100:.2f}%")

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(y_test, y_pred, alpha=0.5, color='teal')

# Dashed red diagonal: where a perfect prediction would fall.
price_span = [y_test.min(), y_test.max()]
ax.plot(price_span, price_span, 'r--')

ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Actual vs Predicted Prices")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Residual = actual minus predicted; plotted against the prediction.
residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(y_pred, residuals, alpha=0.4)
ax.axhline(0, color='red', linestyle='--')  # zero-error reference line
ax.set_xlabel("Predicted Price")
ax.set_ylabel("Residuals")
ax.set_title("Residuals vs Predicted")
ax.grid(True)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between price, size and per-area price.
corr_columns = ['Price', 'Size', 'Price_per_unit_area']
corr = filtered_df[corr_columns].corr()

ax = sns.heatmap(corr, annot=True, cmap='coolwarm')
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
import pandas as pd

# Mean price for every distinct Size value, written as JSON records.
# NOTE(review): a later cell writes a binned summary to the same
# 'price_vs_size.json' path, so this file is overwritten on a full run.
clean_df = filtered_df[['Size', 'Price']].copy()

# Keep rows where Size parses as a number and Price is present.
size_is_numeric = pd.to_numeric(clean_df['Size'], errors='coerce').notna()
price_present = clean_df['Price'].notna()
clean_df = clean_df[size_is_numeric & price_present]

clean_df['Size'] = clean_df['Size'].astype(float)

grouped = (
    clean_df.groupby('Size')['Price']
    .mean()
    .reset_index()
    .sort_values('Size')
)

grouped.to_json('price_vs_size.json', orient='records', indent=2)

In [None]:
# Inspect Price_per_unit_area before aggregating it per suburb.
filtered_df['Price_per_unit_area']

In [None]:
import json

# Mean per-area price for each suburb, rounded to 2 decimals; suburbs whose
# mean is NaN are left out of the mapping.
suburb_means = filtered_df.groupby('Sub_urban_name')['Price_per_unit_area'].mean()
avg_price_map = suburb_means.round(2).dropna().to_dict()

with open('avg_price_per_area.json', 'w') as out_file:
    json.dump(avg_price_map, out_file, indent=2)

print("avg_price_per_area.json created ✅")

In [None]:
# Persist the trained model to catboost_model.cbm in the working directory.
model.save_model("catboost_model.cbm")

In [None]:
# Columns available for the JSON exports below.
filtered_df.columns

In [None]:
# Export the comparison dataset: selected columns, complete rows only.
compare_cols = [
    'Size',
    'Price',
    'Price_per_unit_area',
    'No_of_BHK',
    'City_name',
    'Property_type',
    'Property_status',
    'is_furnished',
    'is_ready_to_move',
]

compare_df = filtered_df[compare_cols].dropna()

# No_of_BHK is text at this point; turn it back into a number for the export.
bhk_digits = compare_df['No_of_BHK'].astype(str).str.extract(r'(\d+)', expand=False)
compare_df['No_of_BHK'] = bhk_digits.astype(float)

compare_df.to_json('compare_data.json', orient='records')

In [None]:
# Export identifier, location and headline attributes of every listing as JSON records.
explorer_cols = [
    'Property_id',
    'City_id', 'City_name',
    'Locality_ID', 'Locality_Name',
    'Sub_urban_ID', 'Sub_urban_name',
    'Size', 'Price', 'No_of_BHK',
    'Property_type', 'Property_status',
    'is_furnished', 'is_ready_to_move',
]

explorer_df = filtered_df[explorer_cols]
explorer_df.to_json('explorer_data.json', orient='records')

In [None]:
import json

# Sorted distinct (non-missing) values of each column, keyed by column name.
option_columns = [
    'City_name',
    'Property_type',
    'Property_status',
    'is_furnished',
    'Sub_urban_name',
]
form_options = {
    column: sorted(filtered_df[column].dropna().unique().tolist())
    for column in option_columns
}

with open('form_options.json', 'w') as out_file:
    json.dump(form_options, out_file, indent=2)

In [None]:
# Export the columns needed to place listings on a map; rows lacking either
# coordinate are skipped.
map_cols = [
    'Property_id',
    'Price',
    'City_name',
    'Property_type',
    'Locality_Name',
    'Latitude',
    'Longitude',
]

located_listings = filtered_df[map_cols].dropna(subset=['Latitude', 'Longitude'])
located_listings.to_json('map_data.json', orient='records')

In [None]:
# Mean price per property type as a list of {'Property_type', 'Price'} records.
mean_price_by_type = filtered_df.groupby('Property_type')['Price'].mean()
price_by_type = mean_price_by_type.reset_index().to_dict(orient='records')

with open('price_by_type.json', 'w') as out_file:
    json.dump(price_by_type, out_file, indent=2)

In [None]:
# Mean price per Size bin (15 equal-width bins), exported as
# [{'Size': <bin midpoint>, 'Price': <mean price>}, ...].
bins = pd.cut(filtered_df['Size'], bins=15)

# observed=True leaves out bins that contain no listings and dropna() removes any
# bin whose mean is still NaN. Otherwise json.dump would emit the bare token
# `NaN`, which is not valid JSON and breaks strict parsers.
grouped = (
    filtered_df.groupby(bins, observed=True)['Price']
    .mean()
    .dropna()
    .reset_index()
)

price_vs_size = [
    {
        'Size': round(float(size_bin.mid), 2),   # represent the bin by its midpoint
        'Price': round(float(mean_price), 2),
    }
    for size_bin, mean_price in grouped.itertuples(index=False)
]

with open('price_vs_size.json', 'w') as f:
    json.dump(price_vs_size, f, indent=2)

In [None]:
# Ten builders with the highest mean listing price, as {'Builder_name', 'Price'} records.
# Missing builder names were replaced by the placeholder 'Unknown' earlier in the
# notebook, so notna() alone no longer filters anything; the placeholder is
# excluded explicitly so it cannot appear as a "builder" in the ranking.
has_known_builder = (
    filtered_df['Builder_name'].notna()
    & (filtered_df['Builder_name'] != 'Unknown')
)

top_builders = (
    filtered_df[has_known_builder]
    .groupby('Builder_name')['Price']
    .mean()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
    .to_dict(orient='records')
)

with open('top_builders.json', 'w') as f:
    json.dump(top_builders, f, indent=2)

In [None]:
# One coordinate pair per suburb: the first complete row encountered for each
# name, keyed by suburb -> {'Latitude': ..., 'Longitude': ...}.
coord_columns = ['Sub_urban_name', 'Latitude', 'Longitude']
first_row_per_suburb = (
    filtered_df[coord_columns]
    .dropna()
    .drop_duplicates(subset='Sub_urban_name')
)
suburb_coords = first_row_per_suburb.set_index('Sub_urban_name').to_dict(orient='index')

with open('suburban_map.json', 'w') as out_file:
    json.dump(suburb_coords, out_file, indent=2)

In [None]:
# Build {city: sorted list of its suburbs} and write it to JSON.
# NOTE(review): this reads the unfiltered `df`, whereas the other exports use
# `filtered_df`; the suburb lists may therefore include names absent from the
# other JSON files — confirm this is intended.
df = df.dropna(subset=["City_name", "Sub_urban_name"])

# groupby visits the cities in sorted order, one pass over the frame.
suburbs_by_city = {
    city: sorted(city_suburbs.unique().tolist())
    for city, city_suburbs in df.groupby("City_name")["Sub_urban_name"]
}

with open("suburbs_by_city.json", "w") as out_file:
    json.dump(suburbs_by_city, out_file, indent=2)