# Data Cleaning And Preprocessing On Real State Data Set

In [1]:
# Pandas → data manipulation, cleaning, transformation
# NumPy → numerical operations, missing value handling (NaN)

import pandas as pd
import numpy as np

In [2]:
# Reads the raw CSV file into a DataFrame
# Creates a working copy

file_path = "dataset/surat_uncleaned.csv"
df_raw = pd.read_csv(file_path)
df = df_raw.copy()

In [3]:
# Removes extra spaces
# Converts to lowercase
# Replaces spaces with underscores

df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

In [4]:
# Removes hidden characters
# Prevents duplicate categories like "East " vs "East"
# Ensures accurate grouping and filtering

for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].str.strip()

In [5]:
# Duplicate property listings inflate:
# Property count
# Average price
# Revenue metrics

df = df.drop_duplicates()

In [6]:
# Missing categorical data is filled with "Unknown"

df["facing"] = df["facing"].fillna("Unknown")
df["furnishing"] = df["furnishing"].fillna("Unknown")
df["transaction"] = df["transaction"].fillna("Unknown")

In [7]:
# Mode = most frequent category
# Ideal for low-missing categorical columns

df["status"] = df["status"].fillna(df["status"].mode()[0])

In [8]:
# Removes commas
# Extracts numeric value
# Converts to float
# "1,250 sqft" → 1250.0

df["square_feet"] = (
    df["square_feet"]
    .str.replace(",", "", regex=False)
    .str.extract(r"(\d+\.?\d*)")
    .astype(float)
)

In [9]:
# Removes currency symbols and text
# Converts to numeric
# Invalid values become NaN safely

df["price_per_sqft"] = (
    df["price_per_sqft"]
    .str.replace("₹", "", regex=False)
    .str.replace("per sqft", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.strip()
)

df["price_per_sqft"] = pd.to_numeric(df["price_per_sqft"], errors="coerce")

In [10]:
# "45 Lac" → 45
# "1.2 Cr" → 120

df["price"] = (
    df["price"]
    .str.replace("₹", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.strip()
)

In [11]:
# Indian real estate standard
# Easy comparison
# Power BI friendly

def convert_price_to_lakhs(value):
    if "Lac" in value:
        return float(value.replace("Lac", "").strip())
    elif "Cr" in value:
        return float(value.replace("Cr", "").strip()) * 100
    else:
        return np.nan

In [12]:
# Drop old price column

df.drop(columns=["price"], inplace=True)

In [13]:
df.head()

Unnamed: 0,property_name,areawithtype,square_feet,transaction,status,floor,furnishing,facing,description,price_per_sqft
0,2 BHK Apartment for Sale in Dindoli Surat,Carpet Area,644.0,New Property,Poss. by Oct '24,5 out of 10,Unfurnished,West,"Luxury project with basement parking, Solar ro...",2891.0
1,2 BHK Apartment for Sale in Althan Surat,Super Area,1278.0,New Property,Poss. by Jan '26,6 out of 14,Unfurnished,South -West,2 And 3 BHK Luxurious Flat for Sell In New Alt...,3551.0
2,2 BHK Apartment for Sale in Pal Gam Surat,Super Area,1173.0,Resale,Ready to Move,5 out of 13,Semi-Furnished,East,This affordable 2 BHK flat is situated along a...,3800.0
3,2 BHK Apartment for Sale in Jahangirabad Surat,Carpet Area,700.0,New Property,Ready to Move,6 out of 14,Unfurnished,East,2 BHK Flat For sell IN Jahangirabad Prime Loca...,3966.0
4,"2 BHK Apartment for Sale in Orchid Fantasia, P...",Super Area,1250.0,Orchid Fantasia,New Property,Unfurnished,2,2,"Multistorey Apartment for Sale in Palanpur, Su...",3600.0


In [14]:
def extract_floor_data(value):
    if pd.isna(value):
        return pd.Series([np.nan, np.nan])

    value = str(value)

    numbers = [int(x) for x in value.split() if x.isdigit()]

    if len(numbers) >= 2:
        return pd.Series([numbers[0], numbers[1]])
    elif len(numbers) == 1:
        return pd.Series([numbers[0], np.nan])
    else:
        return pd.Series([np.nan, np.nan])


In [15]:
# "Vesu Apartment" →
# area_name = Vesu
# property_type = Apartment

df[["floor_number", "total_floors"]] = df["floor"].apply(extract_floor_data)
df.drop(columns=["floor"], inplace=True)

In [16]:
df.head()

Unnamed: 0,property_name,areawithtype,square_feet,transaction,status,furnishing,facing,description,price_per_sqft,floor_number,total_floors
0,2 BHK Apartment for Sale in Dindoli Surat,Carpet Area,644.0,New Property,Poss. by Oct '24,Unfurnished,West,"Luxury project with basement parking, Solar ro...",2891.0,5.0,10.0
1,2 BHK Apartment for Sale in Althan Surat,Super Area,1278.0,New Property,Poss. by Jan '26,Unfurnished,South -West,2 And 3 BHK Luxurious Flat for Sell In New Alt...,3551.0,6.0,14.0
2,2 BHK Apartment for Sale in Pal Gam Surat,Super Area,1173.0,Resale,Ready to Move,Semi-Furnished,East,This affordable 2 BHK flat is situated along a...,3800.0,5.0,13.0
3,2 BHK Apartment for Sale in Jahangirabad Surat,Carpet Area,700.0,New Property,Ready to Move,Unfurnished,East,2 BHK Flat For sell IN Jahangirabad Prime Loca...,3966.0,6.0,14.0
4,"2 BHK Apartment for Sale in Orchid Fantasia, P...",Super Area,1250.0,Orchid Fantasia,New Property,2,2,"Multistorey Apartment for Sale in Palanpur, Su...",3600.0,,


In [17]:
# Fix Invalid Facing Values

valid_facings = ["North", "South", "East", "West",
                 "North-East", "North-West",
                 "South-East", "South-West"]


In [18]:
df["facing"] = df["facing"].apply(
    lambda x: x if x in valid_facings else "Unknown"
)

In [19]:
df.facing.unique()

array(['West', 'Unknown', 'East', 'South', 'North'], dtype=object)

In [20]:
# No structural issues
# Clean data types
# Controlled missing values

print("Final Shape:", df.shape)
print("\nMissing Values:\n", df.isna().sum())
print("\nData Types:\n", df.dtypes)

Final Shape: (4416, 11)

Missing Values:
 property_name        0
areawithtype         0
square_feet          6
transaction          0
status               0
furnishing           0
facing               0
description       1331
price_per_sqft     358
floor_number       815
total_floors      1248
dtype: int64

Data Types:
 property_name      object
areawithtype       object
square_feet       float64
transaction        object
status             object
furnishing         object
facing             object
description        object
price_per_sqft    float64
floor_number      float64
total_floors      float64
dtype: object


In [21]:
df.head()

Unnamed: 0,property_name,areawithtype,square_feet,transaction,status,furnishing,facing,description,price_per_sqft,floor_number,total_floors
0,2 BHK Apartment for Sale in Dindoli Surat,Carpet Area,644.0,New Property,Poss. by Oct '24,Unfurnished,West,"Luxury project with basement parking, Solar ro...",2891.0,5.0,10.0
1,2 BHK Apartment for Sale in Althan Surat,Super Area,1278.0,New Property,Poss. by Jan '26,Unfurnished,Unknown,2 And 3 BHK Luxurious Flat for Sell In New Alt...,3551.0,6.0,14.0
2,2 BHK Apartment for Sale in Pal Gam Surat,Super Area,1173.0,Resale,Ready to Move,Semi-Furnished,East,This affordable 2 BHK flat is situated along a...,3800.0,5.0,13.0
3,2 BHK Apartment for Sale in Jahangirabad Surat,Carpet Area,700.0,New Property,Ready to Move,Unfurnished,East,2 BHK Flat For sell IN Jahangirabad Prime Loca...,3966.0,6.0,14.0
4,"2 BHK Apartment for Sale in Orchid Fantasia, P...",Super Area,1250.0,Orchid Fantasia,New Property,2,Unknown,"Multistorey Apartment for Sale in Palanpur, Su...",3600.0,,


In [22]:
df_raw.head()

Unnamed: 0,property_name,areaWithType,square_feet,transaction,status,floor,furnishing,facing,description,price_per_sqft,price
0,2 BHK Apartment for Sale in Dindoli Surat,Carpet Area,644 sqft,New Property,Poss. by Oct '24,5 out of 10,Unfurnished,West,"Luxury project with basement parking, Solar ro...","₹2,891 per sqft",₹33.8 Lac
1,2 BHK Apartment for Sale in Althan Surat,Super Area,1278 sqft,New Property,Poss. by Jan '26,6 out of 14,Unfurnished,South -West,2 And 3 BHK Luxurious Flat for Sell In New Alt...,"₹3,551 per sqft",₹45.4 Lac
2,2 BHK Apartment for Sale in Pal Gam Surat,Super Area,1173 sqft,Resale,Ready to Move,5 out of 13,Semi-Furnished,East,This affordable 2 BHK flat is situated along a...,"₹3,800 per sqft",₹44.6 Lac
3,2 BHK Apartment for Sale in Jahangirabad Surat,Carpet Area,700 sqft,New Property,Ready to Move,6 out of 14,Unfurnished,East,2 BHK Flat For sell IN Jahangirabad Prime Loca...,"₹3,966 per sqft",₹47 Lac
4,"2 BHK Apartment for Sale in Orchid Fantasia, P...",Super Area,1250 sqft,Orchid Fantasia,New Property,Unfurnished,2,2,"Multistorey Apartment for Sale in Palanpur, Su...","₹3,600 per sqft",₹45 Lac
