# **Data Cleaning**

## **Import Required Libraries**

In [54]:
import pandas as pd 
import numpy as np

## **Loading the Raw Dataset**

In [55]:
# Read the raw listings CSV into the working DataFrame `df`.
# NOTE: this is an absolute path, so it only resolves where the repository
# is checked out under /workspaces.
df = pd.read_csv(
    filepath_or_buffer="/workspaces/Bangladesh-Real-State-Price-Prediction/data/raw/house_price_bd.csv",
)

## **Handle Missing Values**

In [56]:
# Number of missing values in each column of the raw data.
df.isna().sum()

Title                  0
Bedrooms            1001
Bathrooms           1001
Floor_no             684
Occupancy_status      99
Floor_area            99
City                   0
Price_in_taka          0
Location               6
dtype: int64

In [57]:
# Floor_no is parsed as a number; entries that cannot be parsed become NaN
# (errors='coerce') and are imputed together with the other gaps below.
df['Floor_no'] = pd.to_numeric(df['Floor_no'], errors='coerce')

# Fill the gaps in each of these columns with that column's own median.
median_by_column = {
    column: df[column].median()
    for column in ('Bedrooms', 'Bathrooms', 'Floor_no', 'Floor_area')
}
df.fillna(value=median_by_column, inplace=True)

# Rows without a Location or City are dropped rather than imputed.
df.dropna(subset=['Location', 'City'], inplace=True)

## **Clean and Transform Columns**

In [58]:
# Turn the price text into a float: remove the taka sign and the comma
# separators (both matched literally), then cast.
price_text = df['Price_in_taka'].str.replace('৳', '', regex=False)
price_text = price_text.str.replace(',', '', regex=False)
df['Price_in_taka'] = price_text.astype(float)

## **Handle Categorical Variables**

In [59]:
# One-hot encode City. The original column is replaced by City_<value>
# indicator columns, and drop_first=True omits the first category level.
df = pd.get_dummies(
    data=df,
    columns=['City'],
    drop_first=True,
)

In [60]:
# Encode occupancy as a binary flag: 'vacant' -> 1, 'occupied' -> 0.
# Series.map turns every value that is not a key of the dict into NaN, so:
#   * labels are trimmed and lower-cased first, otherwise variants such as
#     'Vacant' or 'occupied ' would be silently lost as NaN;
#   * the encoding is skipped when the column is already numeric, otherwise
#     re-running this cell would map the 0/1 codes themselves to NaN and wipe
#     the whole column.
# Values that are missing or still unrecognised remain NaN.
if not pd.api.types.is_numeric_dtype(df['Occupancy_status']):
    df['Occupancy_status'] = (
        df['Occupancy_status']
        .str.strip()
        .str.lower()
        .map({'vacant': 1, 'occupied': 0})
    )

In [61]:
# Re-check missing values per column after imputation and encoding.
df.isna().sum()

Title                     0
Bedrooms                  0
Bathrooms                 0
Floor_no                  0
Occupancy_status         94
Floor_area                0
Price_in_taka             0
Location                  0
City_cumilla              0
City_dhaka                0
City_gazipur              0
City_narayanganj-city     0
dtype: int64

In [62]:
# Frequency of each occupancy code; NaN entries are not counted.
df['Occupancy_status'].value_counts(dropna=True)

Occupancy_status
1.0    3762
0.0       3
Name: count, dtype: int64

In [63]:
# Fill the remaining gaps in Occupancy_status with 1, by far the most
# frequent code in the column.
df['Occupancy_status'] = df['Occupancy_status'].fillna(value=1)

In [64]:
# Frequency of each occupancy code after the gaps were filled.
df['Occupancy_status'].value_counts(dropna=True)

Occupancy_status
1.0    3856
0.0       3
Name: count, dtype: int64

In [65]:
# Final check of missing values per column before saving.
df.isna().sum()

Title                    0
Bedrooms                 0
Bathrooms                0
Floor_no                 0
Occupancy_status         0
Floor_area               0
Price_in_taka            0
Location                 0
City_cumilla             0
City_dhaka               0
City_gazipur             0
City_narayanganj-city    0
dtype: int64

## **Save the Cleaned Data**

In [66]:
# Write the cleaned frame to the processed-data folder. index=False keeps
# the row index out of the CSV.
df.to_csv(
    path_or_buf="/workspaces/Bangladesh-Real-State-Price-Prediction/data/processed/cleaned_data.csv",
    index=False,
)