In [None]:
import kagglehub

# Download latest version
# path = kagglehub.dataset_download("amitabhajoy/bengaluru-house-price-data")

# print("Path to dataset files:", path)

: 

In [None]:
import pandas as pd

# Load the raw listings from the CSV in the working directory
df = pd.read_csv(filepath_or_buffer='Bengaluru_House_Data.csv')

# Overview of the columns: dtypes and non-null counts
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# Count the missing values in each column
df.isna().sum()

Cleaning Data
A. Deleting Rows
You can remove rows that have missing values. This is a good option for columns like location (1 missing) and size (16 missing), as you won't lose much of your overall dataset.

In [None]:
# Drop the rows that are missing 'location' or 'size'.
# Reassigning the result (instead of inplace=True) avoids mutating the frame
# object created by the loading cell and keeps the step chainable.
df = df.dropna(subset=['location', 'size'])

# Re-check the remaining missing values per column
df.isnull().sum()

B. Deleting Columns
You can remove an entire column if it has too many missing values to be useful. Your society column, with 5502 missing values, is a perfect candidate for this.

In [None]:
# Remove the 'society' column entirely.
# columns= names the target axis explicitly (equivalent to axis=1; axis=0 would
# address rows). The result is reassigned rather than using inplace=True so the
# frame is not mutated behind earlier cells.
df = df.drop(columns='society')

# Re-check the remaining missing values per column
df.isnull().sum()

Standard Deviation is a measure of spread
Low SD = Data is closely clustered
High SD = Data is dispersed over a wider range of values

Filling Missing Values (Imputation)
A. For Numerical Columns (bath, balcony)
For columns with numbers, you can use a measure of central tendency.

Median: The middle value. This is the safest and most common choice, especially if the data might have outliers (extreme high or low values).

Mean: The average value. Be cautious with this if you have outliers, as they can skew the average.

In [None]:
# The median is robust to outliers, so use it as the fill value for 'bath'
median_bath = df['bath'].median()

# A dict passed to fillna restricts the fill to the named column
df = df.fillna({'bath': median_bath})
df.isna().sum()

B. For Categorical Columns (size)
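For columns containing categories (like size, which might have values like '2 BHK', '3 BHK', etc.), you should use the mode. Note that the rows with a missing size were already dropped in step A, so on this dataset the fill below changes nothing; it is shown to illustrate the technique.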

Mode: The most frequently occurring value in the column.

In [None]:
# Series.mode() returns a Series of the most frequent value(s); [0] picks the first
mode_size = df['size'].mode()[0]  # 2 BHK

# Replace any missing 'size' entries with that most common category
df = df.fillna({'size': mode_size})
df.isna().sum()

C. Using Logic or a Constant Value
Sometimes, a missing value has an implied meaning. For your balcony column, it's possible that a missing value actually means the property has 0 balconies. If this assumption makes sense for your data, you can fill with a constant value.

In [None]:
# Treat a missing balcony count as "no balcony" and fill it with 0
df = df.assign(balcony=df['balcony'].fillna(0))
df.isna().sum()

Fixing Incorrect Data Types


In [None]:
# Inspect the column dtypes before fixing the ones stored with the wrong type
df.info()

Clean the size column
The size column contains strings like '2 BHK', '3 Bedroom', etc. We only need the number. We can create a new column, let's call it bhk, to store this numeric value.

In [None]:
# Take the text before the first space in 'size' (e.g. '2' from '2 BHK') and
# store it as an integer. The vectorised .str accessor replaces a per-row lambda.
df['bhk'] = df['size'].str.split(' ').str[0].astype('int64')

# We don't need the original 'size' column anymore
df = df.drop(columns='size')

# Preview a few rows; printing the whole frame floods the output with plain text
df.head()

Clean the total_sqft column
This column has the object dtype because it contains ranges (e.g., '1000 - 1200') and other non-numeric values (e.g., '34.46Sq. Meter'). We need a function that converts each range into a single number (the average of its two ends) and marks the values it cannot parse as missing.

In [None]:
def is_float(x):
    """Return True if ``x`` can be converted with ``float()``, otherwise False.

    Only conversion failures (TypeError, ValueError, OverflowError) are treated
    as "not a float". Unrelated exceptions such as KeyboardInterrupt are no
    longer swallowed and propagate to the caller.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Preview up to ten rows whose total_sqft cannot be parsed as a float
not_numeric = ~df['total_sqft'].map(is_float)
df.loc[not_numeric].head(10)

In [None]:
import numpy as np
def convert_sqft_to_num(x):
    """Convert a ``total_sqft`` entry to a single float.

    * A string of the form ``'a - b'`` with two numeric ends becomes their average.
    * Anything ``float()`` accepts (numeric strings, ints, floats) is returned
      as a float.
    * Everything else (e.g. ``'34.46Sq. Meter'``, ``None``, a malformed range)
      becomes ``np.nan`` instead of raising.
    """
    if isinstance(x, str):
        tokens = x.split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # Not a numeric range (e.g. 'abc-def' or '-5'); fall through
                # to the plain float conversion below.
                pass
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan  # Return NaN for values like '34.46Sq. Meter'

# Convert every total_sqft entry to a number
parsed_sqft = df['total_sqft'].apply(convert_sqft_to_num)

# Store the parsed values and discard the rows that could not be converted
# (the converter yields NaN for entries such as 'Sq. Meter' values)
df = df.assign(total_sqft=parsed_sqft).dropna(subset=['total_sqft'])

In [None]:
# Confirm the dtypes and non-null counts after the cleaning steps
df.info()

In [None]:
# Write the cleaned frame to a CSV file in the working directory.
# NOTE(review): to_csv writes the row index by default, so the file gets a
# leading unnamed column; pass index=False if downstream readers do not expect
# it (confirm with consumers of this file before changing).
df.to_csv('cleaned_bengalur.csv')