In [1]:
import kagglehub

# Download latest version
# path = kagglehub.dataset_download("amitabhajoy/bengaluru-house-price-data")

# print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

# Load the raw housing CSV. The path is relative, so the file must sit in the
# notebook's working directory.
df = pd.read_csv('Bengaluru_House_Data.csv')

# Summarise column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [3]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [4]:
# Count the missing values in each column.
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

Cleaning Data
A. Deleting Rows
You can remove rows that have missing values. This is a good option for columns like location (1 missing) and size (16 missing), as you won't lose much of your overall dataset.

In [5]:
# Drop the few rows that have no location or no size. The result is reassigned
# rather than using inplace=True, which pandas discourages and which prevents
# method chaining.
df = df.dropna(subset=['location', 'size'])
df.isnull().sum()

area_type          0
availability       0
location           0
size               0
society         5499
total_sqft         0
bath              57
balcony          593
price              0
dtype: int64

B. Deleting Columns
You can remove an entire column if it has too many missing values to be useful. Your society column, with 5502 missing values, is a perfect candidate for this.

In [6]:
# Drop the sparsely populated 'society' column. `columns=` states the axis
# explicitly (equivalent to axis=1), and the result is reassigned instead of
# mutating the frame with inplace=True.
df = df.drop(columns=['society'])
df.isnull().sum()

area_type         0
availability      0
location          0
size              0
total_sqft        0
bath             57
balcony         593
price             0
dtype: int64

Standard Deviation is a measure of spread
Low SD = Data is closely clustered
High SD = Data is dispersed over wider range of values

Filling Missing Values (Imputation)
A. For Numerical Columns (bath, balcony)
For columns with numbers, you can use a measure of central tendency.

Median: The middle value. This is the safest and most common choice, especially if the data might have outliers (extreme high or low values).

Mean: The average value. Be cautious with this if you have outliers, as they can skew the average.

In [7]:
# Median of the observed bathroom counts (missing values are skipped).
median_bath = df['bath'].median()

# Replace the missing bathroom counts with that median.
df['bath'] = df['bath'].fillna(value=median_bath)
df.isna().sum()

area_type         0
availability      0
location          0
size              0
total_sqft        0
bath              0
balcony         593
price             0
dtype: int64

B. For Categorical Columns (size)
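For columns containing categories (like size, which might have values like '2 BHK', '3 BHK', etc.), you should use the mode. Note: the rows with a missing size were already dropped in the "Deleting Rows" step, so there is nothing left to fill in this dataset; the cell below only demonstrates the technique.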

Mode: The most frequently occurring value in the column.

In [8]:
# Most frequent 'size' label. Series.mode() returns a Series (there can be
# ties), so take its first entry.
mode_size = df['size'].mode().iloc[0]  # 2 BHK
# Fill any missing 'size' entries with that label.
# NOTE: rows with a missing 'size' were already dropped earlier, so this fill
# leaves the column unchanged here.
df['size'] = df['size'].fillna(value=mode_size)
df.isna().sum()

area_type         0
availability      0
location          0
size              0
total_sqft        0
bath              0
balcony         593
price             0
dtype: int64

C. Using Logic or a Constant Value
Sometimes, a missing value has an implied meaning. For your balcony column, it's possible that a missing value actually means the property has 0 balconies. If this assumption makes sense for your data, you can fill with a constant value.

In [9]:
# Treat a missing balcony count as "no balcony" and fill it with 0.
df['balcony'] = df['balcony'].fillna(value=0)
df.isna().sum()

area_type       0
availability    0
location        0
size            0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

Fixing Incorrect Data Types


In [10]:
# Re-check dtypes after imputation: 'size' and 'total_sqft' are still object.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13303 entries, 0 to 13319
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13303 non-null  object 
 1   availability  13303 non-null  object 
 2   location      13303 non-null  object 
 3   size          13303 non-null  object 
 4   total_sqft    13303 non-null  object 
 5   bath          13303 non-null  float64
 6   balcony       13303 non-null  float64
 7   price         13303 non-null  float64
dtypes: float64(3), object(5)
memory usage: 935.4+ KB


Clean the size column
The size column contains strings like '2 BHK', '3 Bedroom', etc. We only need the number. We can create a new column, let's call it bhk, to store this numeric value.

In [11]:
# Take the leading token of each 'size' string (e.g. '2' from '2 BHK') using
# the vectorised .str accessor, and store it as an integer bedroom count.
df['bhk'] = df['size'].str.split(' ').str[0].astype('int64')

# We don't need the original 'size' column anymore
df = df.drop(columns='size')
print(df)

                  area_type   availability                  location  \
0      Super built-up  Area         19-Dec  Electronic City Phase II   
1                Plot  Area  Ready To Move          Chikka Tirupathi   
2            Built-up  Area  Ready To Move               Uttarahalli   
3      Super built-up  Area  Ready To Move        Lingadheeranahalli   
4      Super built-up  Area  Ready To Move                  Kothanur   
...                     ...            ...                       ...   
13315        Built-up  Area  Ready To Move                Whitefield   
13316  Super built-up  Area  Ready To Move             Richards Town   
13317        Built-up  Area  Ready To Move     Raja Rajeshwari Nagar   
13318  Super built-up  Area         18-Jun           Padmanabhanagar   
13319  Super built-up  Area  Ready To Move              Doddathoguru   

      total_sqft  bath  balcony   price  bhk  
0           1056   2.0      1.0   39.07    2  
1           2600   5.0      3.0  120.00  

Clean the total_sqft column
This column has the object dtype because it contains ranges (e.g., '1000 - 1200') and other non-numeric values (e.g., '34.46Sq. Meter'). We need a function that converts each range into a single number (its average) and handles the other variations.

In [12]:
def is_float(x):
    """Return True if ``float(x)`` succeeds, otherwise False.

    Only the conversion failures raised by ``float`` count as "not a float".
    Anything else (for example KeyboardInterrupt) propagates to the caller
    instead of being silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Preview the first ten rows whose total_sqft cannot be parsed as a float.
df.loc[~df['total_sqft'].map(is_float)].head(10)

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,bhk
30,Super built-up Area,19-Dec,Yelahanka,2100 - 2850,4.0,0.0,186.0,4
56,Built-up Area,20-Feb,Devanahalli,3010 - 3410,2.0,0.0,192.0,4
81,Built-up Area,18-Oct,Hennur Road,2957 - 3450,2.0,0.0,224.5,4
122,Super built-up Area,18-Mar,Hebbal,3067 - 8156,4.0,0.0,477.0,4
137,Super built-up Area,19-Mar,8th Phase JP Nagar,1042 - 1105,2.0,0.0,54.005,2
165,Super built-up Area,18-Dec,Sarjapur,1145 - 1340,2.0,0.0,43.49,2
188,Super built-up Area,Ready To Move,KR Puram,1015 - 1540,2.0,0.0,56.8,2
224,Super built-up Area,19-Dec,Devanahalli,1520 - 1740,2.0,0.0,74.82,3
410,Super built-up Area,Ready To Move,Kengeri,34.46Sq. Meter,1.0,0.0,18.5,1
549,Super built-up Area,18-Sep,Hennur Road,1195 - 1440,2.0,0.0,63.77,2


In [13]:
import numpy as np
def convert_sqft_to_num(x):
    """Convert a raw ``total_sqft`` entry to a single float.

    Handles three shapes of input:
      * a plain number (``'1056'`` or an already-numeric value) -> that number;
      * a range with exactly one dash (``'2100 - 2850'``) -> the midpoint;
      * anything else (``'34.46Sq. Meter'``, ``None``) -> ``np.nan``.

    Malformed strings yield NaN instead of raising, and numeric input is
    passed through, so the function can be applied to a column that has
    already been converted.
    """
    # Try the direct conversion first so values such as '-5' or '1e-3', which
    # contain a dash but are valid numbers, are not mistaken for ranges.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    if isinstance(x, str):
        tokens = x.split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                pass  # e.g. 'abc - def': shaped like a range but not numeric

    return np.nan  # Return NaN for values like '34.46Sq. Meter'

# Convert every total_sqft entry to a float; entries that cannot be parsed
# come back as NaN.
df['total_sqft'] = df['total_sqft'].map(convert_sqft_to_num)

# Discard the handful of rows that could not be converted (e.g. 'Sq. Meter').
df = df.dropna(subset=['total_sqft'])

In [14]:
# Confirm that total_sqft is now float64 and the new bhk column is int64.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13257 entries, 0 to 13319
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13257 non-null  object 
 1   availability  13257 non-null  object 
 2   location      13257 non-null  object 
 3   total_sqft    13257 non-null  float64
 4   bath          13257 non-null  float64
 5   balcony       13257 non-null  float64
 6   price         13257 non-null  float64
 7   bhk           13257 non-null  int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 932.1+ KB
