In [4]:
import pandas as pd

# Load the combined dataset
df = pd.read_csv("../data/processed/combined_data.csv")

# Basic preview
df.head()


Unnamed: 0,house_type,house_size,location,city,latitude,longitude,price,currency,numBathrooms,numBalconies,isNegotiable,priceSqFt,verificationDate,description,SecurityDeposit,Status,City
0,1 RK Studio Apartment,400 sq ft,Kalkaji,Delhi,28.545561,77.254349,22000,INR,1.0,,,,Posted a day ago,"Fully furnished, loaded with amenities & gadge...",No Deposit,Furnished,Delhi
1,1 RK Studio Apartment,400 sq ft,Mansarover Garden,Delhi,28.643259,77.132828,20000,INR,1.0,,,,Posted 9 days ago,Here is an excellent 1 BHK Independent Floor a...,No Deposit,Furnished,Delhi
2,2 BHK Independent Floor,500 sq ft,Uttam Nagar,Delhi,28.618677,77.053352,8500,INR,1.0,,,,Posted 12 days ago,"Zero Brokerage.\n\n2 Room set, Govt bijali Met...",No Deposit,Semi-Furnished,Delhi
3,3 BHK Independent House,"1,020 sq ft",Model Town,Delhi,28.712898,77.18,48000,INR,3.0,,,,Posted a year ago,Itâs a 3 bhk independent house situated in M...,No Deposit,Furnished,Delhi
4,2 BHK Apartment,810 sq ft,Sector 13 Rohini,Delhi,28.723539,77.131424,20000,INR,2.0,,,,Posted a year ago,Well designed 2 bhk multistorey apartment is a...,No Deposit,Unfurnished,Delhi


In [5]:
df.shape

(13910, 17)

In [6]:
df.columns.tolist()

['house_type',
 'house_size',
 'location',
 'city',
 'latitude',
 'longitude',
 'price',
 'currency',
 'numBathrooms',
 'numBalconies',
 'isNegotiable',
 'priceSqFt',
 'verificationDate',
 'description',
 'SecurityDeposit',
 'Status',
 'City']

In [7]:
df.isnull().sum()

house_type              0
house_size              0
location                0
city                    0
latitude                0
longitude               0
price                   0
currency                0
numBathrooms           56
numBalconies         8619
isNegotiable        12634
priceSqFt           13910
verificationDate        0
description           831
SecurityDeposit         0
Status                  0
City                    0
dtype: int64

#### house_type              0
#### house_size              0
#### location                0
#### city                    0
#### latitude                0
#### longitude               0
#### price                   0
#### currency                0
#### numBathrooms           56
#### numBalconies         8619
#### isNegotiable        12634
#### priceSqFt           13910
#### verificationDate        0
#### description           831
#### SecurityDeposit         0
#### Status                  0
#### City                    0


In [8]:
#### numBathrooms           56      int 
#### numBalconies         8619      int 
#### isNegotiable        12634      bool
#### priceSqFt           13910      float64
#### description         831        object



# Gonna fill the numBathrooms with median of the cols
df['numBathrooms'].fillna(df['numBathrooms'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['numBathrooms'].fillna(df['numBathrooms'].median(), inplace=True)


In [9]:
# Now fill the values of numBalconies with 0 as because the it has near about 50% data missing
df['numBalconies'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['numBalconies'].fillna(0, inplace=True)


In [10]:
# We will drop this column as it has more than 80% percentage of missing values so it will be removed 
df.drop(columns=['isNegotiable'], inplace=True)


In [89]:
# Step 1: Remove commas and 'sq ft', then strip whitespace
df['house_size'] = df['house_size'].str.replace(',', '')
df['house_size'] = df['house_size'].str.replace('sq ft', '')
df['house_size'] = df['house_size'].str.strip()

# Step 2: Convert to numeric
df['house_size'] = pd.to_numeric(df['house_size'], errors='coerce')



df['priceSqFt'] = df['price'] / df['house_size']


In [None]:
df.house_size.head( )           

0      400 sq ft
1      400 sq ft
2      500 sq ft
3    1,020 sq ft
4      810 sq ft
Name: house_size, dtype: object

In [14]:
df.price.head( )

0    22000
1    20000
2     8500
3    48000
4    20000
Name: price, dtype: int64

In [91]:
df['description'].fillna("No description available", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['description'].fillna("No description available", inplace=True)


In [92]:
df.isnull().sum()

house_type          0
house_size          0
location            0
city                0
latitude            0
longitude           0
price               0
currency            0
numBathrooms        0
numBalconies        0
priceSqFt           0
verificationDate    0
description         0
SecurityDeposit     0
Status              0
City                0
dtype: int64

In [93]:
df.head()

Unnamed: 0,house_type,house_size,location,city,latitude,longitude,price,currency,numBathrooms,numBalconies,priceSqFt,verificationDate,description,SecurityDeposit,Status,City
0,1 RK Studio Apartment,400,Kalkaji,Delhi,28.545561,77.254349,22000,INR,1.0,0.0,55.0,Posted a day ago,"Fully furnished, loaded with amenities & gadge...",No Deposit,Furnished,Delhi
1,1 RK Studio Apartment,400,Mansarover Garden,Delhi,28.643259,77.132828,20000,INR,1.0,0.0,50.0,Posted 9 days ago,Here is an excellent 1 BHK Independent Floor a...,No Deposit,Furnished,Delhi
2,2 BHK Independent Floor,500,Uttam Nagar,Delhi,28.618677,77.053352,8500,INR,1.0,0.0,17.0,Posted 12 days ago,"Zero Brokerage.\n\n2 Room set, Govt bijali Met...",No Deposit,Semi-Furnished,Delhi
3,3 BHK Independent House,1020,Model Town,Delhi,28.712898,77.18,48000,INR,3.0,0.0,47.058824,Posted a year ago,Itâs a 3 bhk independent house situated in M...,No Deposit,Furnished,Delhi
4,2 BHK Apartment,810,Sector 13 Rohini,Delhi,28.723539,77.131424,20000,INR,2.0,0.0,24.691358,Posted a year ago,Well designed 2 bhk multistorey apartment is a...,No Deposit,Unfurnished,Delhi
