## Pre Processing and EDA

In [2]:
import pandas as pd

# Load the combined dataset
df = pd.read_csv("../data/processed/combined_data.csv")

# Basic preview
df.head()


Unnamed: 0,house_type,house_size,location,city,latitude,longitude,price,currency,numBathrooms,numBalconies,isNegotiable,priceSqFt,verificationDate,description,SecurityDeposit,Status,City
0,1 RK Studio Apartment,400 sq ft,Kalkaji,Delhi,28.545561,77.254349,22000,INR,1.0,,,,Posted a day ago,"Fully furnished, loaded with amenities & gadge...",No Deposit,Furnished,Delhi
1,1 RK Studio Apartment,400 sq ft,Mansarover Garden,Delhi,28.643259,77.132828,20000,INR,1.0,,,,Posted 9 days ago,Here is an excellent 1 BHK Independent Floor a...,No Deposit,Furnished,Delhi
2,2 BHK Independent Floor,500 sq ft,Uttam Nagar,Delhi,28.618677,77.053352,8500,INR,1.0,,,,Posted 12 days ago,"Zero Brokerage.\n\n2 Room set, Govt bijali Met...",No Deposit,Semi-Furnished,Delhi
3,3 BHK Independent House,"1,020 sq ft",Model Town,Delhi,28.712898,77.18,48000,INR,3.0,,,,Posted a year ago,Itâs a 3 bhk independent house situated in M...,No Deposit,Furnished,Delhi
4,2 BHK Apartment,810 sq ft,Sector 13 Rohini,Delhi,28.723539,77.131424,20000,INR,2.0,,,,Posted a year ago,Well designed 2 bhk multistorey apartment is a...,No Deposit,Unfurnished,Delhi


In [3]:
df.shape

(13910, 17)

In [4]:
df.columns.tolist()

['house_type',
 'house_size',
 'location',
 'city',
 'latitude',
 'longitude',
 'price',
 'currency',
 'numBathrooms',
 'numBalconies',
 'isNegotiable',
 'priceSqFt',
 'verificationDate',
 'description',
 'SecurityDeposit',
 'Status',
 'City']

In [5]:
df.isnull().sum()

house_type              0
house_size              0
location                0
city                    0
latitude                0
longitude               0
price                   0
currency                0
numBathrooms           56
numBalconies         8619
isNegotiable        12634
priceSqFt           13910
verificationDate        0
description           831
SecurityDeposit         0
Status                  0
City                    0
dtype: int64

#### house_type              0
#### house_size              0
#### location                0
#### city                    0
#### latitude                0
#### longitude               0
#### price                   0
#### currency                0
#### numBathrooms           56
#### numBalconies         8619
#### isNegotiable        12634
#### priceSqFt           13910
#### verificationDate        0
#### description           831
#### SecurityDeposit         0
#### Status                  0
#### City                    0


In [6]:
#### numBathrooms           56      int 
#### numBalconies         8619      int 
#### isNegotiable        12634      bool
#### priceSqFt           13910      float64
#### description         831        object



# Gonna fill the numBathrooms with median of the cols
df['numBathrooms'].fillna(df['numBathrooms'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['numBathrooms'].fillna(df['numBathrooms'].median(), inplace=True)


In [7]:
# Now fill the values of numBalconies with 0 as because the it has near about 50% data missing
df['numBalconies'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['numBalconies'].fillna(0, inplace=True)


In [8]:
# We will drop this column as it has more than 80% percentage of missing values so it will be removed 
df.drop(columns=['isNegotiable'], inplace=True)


In [9]:
# Step 1: Remove commas and 'sq ft', then strip whitespace
df['house_size'] = df['house_size'].str.replace(',', '')
df['house_size'] = df['house_size'].str.replace('sq ft', '')
df['house_size'] = df['house_size'].str.strip()

# Step 2: Convert to numeric
df['house_size'] = pd.to_numeric(df['house_size'], errors='coerce')



df['priceSqFt'] = df['price'] / df['house_size']


In [10]:
df.house_size.head( )           

0     400
1     400
2     500
3    1020
4     810
Name: house_size, dtype: int64

In [11]:
df.price.head( )

0    22000
1    20000
2     8500
3    48000
4    20000
Name: price, dtype: int64

In [12]:
df['description'].fillna("No description available", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['description'].fillna("No description available", inplace=True)


In [13]:
df.isnull().sum()

house_type          0
house_size          0
location            0
city                0
latitude            0
longitude           0
price               0
currency            0
numBathrooms        0
numBalconies        0
priceSqFt           0
verificationDate    0
description         0
SecurityDeposit     0
Status              0
City                0
dtype: int64

In [14]:
df.head()

Unnamed: 0,house_type,house_size,location,city,latitude,longitude,price,currency,numBathrooms,numBalconies,priceSqFt,verificationDate,description,SecurityDeposit,Status,City
0,1 RK Studio Apartment,400,Kalkaji,Delhi,28.545561,77.254349,22000,INR,1.0,0.0,55.0,Posted a day ago,"Fully furnished, loaded with amenities & gadge...",No Deposit,Furnished,Delhi
1,1 RK Studio Apartment,400,Mansarover Garden,Delhi,28.643259,77.132828,20000,INR,1.0,0.0,50.0,Posted 9 days ago,Here is an excellent 1 BHK Independent Floor a...,No Deposit,Furnished,Delhi
2,2 BHK Independent Floor,500,Uttam Nagar,Delhi,28.618677,77.053352,8500,INR,1.0,0.0,17.0,Posted 12 days ago,"Zero Brokerage.\n\n2 Room set, Govt bijali Met...",No Deposit,Semi-Furnished,Delhi
3,3 BHK Independent House,1020,Model Town,Delhi,28.712898,77.18,48000,INR,3.0,0.0,47.058824,Posted a year ago,Itâs a 3 bhk independent house situated in M...,No Deposit,Furnished,Delhi
4,2 BHK Apartment,810,Sector 13 Rohini,Delhi,28.723539,77.131424,20000,INR,2.0,0.0,24.691358,Posted a year ago,Well designed 2 bhk multistorey apartment is a...,No Deposit,Unfurnished,Delhi


In [15]:
# Show all columns with object (non-numeric) datatype
categorical_cols = df.select_dtypes(include='object').columns.tolist()
print("Categorical columns:", categorical_cols)

# Show unique values for each to inspect what needs encoding
for col in categorical_cols:
    print(f"\nColumn: {col}")
    print(df[col].unique()[:10])  # Show up to 10 unique values for preview


Categorical columns: ['house_type', 'location', 'city', 'currency', 'verificationDate', 'description', 'SecurityDeposit', 'Status', 'City']

Column: house_type
['1 RK Studio Apartment ' '2 BHK Independent Floor '
 '3 BHK Independent House ' '2 BHK Apartment ' '3 BHK Apartment '
 '3 BHK Independent Floor ' '4 BHK Independent Floor '
 '1 BHK Independent Floor ' '1 BHK Apartment ' '8 BHK Independent Floor ']

Column: location
['Kalkaji' 'Mansarover Garden' 'Uttam Nagar' 'Model Town'
 'Sector 13 Rohini' 'DLF Farms' 'laxmi nagar' 'Swasthya Vihar' 'Janakpuri'
 'Pitampura']

Column: city
['Delhi' 'Mumbai' 'Hisar' 'Pune']

Column: currency
['INR']

Column: verificationDate
['Posted a day ago' 'Posted 9 days ago' 'Posted 12 days ago'
 'Posted a year ago' 'Posted 2 years ago' 'Posted 3 years ago'
 'Posted 2 months ago' 'Posted a month ago' 'Posted 17 days ago'
 'Posted 13 days ago']

Column: description
['Fully furnished, loaded with amenities & gadgets- 1RK + lobby Set, with all facilities, Par

In [16]:
df.info()
print("\n\nSample rows:")
display(df.sample(5))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13910 entries, 0 to 13909
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   house_type        13910 non-null  object 
 1   house_size        13910 non-null  int64  
 2   location          13910 non-null  object 
 3   city              13910 non-null  object 
 4   latitude          13910 non-null  float64
 5   longitude         13910 non-null  float64
 6   price             13910 non-null  int64  
 7   currency          13910 non-null  object 
 8   numBathrooms      13910 non-null  float64
 9   numBalconies      13910 non-null  float64
 10  priceSqFt         13910 non-null  float64
 11  verificationDate  13910 non-null  object 
 12  description       13910 non-null  object 
 13  SecurityDeposit   13910 non-null  object 
 14  Status            13910 non-null  object 
 15  City              13910 non-null  object 
dtypes: float64(5), int64(2), object(9)
memor

Unnamed: 0,house_type,house_size,location,city,latitude,longitude,price,currency,numBathrooms,numBalconies,priceSqFt,verificationDate,description,SecurityDeposit,Status,City
13787,2 BHK Apartment,1000,Hinjewadi,Pune,18.583963,73.697685,17000,INR,2.0,0.0,17.0,Posted 3 years ago,A 2 bhk property is available for rental in Pe...,No Deposit,Unfurnished,Pune
2938,2 BHK Apartment,800,Janakpuri,Delhi,28.624125,77.081459,18000,INR,1.0,1.0,22.5,Posted 16 days ago,An independent lig flat very beautifully renov...,36000,Semi-Furnished,Delhi
7535,1 BHK Apartment,640,Koper Khairane,Mumbai,19.102783,73.010506,27000,INR,1.0,0.0,42.1875,Posted 16 days ago,A spacious 1 bhk multistorey apartment is avai...,No Deposit,Furnished,Mumbai
1785,2 BHK Independent Floor,540,Dabri,Delhi,28.607269,77.089447,12000,INR,1.0,0.0,22.222222,Posted 15 days ago,Your search for a spacious home at affordable ...,No Deposit,Semi-Furnished,Delhi
8725,2 BHK Apartment,800,Virar,Mumbai,19.462004,72.801643,10000,INR,2.0,0.0,12.5,Posted 4 years ago,A spacious 2 bhk multistorey apartment is avai...,No Deposit,Semi-Furnished,Mumbai


In [17]:
# Standardize and compare the two columns
df['city'] = df['city'].str.lower().str.strip()
df['City'] = df['City'].str.lower().str.strip()

# Drop 'City' if both are the same
if (df['city'] == df['City']).all():
    df.drop('City', axis=1, inplace=True)
    print("✅ Dropped 'City' column as it duplicates 'city'")
else:
    print("'city' and 'City' contain mismatched values. Manual inspection needed.")


'city' and 'City' contain mismatched values. Manual inspection needed.


In [18]:
mismatch = df[df['city'].str.lower().str.strip() != df['City'].str.lower().str.strip()]
display(mismatch[['city', 'City']])
print(f"⚠️ Total mismatches found: {len(mismatch)}")


Unnamed: 0,city,City
7724,hisar,mumbai
7842,hisar,mumbai
8548,hisar,mumbai
8559,hisar,mumbai
8849,hisar,mumbai
8945,hisar,mumbai
9409,hisar,mumbai
9420,hisar,mumbai


⚠️ Total mismatches found: 8


In [19]:
# List of index numbers to update
indices_to_update = [7724, 7842, 8548, 8559, 8849, 8945, 9409, 9420]

# Update 'City' column to 'Delhi' for these indices
df.loc[indices_to_update, 'City'] = 'Delhi'

# Drop the 'city' column
df.drop(columns=['city'], inplace=True)


In [20]:
for col in ['house_type', 'location', 'currency', 'Status', 'City']:
    print(f"{col}: {df[col].nunique()} unique values")
    print(df[col].value_counts().head(), '\n')


house_type: 34 unique values
house_type
2 BHK Apartment             4385
1 BHK Apartment             2622
3 BHK Apartment             1837
3 BHK Independent Floor     1571
4 BHK Independent Floor      786
Name: count, dtype: int64 

location: 702 unique values
location
Wagholi         743
Andheri West    362
Thane West      287
Andheri East    284
Kharghar        255
Name: count, dtype: int64 

currency: 1 unique values
currency
INR    13910
Name: count, dtype: int64 

Status: 3 unique values
Status
Unfurnished       5613
Semi-Furnished    5548
Furnished         2749
Name: count, dtype: int64 

City: 4 unique values
City
delhi     5000
mumbai    4992
pune      3910
Delhi        8
Name: count, dtype: int64 



In [21]:
df['SecurityDeposit'].unique()[:10]


array(['No Deposit', ' 13,000', ' 70,000', ' 56,000', ' 46,000',
       ' 50,000', ' 66,000', ' 36,000', ' 65,41,919', ' 60,10,155'],
      dtype=object)

In [22]:
# Drop unnecessary columns
df.drop(['currency', 'verificationDate'], axis=1, inplace=True)

# Confirm the changes
print("Remaining columns:", df.columns.tolist())


Remaining columns: ['house_type', 'house_size', 'location', 'latitude', 'longitude', 'price', 'numBathrooms', 'numBalconies', 'priceSqFt', 'description', 'SecurityDeposit', 'Status', 'City']


In [23]:
def clean_deposit(value):
    if isinstance(value, str):
        value = value.strip()
        if value.lower() == 'no deposit':
            return 0
        return int(value.replace(',', ''))
    return value

# Apply cleaning
df['SecurityDeposit'] = df['SecurityDeposit'].apply(clean_deposit)

# Check results
print(df['SecurityDeposit'].unique()[:10])
print(df['SecurityDeposit'].dtype)


[      0   13000   70000   56000   46000   50000   66000   36000 6541919
 6010155]
int64


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13910 entries, 0 to 13909
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   house_type       13910 non-null  object 
 1   house_size       13910 non-null  int64  
 2   location         13910 non-null  object 
 3   latitude         13910 non-null  float64
 4   longitude        13910 non-null  float64
 5   price            13910 non-null  int64  
 6   numBathrooms     13910 non-null  float64
 7   numBalconies     13910 non-null  float64
 8   priceSqFt        13910 non-null  float64
 9   description      13910 non-null  object 
 10  SecurityDeposit  13910 non-null  int64  
 11  Status           13910 non-null  object 
 12  City             13910 non-null  object 
dtypes: float64(5), int64(3), object(5)
memory usage: 1.4+ MB


In [25]:
print(df['price'].unique()[:20])
print(df['price'].dtype)


[ 22000  20000   8500  48000  11000  35000  39000  90000  10000  15000
  24000  59000  45000  32500 220000  43000  30000  68000  55000  40000]
int64


## grouping the common places into areas and making it suitable for OHE

In [26]:
location_counts = df['location'].value_counts()
print(location_counts.head(10))
print(location_counts.tail(10))
print(len(location_counts))

location
Wagholi           743
Andheri West      362
Thane West        287
Andheri East      284
Kharghar          255
Dhanori           198
Ghansoli          196
Wakad             195
Defence Colony    189
Powai             175
Name: count, dtype: int64
location
Agalambe                  1
Ganj Peth                 1
Talegaon                  1
Pratik Nagar Mohanwadi    1
Shewalewadi               1
 Kharadi                  1
Kasarwadi                 1
Taljai Temple Road        1
Dighi Gaonthan            1
New DP Road               1
Name: count, dtype: int64
702


In [27]:
top_locations = df['location'].value_counts().nlargest(50).index

# Replace all other locations with "Other"
df['location'] = df['location'].apply(lambda x: x if x in top_locations else 'Other')


In [28]:
print(df['house_type'].value_counts())
print(df['house_type'].isnull().sum())


house_type
2 BHK Apartment              4385
1 BHK Apartment              2622
3 BHK Apartment              1837
3 BHK Independent Floor      1571
4 BHK Independent Floor       786
2 BHK Independent Floor       542
1 RK Studio Apartment         434
4 BHK Apartment               297
4 BHK Villa                   273
1 BHK Independent Floor       255
5 BHK Villa                   185
5 BHK Independent Floor       180
5 BHK Independent House       153
4 BHK Independent House       128
1 BHK Independent House        69
3 BHK Villa                    47
3 BHK Independent House        29
2 BHK Independent House        28
5 BHK Apartment                27
2 BHK Villa                    23
6 BHK Apartment                 6
6 BHK Independent Floor         6
6 BHK Villa                     5
1 BHK Villa                     5
6 BHK penthouse                 3
9 BHK Independent House         3
8 BHK Independent Floor         2
7 BHK Independent Floor         2
10 BHK Independent House        2
8 B

In [29]:
# Extract number of bedrooms (e.g., 1, 2, 3...) safely
df['bhk'] = df['house_type'].str.extract(r'(\d+)\s*BHK')[0].astype('Int64')

# Extract property type (e.g., Apartment, Villa, etc.)
df['property_type'] = df['house_type'].str.extract(r'BHK\s+(.*)')[0]

# Drop the original column if not needed
df.drop(columns=['house_type'], inplace=True)


In [30]:
print(df['bhk'].value_counts(dropna=False).sort_index())
print(df['property_type'].value_counts(dropna=False))


bhk
1       2951
2       4978
3       3484
4       1484
5        545
6         21
7          3
8          4
9          3
10         2
12         1
<NA>     434
Name: count, dtype: Int64
property_type
Apartment             9174
Independent Floor     3344
Villa                  539
NaN                    434
Independent House      416
penthouse                3
Name: count, dtype: int64


In [31]:
df['numBathrooms'].value_counts(dropna=False).sort_index()


numBathrooms
1.0     2755
2.0     6205
3.0     2846
4.0     1946
5.0      117
6.0       26
7.0        6
8.0        5
9.0        3
10.0       1
Name: count, dtype: int64

In [32]:
df = df[df['numBathrooms'] < 9].copy()


In [33]:
df['numBathrooms'].value_counts(dropna=False).sort_index()


numBathrooms
1.0    2755
2.0    6205
3.0    2846
4.0    1946
5.0     117
6.0      26
7.0       6
8.0       5
Name: count, dtype: int64

In [34]:
df['numBalconies'].value_counts(dropna=False).sort_index()


numBalconies
0.0    8617
1.0    1561
2.0    3204
3.0     369
4.0     133
5.0      16
6.0       5
8.0       1
Name: count, dtype: int64

In [35]:
df = df[df['numBalconies'] <= 6]


In [36]:
df['numBalconies'].value_counts().sort_index()


numBalconies
0.0    8617
1.0    1561
2.0    3204
3.0     369
4.0     133
5.0      16
6.0       5
Name: count, dtype: int64

In [37]:
df['priceSqFt'].describe()


count    13905.000000
mean        47.414322
std         39.476161
min          4.000000
25%         20.576132
50%         32.941176
75%         62.500000
max        382.024843
Name: priceSqFt, dtype: float64

4sq home baba re 

In [38]:
df['priceSqFt'].value_counts(dropna=False).head(20)


priceSqFt
20.000000     248
25.000000     158
105.396359    149
33.333333     141
50.000000     121
30.000000     107
40.000000      95
26.666667      94
66.666667      91
22.222222      87
51.053596      85
16.666667      84
68.014247      84
23.333333      83
105.399860     67
100.000000     62
55.555556      58
10.000000      57
51.055292      57
46.160405      56
Name: count, dtype: int64

In [39]:
df.drop(columns=['description'], inplace=True)


In [40]:
df['location'].value_counts(dropna=False)


location
Other                 6470
Wagholi                743
Andheri West           362
Thane West             287
Andheri East           284
Kharghar               255
Dhanori                198
Ghansoli               196
Wakad                  195
Defence Colony         189
Powai                  175
Chembur                158
Hinjewadi              157
Borivali East          153
Greater kailash 1      153
Santacruz East         151
Chattarpur             145
Kharadi                145
Vasant Vihar           145
Lohegaon               142
Jor bagh               125
Safdarjung Enclave     118
Baner                  117
Hadapsar               116
Kalyan West            115
Goregaon East          114
Mira Road East         112
Saket                  111
Dombivali              110
Goregaon West          108
Mulund West            108
Hauz Khas              107
Greater Kailash        107
Panchsheel Park        105
Golf Links             105
Kondhwa                103
Bandra West        

In [41]:
# Step 1: Group rare locations as 'Other'
location_counts = df['location'].value_counts()
rare_locations = location_counts[location_counts < 50].index
df['location'] = df['location'].replace(rare_locations, 'Other')

# Step 2: Label encode
from sklearn.preprocessing import LabelEncoder
le_location = LabelEncoder()
df['location'] = le_location.fit_transform(df['location'])


In [42]:
df['latitude'].describe()
df['latitude'].isna().sum()


np.int64(0)

In [43]:
print("Min:", df['latitude'].min())
print("Max:", df['latitude'].max())


Min: 17.46821976
Max: 31.10658646


In [44]:
print(df['longitude'].isnull().sum())
print("Min:", df['longitude'].min())
print("Max:", df['longitude'].max())


0
Min: 18.49789619
Max: 91.79213715


In [45]:
swapped = df[
    (df['latitude'].between(68, 97)) &
    (df['longitude'].between(6, 37))
]
print("Potentially swapped rows:", len(swapped))
swapped.head()


Potentially swapped rows: 0


Unnamed: 0,house_size,location,latitude,longitude,price,numBathrooms,numBalconies,priceSqFt,SecurityDeposit,Status,City,bhk,property_type


In [46]:
df = df[df['longitude'].between(68, 97)]


In [47]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 13900 entries, 0 to 13909
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   house_size       13900 non-null  int64  
 1   location         13900 non-null  int64  
 2   latitude         13900 non-null  float64
 3   longitude        13900 non-null  float64
 4   price            13900 non-null  int64  
 5   numBathrooms     13900 non-null  float64
 6   numBalconies     13900 non-null  float64
 7   priceSqFt        13900 non-null  float64
 8   SecurityDeposit  13900 non-null  int64  
 9   Status           13900 non-null  object 
 10  City             13900 non-null  object 
 11  bhk              13466 non-null  Int64  
 12  property_type    13466 non-null  object 
dtypes: Int64(1), float64(5), int64(4), object(3)
memory usage: 1.5+ MB


In [48]:
df[['bhk', 'property_type']].isnull().sum()


bhk              434
property_type    434
dtype: int64

In [49]:
df[df['bhk'].isnull() | df['property_type'].isnull()]


Unnamed: 0,house_size,location,latitude,longitude,price,numBathrooms,numBalconies,priceSqFt,SecurityDeposit,Status,City,bhk,property_type
0,400,37,28.545561,77.254349,22000,1.0,0.0,55.000000,0,Furnished,delhi,,
1,400,37,28.643259,77.132828,20000,1.0,0.0,50.000000,0,Furnished,delhi,,
29,300,37,28.653690,77.169159,15500,1.0,0.0,51.666667,0,Furnished,delhi,,
51,400,21,28.558958,77.204079,20000,1.0,0.0,50.000000,0,Semi-Furnished,delhi,,
67,400,37,28.560387,77.212837,14000,1.0,0.0,35.000000,0,Furnished,delhi,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13856,400,37,18.657989,73.774261,8500,1.0,1.0,21.250000,20000,Semi-Furnished,pune,,
13859,250,37,18.657471,73.768379,6000,1.0,1.0,24.000000,10000,Semi-Furnished,pune,,
13860,350,37,18.657471,73.768379,8500,2.0,1.0,24.285714,15000,Semi-Furnished,pune,,
13865,400,37,18.649200,73.765778,6000,1.0,1.0,15.000000,15000,Unfurnished,pune,,


In [50]:
df = df.dropna(subset=['bhk', 'property_type'])


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13466 entries, 2 to 13909
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   house_size       13466 non-null  int64  
 1   location         13466 non-null  int64  
 2   latitude         13466 non-null  float64
 3   longitude        13466 non-null  float64
 4   price            13466 non-null  int64  
 5   numBathrooms     13466 non-null  float64
 6   numBalconies     13466 non-null  float64
 7   priceSqFt        13466 non-null  float64
 8   SecurityDeposit  13466 non-null  int64  
 9   Status           13466 non-null  object 
 10  City             13466 non-null  object 
 11  bhk              13466 non-null  Int64  
 12  property_type    13466 non-null  object 
dtypes: Int64(1), float64(5), int64(4), object(3)
memory usage: 1.5+ MB


In [52]:
df['priceSqFt'].describe()


count    13466.000000
mean        47.468209
std         39.701155
min          4.000000
25%         20.519836
50%         32.577849
75%         62.500000
max        382.024843
Name: priceSqFt, dtype: float64

In [53]:
df['priceSqFt'].isnull().sum()


np.int64(0)

In [54]:
df['Status'].describe()

count           13466
unique              3
top       Unfurnished
freq             5517
Name: Status, dtype: object

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13466 entries, 2 to 13909
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   house_size       13466 non-null  int64  
 1   location         13466 non-null  int64  
 2   latitude         13466 non-null  float64
 3   longitude        13466 non-null  float64
 4   price            13466 non-null  int64  
 5   numBathrooms     13466 non-null  float64
 6   numBalconies     13466 non-null  float64
 7   priceSqFt        13466 non-null  float64
 8   SecurityDeposit  13466 non-null  int64  
 9   Status           13466 non-null  object 
 10  City             13466 non-null  object 
 11  bhk              13466 non-null  Int64  
 12  property_type    13466 non-null  object 
dtypes: Int64(1), float64(5), int64(4), object(3)
memory usage: 1.5+ MB


## TEST TRAIN SPLIT

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Backup your target
y = df['price']

# Drop target column
X = df.drop(columns=['price'])


In [60]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [61]:
categorical_cols = ['Status', 'City', 'property_type']
numerical_cols = X.select_dtypes(include=['int64', 'float64', 'Int64']).columns.tolist()

# Remove 'price' and categorical ones from numeric list if present
numerical_cols = [col for col in numerical_cols if col not in categorical_cols]


In [64]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit only on train categorical columns
ohe.fit(X_train[categorical_cols])

# Transform both
X_train_cat = pd.DataFrame(
    ohe.transform(X_train[categorical_cols]),
    columns=ohe.get_feature_names_out(categorical_cols),
    index=X_train.index
)

X_test_cat = pd.DataFrame(
    ohe.transform(X_test[categorical_cols]),
    columns=ohe.get_feature_names_out(categorical_cols),
    index=X_test.index
)


In [65]:
X_train_final = pd.concat([X_train[numerical_cols], X_train_cat], axis=1)
X_test_final = pd.concat([X_test[numerical_cols], X_test_cat], axis=1)


In [66]:
X_train_final

Unnamed: 0,house_size,location,latitude,longitude,numBathrooms,numBalconies,priceSqFt,SecurityDeposit,bhk,Status_Furnished,...,Status_Unfurnished,City_Delhi,City_delhi,City_mumbai,City_pune,property_type_Apartment,property_type_Independent Floor,property_type_Independent House,property_type_Villa,property_type_penthouse
13560,960,50,18.599226,73.763298,2.0,1.0,15.625000,40000,2,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
11812,1102,33,18.620728,73.915016,2.0,0.0,18.148820,60000,2,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
7854,2000,37,18.958918,72.812401,4.0,2.0,150.000000,1800000,3,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
5767,990,35,19.283638,72.873894,2.0,0.0,22.222222,0,2,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
12320,1034,11,21.875740,83.992943,2.0,0.0,14.506770,0,2,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5352,650,45,19.227955,72.968185,2.0,0.0,49.230769,0,2,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
13858,650,37,18.644442,73.748817,1.0,0.0,16.153846,30000,1,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5557,1100,16,19.153334,72.885765,3.0,0.0,34.545455,0,3,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
875,1800,37,28.538738,77.249092,3.0,0.0,55.555556,0,3,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [67]:
X_train_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10772 entries, 13560 to 7485
Data columns (total 21 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   house_size                        10772 non-null  int64  
 1   location                          10772 non-null  int64  
 2   latitude                          10772 non-null  float64
 3   longitude                         10772 non-null  float64
 4   numBathrooms                      10772 non-null  float64
 5   numBalconies                      10772 non-null  float64
 6   priceSqFt                         10772 non-null  float64
 7   SecurityDeposit                   10772 non-null  int64  
 8   bhk                               10772 non-null  Int64  
 9   Status_Furnished                  10772 non-null  float64
 10  Status_Semi-Furnished             10772 non-null  float64
 11  Status_Unfurnished                10772 non-null  float64
 12  City_D

## Lets go with Linear Regression

In [70]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# 1. Initialize and Train
lr_model = LinearRegression()
lr_model.fit(X_train_final, y_train)

# 2. Predict
y_train_pred = lr_model.predict(X_train_final)
y_test_pred = lr_model.predict(X_test_final)

# 3. Evaluate
def evaluate(y_true, y_pred, dataset_name):
    print(f"\n📊 Evaluation on {dataset_name}:")
    print("MAE:", mean_absolute_error(y_true, y_pred))
    print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)))
    print("R² Score:", r2_score(y_true, y_pred))

def extended_evaluate(y_true, y_pred, dataset_name, n_features):
    n = len(y_true)
    p = n_features
    
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    adj_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))
    
    print(f"\n📊 Evaluation on {dataset_name}:")
    print("MAE:", mae)
    print("RMSE:", rmse)
    print("R² Score:", r2)
    print("Adjusted R²:", adj_r2)
    print("MAPE:", mape, "%")


extended_evaluate(y_train, y_train_pred, "Train Set", X_train_final.shape[1])
extended_evaluate(y_test, y_test_pred, "Test Set", X_test_final.shape[1])



📊 Evaluation on Train Set:
MAE: 27871.404929420656
RMSE: 48743.768551141395
R² Score: 0.9361622454183319
Adjusted R²: 0.9360375391070561
MAPE: 77.24435614464402 %

📊 Evaluation on Test Set:
MAE: 29361.612248904676
RMSE: 57605.619967846105
R² Score: 0.9208449483127639
Adjusted R²: 0.9202228464843837
MAPE: 76.7514583812064 %


In [73]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import datetime

def log_metrics(y_true, y_pred, dataset_name, model_name, n_features, log_file='LR_model_metrics.log'):
    n = len(y_true)
    p = n_features

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    adj_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

    log_entry = f"""
📅 Date: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
🧠 Model: {model_name}
📂 Dataset: {dataset_name}
🧮 Features: {p}
📊 MAE: {mae:.4f}
📊 RMSE: {rmse:.4f}
📊 R² Score: {r2:.4f}
📊 Adjusted R²: {adj_r2:.4f}
📊 MAPE: {mape:.2f}%

{'='*60}
"""

    print(log_entry)  # Also print on console

    # Append to log file
    with open(log_file, 'a') as f:
        f.write(log_entry)