## Pre Processing and EDA

In [1]:
import pandas as pd

# Load the combined dataset
df = pd.read_csv("../data/processed/combined_data.csv")

# Basic preview
df.head()


Unnamed: 0,house_type,house_size,location,city,latitude,longitude,price,currency,numBathrooms,numBalconies,isNegotiable,priceSqFt,verificationDate,description,SecurityDeposit,Status,City
0,1 RK Studio Apartment,400 sq ft,Kalkaji,Delhi,28.545561,77.254349,22000,INR,1.0,,,,Posted a day ago,"Fully furnished, loaded with amenities & gadge...",No Deposit,Furnished,Delhi
1,1 RK Studio Apartment,400 sq ft,Mansarover Garden,Delhi,28.643259,77.132828,20000,INR,1.0,,,,Posted 9 days ago,Here is an excellent 1 BHK Independent Floor a...,No Deposit,Furnished,Delhi
2,2 BHK Independent Floor,500 sq ft,Uttam Nagar,Delhi,28.618677,77.053352,8500,INR,1.0,,,,Posted 12 days ago,"Zero Brokerage.\n\n2 Room set, Govt bijali Met...",No Deposit,Semi-Furnished,Delhi
3,3 BHK Independent House,"1,020 sq ft",Model Town,Delhi,28.712898,77.18,48000,INR,3.0,,,,Posted a year ago,Itâs a 3 bhk independent house situated in M...,No Deposit,Furnished,Delhi
4,2 BHK Apartment,810 sq ft,Sector 13 Rohini,Delhi,28.723539,77.131424,20000,INR,2.0,,,,Posted a year ago,Well designed 2 bhk multistorey apartment is a...,No Deposit,Unfurnished,Delhi


In [2]:
df.shape

(13910, 17)

In [3]:
df.columns.tolist()

['house_type',
 'house_size',
 'location',
 'city',
 'latitude',
 'longitude',
 'price',
 'currency',
 'numBathrooms',
 'numBalconies',
 'isNegotiable',
 'priceSqFt',
 'verificationDate',
 'description',
 'SecurityDeposit',
 'Status',
 'City']

In [4]:
df.isnull().sum()

house_type              0
house_size              0
location                0
city                    0
latitude                0
longitude               0
price                   0
currency                0
numBathrooms           56
numBalconies         8619
isNegotiable        12634
priceSqFt           13910
verificationDate        0
description           831
SecurityDeposit         0
Status                  0
City                    0
dtype: int64

#### house_type              0
#### house_size              0
#### location                0
#### city                    0
#### latitude                0
#### longitude               0
#### price                   0
#### currency                0
#### numBathrooms           56
#### numBalconies         8619
#### isNegotiable        12634
#### priceSqFt           13910
#### verificationDate        0
#### description           831
#### SecurityDeposit         0
#### Status                  0
#### City                    0


In [5]:
#### numBathrooms           56      int 
#### numBalconies         8619      int 
#### isNegotiable        12634      bool
#### priceSqFt           13910      float64
#### description         831        object



# Gonna fill the numBathrooms with median of the cols
df['numBathrooms'].fillna(df['numBathrooms'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['numBathrooms'].fillna(df['numBathrooms'].median(), inplace=True)


In [6]:
# Now fill the values of numBalconies with 0 as because the it has near about 50% data missing
df['numBalconies'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['numBalconies'].fillna(0, inplace=True)


In [7]:
# We will drop this column as it has more than 80% percentage of missing values so it will be removed 
df.drop(columns=['isNegotiable'], inplace=True)


In [8]:
# Step 1: Remove commas and 'sq ft', then strip whitespace
df['house_size'] = df['house_size'].str.replace(',', '')
df['house_size'] = df['house_size'].str.replace('sq ft', '')
df['house_size'] = df['house_size'].str.strip()

# Step 2: Convert to numeric
df['house_size'] = pd.to_numeric(df['house_size'], errors='coerce')



df['priceSqFt'] = df['price'] / df['house_size']


In [9]:
df.house_size.head( )           

0     400
1     400
2     500
3    1020
4     810
Name: house_size, dtype: int64

In [10]:
df.price.head( )

0    22000
1    20000
2     8500
3    48000
4    20000
Name: price, dtype: int64

In [11]:
df['description'].fillna("No description available", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['description'].fillna("No description available", inplace=True)


In [12]:
df.isnull().sum()

house_type          0
house_size          0
location            0
city                0
latitude            0
longitude           0
price               0
currency            0
numBathrooms        0
numBalconies        0
priceSqFt           0
verificationDate    0
description         0
SecurityDeposit     0
Status              0
City                0
dtype: int64

In [13]:
df.head()

Unnamed: 0,house_type,house_size,location,city,latitude,longitude,price,currency,numBathrooms,numBalconies,priceSqFt,verificationDate,description,SecurityDeposit,Status,City
0,1 RK Studio Apartment,400,Kalkaji,Delhi,28.545561,77.254349,22000,INR,1.0,0.0,55.0,Posted a day ago,"Fully furnished, loaded with amenities & gadge...",No Deposit,Furnished,Delhi
1,1 RK Studio Apartment,400,Mansarover Garden,Delhi,28.643259,77.132828,20000,INR,1.0,0.0,50.0,Posted 9 days ago,Here is an excellent 1 BHK Independent Floor a...,No Deposit,Furnished,Delhi
2,2 BHK Independent Floor,500,Uttam Nagar,Delhi,28.618677,77.053352,8500,INR,1.0,0.0,17.0,Posted 12 days ago,"Zero Brokerage.\n\n2 Room set, Govt bijali Met...",No Deposit,Semi-Furnished,Delhi
3,3 BHK Independent House,1020,Model Town,Delhi,28.712898,77.18,48000,INR,3.0,0.0,47.058824,Posted a year ago,Itâs a 3 bhk independent house situated in M...,No Deposit,Furnished,Delhi
4,2 BHK Apartment,810,Sector 13 Rohini,Delhi,28.723539,77.131424,20000,INR,2.0,0.0,24.691358,Posted a year ago,Well designed 2 bhk multistorey apartment is a...,No Deposit,Unfurnished,Delhi


In [14]:
# Show all columns with object (non-numeric) datatype
categorical_cols = df.select_dtypes(include='object').columns.tolist()
print("Categorical columns:", categorical_cols)

# Show unique values for each to inspect what needs encoding
for col in categorical_cols:
    print(f"\nColumn: {col}")
    print(df[col].unique()[:10])  # Show up to 10 unique values for preview


Categorical columns: ['house_type', 'location', 'city', 'currency', 'verificationDate', 'description', 'SecurityDeposit', 'Status', 'City']

Column: house_type
['1 RK Studio Apartment ' '2 BHK Independent Floor '
 '3 BHK Independent House ' '2 BHK Apartment ' '3 BHK Apartment '
 '3 BHK Independent Floor ' '4 BHK Independent Floor '
 '1 BHK Independent Floor ' '1 BHK Apartment ' '8 BHK Independent Floor ']

Column: location
['Kalkaji' 'Mansarover Garden' 'Uttam Nagar' 'Model Town'
 'Sector 13 Rohini' 'DLF Farms' 'laxmi nagar' 'Swasthya Vihar' 'Janakpuri'
 'Pitampura']

Column: city
['Delhi' 'Mumbai' 'Hisar' 'Pune']

Column: currency
['INR']

Column: verificationDate
['Posted a day ago' 'Posted 9 days ago' 'Posted 12 days ago'
 'Posted a year ago' 'Posted 2 years ago' 'Posted 3 years ago'
 'Posted 2 months ago' 'Posted a month ago' 'Posted 17 days ago'
 'Posted 13 days ago']

Column: description
['Fully furnished, loaded with amenities & gadgets- 1RK + lobby Set, with all facilities, Par

In [15]:
df.info()
print("\n\nSample rows:")
display(df.sample(5))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13910 entries, 0 to 13909
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   house_type        13910 non-null  object 
 1   house_size        13910 non-null  int64  
 2   location          13910 non-null  object 
 3   city              13910 non-null  object 
 4   latitude          13910 non-null  float64
 5   longitude         13910 non-null  float64
 6   price             13910 non-null  int64  
 7   currency          13910 non-null  object 
 8   numBathrooms      13910 non-null  float64
 9   numBalconies      13910 non-null  float64
 10  priceSqFt         13910 non-null  float64
 11  verificationDate  13910 non-null  object 
 12  description       13910 non-null  object 
 13  SecurityDeposit   13910 non-null  object 
 14  Status            13910 non-null  object 
 15  City              13910 non-null  object 
dtypes: float64(5), int64(2), object(9)
memor

Unnamed: 0,house_type,house_size,location,city,latitude,longitude,price,currency,numBathrooms,numBalconies,priceSqFt,verificationDate,description,SecurityDeposit,Status,City
10182,2 BHK Apartment,1024,Wakad,Pune,18.60643,73.753082,30000,INR,2.0,0.0,29.296875,Posted 15 days ago,This spacious 2 bhk multistorey apartment is a...,No Deposit,Semi-Furnished,Pune
12691,2 BHK Apartment,1250,Vishrantwadi,Pune,18.569496,73.8787,20000,INR,2.0,0.0,16.0,Posted 2 months ago,"2 BHK, newly distempered, sliding windows, mos...",50000,Unfurnished,Pune
10583,1 BHK Apartment,630,Kargil Vijay Nagar,Pune,18.55928,73.920441,16000,INR,1.0,0.0,25.396825,Posted 23 days ago,A 1 bhk property is available for rental in An...,No Deposit,Unfurnished,Pune
2371,2 BHK Apartment,1250,Sector 11 Dwarka,Delhi,28.592112,77.04821,22000,INR,2.0,3.0,17.6,Posted 3 years ago,No description available,44000,Semi-Furnished,Delhi
1794,1 RK Studio Apartment,500,Uttam Nagar,Delhi,28.620892,77.057983,6499,INR,1.0,0.0,12.998,Posted 6 months ago,This Independent Floor can be a comfortable an...,No Deposit,Furnished,Delhi


In [16]:
# Standardize and compare the two columns
df['city'] = df['city'].str.lower().str.strip()
df['City'] = df['City'].str.lower().str.strip()

# Drop 'City' if both are the same
if (df['city'] == df['City']).all():
    df.drop('City', axis=1, inplace=True)
    print("✅ Dropped 'City' column as it duplicates 'city'")
else:
    print("'city' and 'City' contain mismatched values. Manual inspection needed.")


'city' and 'City' contain mismatched values. Manual inspection needed.


In [17]:
mismatch = df[df['city'].str.lower().str.strip() != df['City'].str.lower().str.strip()]
display(mismatch[['city', 'City']])
print(f"⚠️ Total mismatches found: {len(mismatch)}")


Unnamed: 0,city,City
7724,hisar,mumbai
7842,hisar,mumbai
8548,hisar,mumbai
8559,hisar,mumbai
8849,hisar,mumbai
8945,hisar,mumbai
9409,hisar,mumbai
9420,hisar,mumbai


⚠️ Total mismatches found: 8


In [18]:
# List of index numbers to update
indices_to_update = [7724, 7842, 8548, 8559, 8849, 8945, 9409, 9420]

# Update 'City' column to 'Delhi' for these indices
df.loc[indices_to_update, 'City'] = 'Delhi'

# Drop the 'city' column
df.drop(columns=['city'], inplace=True)


In [19]:
for col in ['house_type', 'location', 'currency', 'Status', 'City']:
    print(f"{col}: {df[col].nunique()} unique values")
    print(df[col].value_counts().head(), '\n')


house_type: 34 unique values
house_type
2 BHK Apartment             4385
1 BHK Apartment             2622
3 BHK Apartment             1837
3 BHK Independent Floor     1571
4 BHK Independent Floor      786
Name: count, dtype: int64 

location: 702 unique values
location
Wagholi         743
Andheri West    362
Thane West      287
Andheri East    284
Kharghar        255
Name: count, dtype: int64 

currency: 1 unique values
currency
INR    13910
Name: count, dtype: int64 

Status: 3 unique values
Status
Unfurnished       5613
Semi-Furnished    5548
Furnished         2749
Name: count, dtype: int64 

City: 4 unique values
City
delhi     5000
mumbai    4992
pune      3910
Delhi        8
Name: count, dtype: int64 



In [20]:
df['SecurityDeposit'].unique()[:10]


array(['No Deposit', ' 13,000', ' 70,000', ' 56,000', ' 46,000',
       ' 50,000', ' 66,000', ' 36,000', ' 65,41,919', ' 60,10,155'],
      dtype=object)

In [21]:
# Drop unnecessary columns
df.drop(['currency', 'verificationDate'], axis=1, inplace=True)

# Confirm the changes
print("Remaining columns:", df.columns.tolist())


Remaining columns: ['house_type', 'house_size', 'location', 'latitude', 'longitude', 'price', 'numBathrooms', 'numBalconies', 'priceSqFt', 'description', 'SecurityDeposit', 'Status', 'City']


In [22]:
def clean_deposit(value):
    if isinstance(value, str):
        value = value.strip()
        if value.lower() == 'no deposit':
            return 0
        return int(value.replace(',', ''))
    return value

# Apply cleaning
df['SecurityDeposit'] = df['SecurityDeposit'].apply(clean_deposit)

# Check results
print(df['SecurityDeposit'].unique()[:10])
print(df['SecurityDeposit'].dtype)


[      0   13000   70000   56000   46000   50000   66000   36000 6541919
 6010155]
int64


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13910 entries, 0 to 13909
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   house_type       13910 non-null  object 
 1   house_size       13910 non-null  int64  
 2   location         13910 non-null  object 
 3   latitude         13910 non-null  float64
 4   longitude        13910 non-null  float64
 5   price            13910 non-null  int64  
 6   numBathrooms     13910 non-null  float64
 7   numBalconies     13910 non-null  float64
 8   priceSqFt        13910 non-null  float64
 9   description      13910 non-null  object 
 10  SecurityDeposit  13910 non-null  int64  
 11  Status           13910 non-null  object 
 12  City             13910 non-null  object 
dtypes: float64(5), int64(3), object(5)
memory usage: 1.4+ MB


In [24]:
print(df['price'].unique()[:20])
print(df['price'].dtype)


[ 22000  20000   8500  48000  11000  35000  39000  90000  10000  15000
  24000  59000  45000  32500 220000  43000  30000  68000  55000  40000]
int64


## grouping the common places into areas and making it suitable for OHE

In [25]:
location_counts = df['location'].value_counts()
print(location_counts.head(10))
print(location_counts.tail(10))
print(len(location_counts))

location
Wagholi           743
Andheri West      362
Thane West        287
Andheri East      284
Kharghar          255
Dhanori           198
Ghansoli          196
Wakad             195
Defence Colony    189
Powai             175
Name: count, dtype: int64
location
Agalambe                  1
Ganj Peth                 1
Talegaon                  1
Pratik Nagar Mohanwadi    1
Shewalewadi               1
 Kharadi                  1
Kasarwadi                 1
Taljai Temple Road        1
Dighi Gaonthan            1
New DP Road               1
Name: count, dtype: int64
702


In [26]:
top_locations = df['location'].value_counts().nlargest(50).index

# Replace all other locations with "Other"
df['location'] = df['location'].apply(lambda x: x if x in top_locations else 'Other')


In [27]:
print(df['house_type'].value_counts())
print(df['house_type'].isnull().sum())


house_type
2 BHK Apartment              4385
1 BHK Apartment              2622
3 BHK Apartment              1837
3 BHK Independent Floor      1571
4 BHK Independent Floor       786
2 BHK Independent Floor       542
1 RK Studio Apartment         434
4 BHK Apartment               297
4 BHK Villa                   273
1 BHK Independent Floor       255
5 BHK Villa                   185
5 BHK Independent Floor       180
5 BHK Independent House       153
4 BHK Independent House       128
1 BHK Independent House        69
3 BHK Villa                    47
3 BHK Independent House        29
2 BHK Independent House        28
5 BHK Apartment                27
2 BHK Villa                    23
6 BHK Apartment                 6
6 BHK Independent Floor         6
6 BHK Villa                     5
1 BHK Villa                     5
6 BHK penthouse                 3
9 BHK Independent House         3
8 BHK Independent Floor         2
7 BHK Independent Floor         2
10 BHK Independent House        2
8 B

In [28]:
# Extract number of bedrooms (e.g., 1, 2, 3...) safely
df['bhk'] = df['house_type'].str.extract(r'(\d+)\s*BHK')[0].astype('Int64')

# Extract property type (e.g., Apartment, Villa, etc.)
df['property_type'] = df['house_type'].str.extract(r'BHK\s+(.*)')[0]

# Drop the original column if not needed
df.drop(columns=['house_type'], inplace=True)


In [29]:
print(df['bhk'].value_counts(dropna=False).sort_index())
print(df['property_type'].value_counts(dropna=False))


bhk
1       2951
2       4978
3       3484
4       1484
5        545
6         21
7          3
8          4
9          3
10         2
12         1
<NA>     434
Name: count, dtype: Int64
property_type
Apartment             9174
Independent Floor     3344
Villa                  539
NaN                    434
Independent House      416
penthouse                3
Name: count, dtype: int64


In [30]:
df['numBathrooms'].value_counts(dropna=False).sort_index()


numBathrooms
1.0     2755
2.0     6205
3.0     2846
4.0     1946
5.0      117
6.0       26
7.0        6
8.0        5
9.0        3
10.0       1
Name: count, dtype: int64

In [31]:
df = df[df['numBathrooms'] < 9].copy()


In [32]:
df['numBathrooms'].value_counts(dropna=False).sort_index()


numBathrooms
1.0    2755
2.0    6205
3.0    2846
4.0    1946
5.0     117
6.0      26
7.0       6
8.0       5
Name: count, dtype: int64

In [33]:
df['numBalconies'].value_counts(dropna=False).sort_index()


numBalconies
0.0    8617
1.0    1561
2.0    3204
3.0     369
4.0     133
5.0      16
6.0       5
8.0       1
Name: count, dtype: int64

In [34]:
df = df[df['numBalconies'] <= 6]


In [35]:
df['numBalconies'].value_counts().sort_index()


numBalconies
0.0    8617
1.0    1561
2.0    3204
3.0     369
4.0     133
5.0      16
6.0       5
Name: count, dtype: int64

In [36]:
df['priceSqFt'].describe()


count    13905.000000
mean        47.414322
std         39.476161
min          4.000000
25%         20.576132
50%         32.941176
75%         62.500000
max        382.024843
Name: priceSqFt, dtype: float64

4sq home baba re 

In [37]:
df['priceSqFt'].value_counts(dropna=False).head(20)


priceSqFt
20.000000     248
25.000000     158
105.396359    149
33.333333     141
50.000000     121
30.000000     107
40.000000      95
26.666667      94
66.666667      91
22.222222      87
51.053596      85
16.666667      84
68.014247      84
23.333333      83
105.399860     67
100.000000     62
55.555556      58
10.000000      57
51.055292      57
46.160405      56
Name: count, dtype: int64

In [38]:
df.drop(columns=['description'], inplace=True)


In [39]:
df['location'].value_counts(dropna=False)


location
Other                 6470
Wagholi                743
Andheri West           362
Thane West             287
Andheri East           284
Kharghar               255
Dhanori                198
Ghansoli               196
Wakad                  195
Defence Colony         189
Powai                  175
Chembur                158
Hinjewadi              157
Borivali East          153
Greater kailash 1      153
Santacruz East         151
Chattarpur             145
Kharadi                145
Vasant Vihar           145
Lohegaon               142
Jor bagh               125
Safdarjung Enclave     118
Baner                  117
Hadapsar               116
Kalyan West            115
Goregaon East          114
Mira Road East         112
Saket                  111
Dombivali              110
Goregaon West          108
Mulund West            108
Hauz Khas              107
Greater Kailash        107
Panchsheel Park        105
Golf Links             105
Kondhwa                103
Bandra West        

In [40]:
# Step 1: Group rare locations as 'Other'
location_counts = df['location'].value_counts()
rare_locations = location_counts[location_counts < 50].index
df['location'] = df['location'].replace(rare_locations, 'Other')

# Step 2: Label encode
from sklearn.preprocessing import LabelEncoder
le_location = LabelEncoder()
df['location'] = le_location.fit_transform(df['location'])


In [41]:
df['latitude'].describe()
df['latitude'].isna().sum()


np.int64(0)

In [42]:
print("Min:", df['latitude'].min())
print("Max:", df['latitude'].max())


Min: 17.46821976
Max: 31.10658646


In [43]:
print(df['longitude'].isnull().sum())
print("Min:", df['longitude'].min())
print("Max:", df['longitude'].max())


0
Min: 18.49789619
Max: 91.79213715


In [44]:
swapped = df[
    (df['latitude'].between(68, 97)) &
    (df['longitude'].between(6, 37))
]
print("Potentially swapped rows:", len(swapped))
swapped.head()


Potentially swapped rows: 0


Unnamed: 0,house_size,location,latitude,longitude,price,numBathrooms,numBalconies,priceSqFt,SecurityDeposit,Status,City,bhk,property_type


In [45]:
df = df[df['longitude'].between(68, 97)]


In [46]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 13900 entries, 0 to 13909
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   house_size       13900 non-null  int64  
 1   location         13900 non-null  int64  
 2   latitude         13900 non-null  float64
 3   longitude        13900 non-null  float64
 4   price            13900 non-null  int64  
 5   numBathrooms     13900 non-null  float64
 6   numBalconies     13900 non-null  float64
 7   priceSqFt        13900 non-null  float64
 8   SecurityDeposit  13900 non-null  int64  
 9   Status           13900 non-null  object 
 10  City             13900 non-null  object 
 11  bhk              13466 non-null  Int64  
 12  property_type    13466 non-null  object 
dtypes: Int64(1), float64(5), int64(4), object(3)
memory usage: 1.5+ MB


In [47]:
df[['bhk', 'property_type']].isnull().sum()


bhk              434
property_type    434
dtype: int64

In [48]:
df[df['bhk'].isnull() | df['property_type'].isnull()]


Unnamed: 0,house_size,location,latitude,longitude,price,numBathrooms,numBalconies,priceSqFt,SecurityDeposit,Status,City,bhk,property_type
0,400,37,28.545561,77.254349,22000,1.0,0.0,55.000000,0,Furnished,delhi,,
1,400,37,28.643259,77.132828,20000,1.0,0.0,50.000000,0,Furnished,delhi,,
29,300,37,28.653690,77.169159,15500,1.0,0.0,51.666667,0,Furnished,delhi,,
51,400,21,28.558958,77.204079,20000,1.0,0.0,50.000000,0,Semi-Furnished,delhi,,
67,400,37,28.560387,77.212837,14000,1.0,0.0,35.000000,0,Furnished,delhi,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13856,400,37,18.657989,73.774261,8500,1.0,1.0,21.250000,20000,Semi-Furnished,pune,,
13859,250,37,18.657471,73.768379,6000,1.0,1.0,24.000000,10000,Semi-Furnished,pune,,
13860,350,37,18.657471,73.768379,8500,2.0,1.0,24.285714,15000,Semi-Furnished,pune,,
13865,400,37,18.649200,73.765778,6000,1.0,1.0,15.000000,15000,Unfurnished,pune,,


In [49]:
df = df.dropna(subset=['bhk', 'property_type'])


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13466 entries, 2 to 13909
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   house_size       13466 non-null  int64  
 1   location         13466 non-null  int64  
 2   latitude         13466 non-null  float64
 3   longitude        13466 non-null  float64
 4   price            13466 non-null  int64  
 5   numBathrooms     13466 non-null  float64
 6   numBalconies     13466 non-null  float64
 7   priceSqFt        13466 non-null  float64
 8   SecurityDeposit  13466 non-null  int64  
 9   Status           13466 non-null  object 
 10  City             13466 non-null  object 
 11  bhk              13466 non-null  Int64  
 12  property_type    13466 non-null  object 
dtypes: Int64(1), float64(5), int64(4), object(3)
memory usage: 1.5+ MB


In [51]:
df['priceSqFt'].describe()


count    13466.000000
mean        47.468209
std         39.701155
min          4.000000
25%         20.519836
50%         32.577849
75%         62.500000
max        382.024843
Name: priceSqFt, dtype: float64

In [52]:
df['priceSqFt'].isnull().sum()


np.int64(0)

In [53]:
df['Status'].describe()

count           13466
unique              3
top       Unfurnished
freq             5517
Name: Status, dtype: object

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13466 entries, 2 to 13909
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   house_size       13466 non-null  int64  
 1   location         13466 non-null  int64  
 2   latitude         13466 non-null  float64
 3   longitude        13466 non-null  float64
 4   price            13466 non-null  int64  
 5   numBathrooms     13466 non-null  float64
 6   numBalconies     13466 non-null  float64
 7   priceSqFt        13466 non-null  float64
 8   SecurityDeposit  13466 non-null  int64  
 9   Status           13466 non-null  object 
 10  City             13466 non-null  object 
 11  bhk              13466 non-null  Int64  
 12  property_type    13466 non-null  object 
dtypes: Int64(1), float64(5), int64(4), object(3)
memory usage: 1.5+ MB


## TEST TRAIN SPLIT

In [55]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Backup your target
y = df['price']

# Drop target column
X = df.drop(columns=['price'])


In [56]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [57]:
categorical_cols = ['Status', 'City', 'property_type']
numerical_cols = X.select_dtypes(include=['int64', 'float64', 'Int64']).columns.tolist()

# Remove 'price' and categorical ones from numeric list if present
numerical_cols = [col for col in numerical_cols if col not in categorical_cols]


In [58]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit only on train categorical columns
ohe.fit(X_train[categorical_cols])

# Transform both
X_train_cat = pd.DataFrame(
    ohe.transform(X_train[categorical_cols]),
    columns=ohe.get_feature_names_out(categorical_cols),
    index=X_train.index
)

X_test_cat = pd.DataFrame(
    ohe.transform(X_test[categorical_cols]),
    columns=ohe.get_feature_names_out(categorical_cols),
    index=X_test.index
)


In [59]:
X_train_final = pd.concat([X_train[numerical_cols], X_train_cat], axis=1)
X_test_final = pd.concat([X_test[numerical_cols], X_test_cat], axis=1)


In [60]:
X_train_final

Unnamed: 0,house_size,location,latitude,longitude,numBathrooms,numBalconies,priceSqFt,SecurityDeposit,bhk,Status_Furnished,...,Status_Unfurnished,City_Delhi,City_delhi,City_mumbai,City_pune,property_type_Apartment,property_type_Independent Floor,property_type_Independent House,property_type_Villa,property_type_penthouse
13560,960,50,18.599226,73.763298,2.0,1.0,15.625000,40000,2,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
11812,1102,33,18.620728,73.915016,2.0,0.0,18.148820,60000,2,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
7854,2000,37,18.958918,72.812401,4.0,2.0,150.000000,1800000,3,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
5767,990,35,19.283638,72.873894,2.0,0.0,22.222222,0,2,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
12320,1034,11,21.875740,83.992943,2.0,0.0,14.506770,0,2,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5352,650,45,19.227955,72.968185,2.0,0.0,49.230769,0,2,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
13858,650,37,18.644442,73.748817,1.0,0.0,16.153846,30000,1,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5557,1100,16,19.153334,72.885765,3.0,0.0,34.545455,0,3,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
875,1800,37,28.538738,77.249092,3.0,0.0,55.555556,0,3,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [61]:
X_train_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10772 entries, 13560 to 7485
Data columns (total 21 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   house_size                        10772 non-null  int64  
 1   location                          10772 non-null  int64  
 2   latitude                          10772 non-null  float64
 3   longitude                         10772 non-null  float64
 4   numBathrooms                      10772 non-null  float64
 5   numBalconies                      10772 non-null  float64
 6   priceSqFt                         10772 non-null  float64
 7   SecurityDeposit                   10772 non-null  int64  
 8   bhk                               10772 non-null  Int64  
 9   Status_Furnished                  10772 non-null  float64
 10  Status_Semi-Furnished             10772 non-null  float64
 11  Status_Unfurnished                10772 non-null  float64
 12  City_D

## Random Forest

In [73]:
import pandas as pd
import numpy as np
import time
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from datetime import datetime

# === Load Data ===
data_path = r"D:\Programming\python\Python AI ML projects\House_Price_Prediction\backend\data\processed\combined_data.csv"
df = pd.read_csv(data_path)

# === Separate Features & Target ===
X = df.drop(columns=['price'])
y = df['price']

# === Split before Encoding ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === Identify column types ===
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# === Preprocessing pipeline ===
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])

# === Combine preprocessing and model into pipeline ===
rf_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1))
])

# === Train the model ===
start_time = time.time()
rf_model.fit(X_train, y_train)
train_time = time.time() - start_time

# === Predict ===
train_preds = rf_model.predict(X_train)
test_preds = rf_model.predict(X_test)

# === Metrics Function ===
def get_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    adj_r2 = 1 - (1 - r2) * (len(y_true) - 1) / (len(y_true) - X_train.shape[1] - 1)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return mae, rmse, r2, adj_r2, mape

# === Evaluate ===
train_metrics = get_metrics(y_train, train_preds)
test_metrics = get_metrics(y_test, test_preds)

# === Print Results ===
def print_metrics(name, metrics):
    print(f"📊 Evaluation on {name} Set:")
    print(f"MAE: {metrics[0]:.2f}")
    print(f"RMSE: {metrics[1]:.2f}")
    print(f"R² Score: {metrics[2]:.6f}")
    print(f"Adjusted R²: {metrics[3]:.6f}")
    print(f"MAPE: {metrics[4]:.2f} %\n")

print_metrics("Train", train_metrics)
print_metrics("Test", test_metrics)

# === Save Logs ===
log_folder = os.path.join("backend", "logs")
os.makedirs(log_folder, exist_ok=True)
log_file = os.path.join(log_folder, "random_forest_metrics.log")

with open(log_file, "a") as f:
    f.write(f"\n=== Random Forest Regressor - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===\n")
    f.write(f"Training Time: {train_time:.2f} sec\n")
    f.write("Train Metrics:\n")
    f.write(f"  MAE: {train_metrics[0]:.2f}\n")
    f.write(f"  RMSE: {train_metrics[1]:.2f}\n")
    f.write(f"  R²: {train_metrics[2]:.6f}\n")
    f.write(f"  Adjusted R²: {train_metrics[3]:.6f}\n")
    f.write(f"  MAPE: {train_metrics[4]:.2f} %\n")
    f.write("Test Metrics:\n")
    f.write(f"  MAE: {test_metrics[0]:.2f}\n")
    f.write(f"  RMSE: {test_metrics[1]:.2f}\n")
    f.write(f"  R²: {test_metrics[2]:.6f}\n")
    f.write(f"  Adjusted R²: {test_metrics[3]:.6f}\n")
    f.write(f"  MAPE: {test_metrics[4]:.2f} %\n")
    f.write("===============================================\n")

print(f"📁 Metrics logged to: {log_file}")




📊 Evaluation on Train Set:
MAE: 6437.15
RMSE: 22385.77
R² Score: 0.986640
Adjusted R²: 0.986620
MAPE: 6.86 %

📊 Evaluation on Test Set:
MAE: 18000.93
RMSE: 54352.98
R² Score: 0.920226
Adjusted R²: 0.919764
MAPE: 16.63 %

📁 Metrics logged to: backend\logs\random_forest_metrics.log




In [88]:
%pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.3-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.3-py3-none-win_amd64.whl (149.9 MB)
   ---------------------------------------- 0.0/149.9 MB ? eta -:--:--
   ---------------------------------------- 0.8/149.9 MB 6.7 MB/s eta 0:00:23
   ---------------------------------------- 1.8/149.9 MB 4.4 MB/s eta 0:00:34
    --------------------------------------- 2.4/149.9 MB 3.7 MB/s eta 0:00:40
    --------------------------------------- 3.1/149.9 MB 4.1 MB/s eta 0:00:36
   - -------------------------------------- 3.9/149.9 MB 3.9 MB/s eta 0:00:38
   - -------------------------------------- 4.7/149.9 MB 3.8 MB/s eta 0:00:39
   - -------------------------------------- 5.2/149.9 MB 3.7 MB/s eta 0:00:40
   - -------------------------------------- 6.0/149.9 MB 3.7 MB/s eta 0:00:40
   - -------------------------------------- 6.8/149.9 MB 3.6 MB/s eta 0:00:40
   - -------------------------------------- 7.3/149.9 MB 3.6 MB/s eta 0:00:40
 


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\HP\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


### XG-Boost

In [89]:
import pandas as pd
import numpy as np
import time
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from xgboost import XGBRegressor

# === Load Data ===
data_path = r"D:\Programming\python\Python AI ML projects\House_Price_Prediction\backend\data\processed\combined_data.csv"
df = pd.read_csv(data_path)

# === Separate Features & Target ===
X = df.drop(columns=['price'])
y = df['price']

# === Split before Encoding ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === Identify column types ===
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# === Preprocessing pipeline ===
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])

# === Combine preprocessing and model into pipeline ===
xgb_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42, n_jobs=-1))
])

# === Train the model ===
start_time = time.time()
xgb_model.fit(X_train, y_train)
train_time = time.time() - start_time

# === Predict ===
train_preds = xgb_model.predict(X_train)
test_preds = xgb_model.predict(X_test)

# === Metrics Function ===
def get_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    adj_r2 = 1 - (1 - r2) * (len(y_true) - 1) / (len(y_true) - X_train.shape[1] - 1)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return mae, rmse, r2, adj_r2, mape

# === Evaluate ===
train_metrics = get_metrics(y_train, train_preds)
test_metrics = get_metrics(y_test, test_preds)

# === Print Results ===
def print_metrics(name, metrics):
    print(f"📊 Evaluation on {name} Set:")
    print(f"MAE: {metrics[0]:.2f}")
    print(f"RMSE: {metrics[1]:.2f}")
    print(f"R² Score: {metrics[2]:.6f}")
    print(f"Adjusted R²: {metrics[3]:.6f}")
    print(f"MAPE: {metrics[4]:.2f} %\n")

print_metrics("Train", train_metrics)
print_metrics("Test", test_metrics)

# === Save Logs ===
log_folder = os.path.join("backend", "logs")
os.makedirs(log_folder, exist_ok=True)
log_file = os.path.join(log_folder, "xgboost_metrics.log")

with open(log_file, "a") as f:
    f.write(f"\n=== XGBoost Regressor - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===\n")
    f.write(f"Training Time: {train_time:.2f} sec\n")
    f.write("Train Metrics:\n")
    f.write(f"  MAE: {train_metrics[0]:.2f}\n")
    f.write(f"  RMSE: {train_metrics[1]:.2f}\n")
    f.write(f"  R²: {train_metrics[2]:.6f}\n")
    f.write(f"  Adjusted R²: {train_metrics[3]:.6f}\n")
    f.write(f"  MAPE: {train_metrics[4]:.2f} %\n")
    f.write("Test Metrics:\n")
    f.write(f"  MAE: {test_metrics[0]:.2f}\n")
    f.write(f"  RMSE: {test_metrics[1]:.2f}\n")
    f.write(f"  R²: {test_metrics[2]:.6f}\n")
    f.write(f"  Adjusted R²: {test_metrics[3]:.6f}\n")
    f.write(f"  MAPE: {test_metrics[4]:.2f} %\n")
    f.write("===============================================\n")

print(f"📁 Metrics logged to: {log_file}")




📊 Evaluation on Train Set:
MAE: 20148.41
RMSE: 39526.67
R² Score: 0.958346
Adjusted R²: 0.958286
MAPE: 37.98 %

📊 Evaluation on Test Set:
MAE: 23284.77
RMSE: 54940.66
R² Score: 0.918492
Adjusted R²: 0.918020
MAPE: 36.20 %

📁 Metrics logged to: backend\logs\xgboost_metrics.log




In [93]:
%pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   --------------------- ------------------ 0.8/1.5 MB 6.7 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 4.5 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\HP\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


### LightGBM ,SVR ,KNN , AdaBoost,

In [94]:
import pandas as pd
import numpy as np
import time
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from lightgbm import LGBMRegressor
from datetime import datetime

# === Load Data ===
data_path = r"D:\Programming\python\Python AI ML projects\House_Price_Prediction\backend\data\processed\combined_data.csv"
df = pd.read_csv(data_path)

# === Separate Features & Target ===
X = df.drop(columns=['price'])
y = df['price']

# === Split before Encoding ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === Identify column types ===
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# === Preprocessing pipeline ===
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])

# === Combine preprocessing and model into pipeline ===
lgbm_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1))
])

# === Train the model ===
start_time = time.time()
lgbm_model.fit(X_train, y_train)
train_time = time.time() - start_time

# === Predict ===
train_preds = lgbm_model.predict(X_train)
test_preds = lgbm_model.predict(X_test)

# === Metrics Function ===
def get_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    adj_r2 = 1 - (1 - r2) * (len(y_true) - 1) / (len(y_true) - X_train.shape[1] - 1)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return mae, rmse, r2, adj_r2, mape

# === Evaluate ===
train_metrics = get_metrics(y_train, train_preds)
test_metrics = get_metrics(y_test, test_preds)

# === Print Results ===
def print_metrics(name, metrics):
    print(f"📊 Evaluation on {name} Set:")
    print(f"MAE: {metrics[0]:.2f}")
    print(f"RMSE: {metrics[1]:.2f}")
    print(f"R² Score: {metrics[2]:.6f}")
    print(f"Adjusted R²: {metrics[3]:.6f}")
    print(f"MAPE: {metrics[4]:.2f} %\n")

print_metrics("Train", train_metrics)
print_metrics("Test", test_metrics)

# === Save Logs ===
log_folder = os.path.join("backend", "logs")
os.makedirs(log_folder, exist_ok=True)
log_file = os.path.join(log_folder, "lightgbm_metrics.log")

with open(log_file, "a", encoding="utf-8") as f:
    f.write(f"\n=== LightGBM Regressor - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===\n")
    f.write(f"Training Time: {train_time:.2f} sec\n")
    f.write("Train Metrics:\n")
    f.write(f"  MAE: {train_metrics[0]:.2f}\n")
    f.write(f"  RMSE: {train_metrics[1]:.2f}\n")
    f.write(f"  R²: {train_metrics[2]:.6f}\n")
    f.write(f"  Adjusted R²: {train_metrics[3]:.6f}\n")
    f.write(f"  MAPE: {train_metrics[4]:.2f} %\n")
    f.write("Test Metrics:\n")
    f.write(f"  MAE: {test_metrics[0]:.2f}\n")
    f.write(f"  RMSE: {test_metrics[1]:.2f}\n")
    f.write(f"  R²: {test_metrics[2]:.6f}\n")
    f.write(f"  Adjusted R²: {test_metrics[3]:.6f}\n")
    f.write(f"  MAPE: {test_metrics[4]:.2f} %\n")
    f.write("===============================================\n")

print(f"📁 Metrics logged to: {log_file}")




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002383 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1198
[LightGBM] [Info] Number of data points in the train set: 11128, number of used features: 338
[LightGBM] [Info] Start training from score 107889.131919




📊 Evaluation on Train Set:
MAE: 19266.96
RMSE: 47644.88
R² Score: 0.939479
Adjusted R²: 0.939391
MAPE: 29.55 %

📊 Evaluation on Test Set:
MAE: 23810.24
RMSE: 59862.01
R² Score: 0.903235
Adjusted R²: 0.902675
MAPE: 29.22 %

📁 Metrics logged to: backend\logs\lightgbm_metrics.log


### LSTM 


In [95]:
%pip install tensorflow scikit-learn pandas


Collecting tensorflow
  Downloading tensorflow-2.19.0-cp310-cp310-win_amd64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3 (from tensorflow)
  Downloading protobuf-5.29.5-cp310-abi3-win_amd64.whl.metadata (592 bytes)
Collecting te

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
grpcio-status 1.74.0 requires protobuf<7.0.0,>=6.31.1, but you have protobuf 5.29.5 which is incompatible.

[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\HP\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [97]:
import pandas as pd
import numpy as np
import os
import time
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# === Load Data ===
df = pd.read_csv(r"D:\Programming\python\Python AI ML projects\House_Price_Prediction\backend\data\processed\combined_data.csv")

# === Separate Features & Target ===
X = df.drop(columns=['price'])
y = df['price']

# === Identify Columns ===
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# === Preprocessing ===
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])

# === Apply preprocessing ===
X_processed = preprocessor.fit_transform(X)

# === Train-Test Split ===
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# === Reshape for LSTM ===
X_train_reshaped = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_reshaped = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# === Build LSTM Model ===
model = Sequential([
    LSTM(64, activation='tanh', input_shape=(1, X_train.shape[1])),
    Dense(32, activation='relu'),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# === Train Model ===
start_time = time.time()
history = model.fit(
    X_train_reshaped, y_train,
    validation_split=0.1,
    epochs=100,
    batch_size=32,
    callbacks=[EarlyStopping(patience=10, restore_best_weights=True)],
    verbose=1
)
train_time = time.time() - start_time

# === Predict ===
train_preds = model.predict(X_train_reshaped).flatten()
test_preds = model.predict(X_test_reshaped).flatten()

# === Metrics ===
def get_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    adj_r2 = 1 - (1 - r2) * (len(y_true) - 1) / (len(y_true) - X_train.shape[1] - 1)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return mae, rmse, r2, adj_r2, mape

train_metrics = get_metrics(y_train, train_preds)
test_metrics = get_metrics(y_test, test_preds)

# === Print Metrics ===
def print_metrics(name, metrics):
    print(f"📊 {name} Set:")
    print(f"MAE: {metrics[0]:.2f}")
    print(f"RMSE: {metrics[1]:.2f}")
    print(f"R² Score: {metrics[2]:.6f}")
    print(f"Adjusted R²: {metrics[3]:.6f}")
    print(f"MAPE: {metrics[4]:.2f} %\n")

print_metrics("Train", train_metrics)
print_metrics("Test", test_metrics)

# === Save to Logs ===
log_folder = os.path.join("backend", "logs")
os.makedirs(log_folder, exist_ok=True)
log_file = os.path.join(log_folder, "lstm_metrics.log")

with open(log_file, "a", encoding="utf-8") as f:
    f.write(f"\n=== LSTM Regressor - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ===\n")
    f.write(f"Training Time: {train_time:.2f} sec\n")
    f.write("Train Metrics:\n")
    f.write(f"  MAE: {train_metrics[0]:.2f}\n")
    f.write(f"  RMSE: {train_metrics[1]:.2f}\n")
    f.write(f"  R²: {train_metrics[2]:.6f}\n")
    f.write(f"  Adjusted R²: {train_metrics[3]:.6f}\n")
    f.write(f"  MAPE: {train_metrics[4]:.2f} %\n")
    f.write("Test Metrics:\n")
    f.write(f"  MAE: {test_metrics[0]:.2f}\n")
    f.write(f"  RMSE: {test_metrics[1]:.2f}\n")
    f.write(f"  R²: {test_metrics[2]:.6f}\n")
    f.write(f"  Adjusted R²: {test_metrics[3]:.6f}\n")
    f.write(f"  MAPE: {test_metrics[4]:.2f} %\n")
    f.write("===============================================\n")

print(f"📁 Metrics logged to: {log_file}")


  super().__init__(**kwargs)


Epoch 1/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 30ms/step - loss: 49306693632.0000 - mae: 107513.3828 - val_loss: 47483310080.0000 - val_mae: 110236.8984
Epoch 2/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 29ms/step - loss: 49209606144.0000 - mae: 107062.9297 - val_loss: 47348293632.0000 - val_mae: 109619.7656
Epoch 3/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 29ms/step - loss: 49045319680.0000 - mae: 106293.5938 - val_loss: 47146905600.0000 - val_mae: 108695.9219
Epoch 4/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 29ms/step - loss: 48821796864.0000 - mae: 105229.5078 - val_loss: 46889144320.0000 - val_mae: 107502.9219
Epoch 5/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 30ms/step - loss: 48546009088.0000 - mae: 103907.9375 - val_loss: 46579593216.0000 - val_mae: 106058.1172
Epoch 6/100
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3