### Import Required Libraries

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import re
import joblib

### Data Preprocessing of train.csv

In [96]:
trained_data = pd.read_csv("train.csv")
print(trained_data.info())
trained_data.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10656 entries, 0 to 10655
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID            10656 non-null  int64  
 1   area_type     10656 non-null  object 
 2   availability  10656 non-null  object 
 3   location      10655 non-null  object 
 4   size          10642 non-null  object 
 5   society       6228 non-null   object 
 6   total_sqft    10656 non-null  object 
 7   bath          10591 non-null  float64
 8   balcony       10152 non-null  float64
 9   price         10656 non-null  float64
dtypes: float64(3), int64(1), object(6)
memory usage: 832.6+ KB
None


Unnamed: 0,ID,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0


In [98]:
missing_count = trained_data.isnull().sum()
print(missing_count)

ID                 0
area_type          0
availability       0
location           1
size              14
society         4428
total_sqft         0
bath              65
balcony          504
price              0
dtype: int64


In [100]:
trained_data = trained_data.dropna(subset=['location'])
trained_data = trained_data.dropna(subset=['size'])
trained_data['size'] = trained_data['size'].str.replace('Bedroom', 'BHK')
trained_data['size'].unique()

array(['2 BHK', '4 BHK', '3 BHK', '6 BHK', '1 BHK', '1 RK', '8 BHK',
       '7 BHK', '5 BHK', '11 BHK', '9 BHK', '27 BHK', '10 BHK', '19 BHK',
       '16 BHK', '43 BHK', '14 BHK', '12 BHK', '13 BHK'], dtype=object)

In [102]:
trained_data = trained_data.drop('society', axis=1)

In [104]:
unique_total_sqft = trained_data['total_sqft'].unique()
print(unique_total_sqft)

['1056' '2600' '1440' ... '2495' '3075' '1426']


In [106]:
non_integer_values = trained_data[trained_data['total_sqft'].apply(lambda x: not str(x).isdigit())]
print(non_integer_values['total_sqft'])
print(non_integer_values.shape[0])

30          2100 - 2850
44              1330.74
56          3010 - 3410
81          2957 - 3450
122         3067 - 8156
              ...      
10488         1.25Acres
10491    86.72Sq. Meter
10541         381 - 535
10569       1230 - 1490
10620       3630 - 3800
Name: total_sqft, Length: 263, dtype: object
263


In [108]:
# If there is any value with something like 1230 - 1490; then replace with a mid-value of the same
# also, convert from all other unit types to sqr feet
def process_total_sqft(value):
    value = str(value.strip().lower().replace(" ", ""))
    if '-' in value:
        # Split the string at the hyphen, strip spaces, and calculate the average
        parts = value.split('-')
        part1 = float(parts[0].strip())  # Trim spaces and convert to float
        part2 = float(parts[1].strip())  # Trim spaces and convert to float
        avg_value = (part1 + part2) / 2
        return str(avg_value)
    elif value.endswith(".0"):
        return value[:-2]
    elif (value.endswith("acres") or value.endswith("acre") or value.endswith("acr") or value.endswith("acrs")):
        value = value.replace("acres", "").replace("acre", "").replace("acr", "").replace("acrs", "")
        return str(float(value) * 43560)
    elif (value.endswith("sq.meter") or value.endswith("sq.mtr") or value.endswith("sqmeter") or value.endswith("sqmtr")):
        value = value.replace("sq.meter", "").replace("sq.mtr", "").replace("sqmeter", "").replace("sqmtr", "")
        return str(float(value) * 10.7639)
    elif (value.endswith("perch") or value.endswith("prch")):
        value = value.replace("perch", "").replace("prch", "")
        return str(float(value) * 272.25)
    elif (value.endswith("sq.yards") or value.endswith("sq.yard") or value.endswith("sqyrd") or value.endswith("sqyrds")):
        value = value.replace("sq.yards", "").replace("sq.yard", "").replace("sqyrd", "").replace("sqyrds", "")
        return str(float(value) * 9)
    elif (value.endswith("cents") or value.endswith("cent") or value.endswith("cnts") or value.endswith("cnt")):
        value = value.replace("cents", "").replace("cent", "").replace("cnts", "").replace("cnt", "")
        return str(float(value) * 435.6)
    elif (value.endswith("gunthas") or value.endswith("guntha") or value.endswith("guntas") or value.endswith("gunta")):
        value = value.replace("gunthas", "").replace("guntha", "").replace("guntas", "").replace("gunta", "")
        return str(float(value) * 1089)
    elif (value.endswith("grounds") or value.endswith("ground") or value.endswith("grnds") or value.endswith("grnd")):
        value = value.replace("grounds", "").replace("ground", "").replace("grnds", "").replace("grnd", "")
        return str(float(value) * 2400)
    else:
        return value

# Apply the function to the 'total_sqft' column
trained_data['total_sqft'] = trained_data['total_sqft'].apply(process_total_sqft)
trained_data['total_sqft'] = trained_data['total_sqft'].apply(process_total_sqft)
trained_data['total_sqft'] = trained_data['total_sqft'].astype(float).round().astype(int)

In [110]:
unique_total_sqft = trained_data['area_type'].unique()
print(unique_total_sqft)

['Super built-up  Area' 'Plot  Area' 'Built-up  Area' 'Carpet  Area']


In [112]:
grouped_size_data = trained_data.groupby('size').size()
filtered_bath_data = trained_data[trained_data['bath'].notnull()]
grouped_avg_bath = filtered_bath_data.groupby('size')['bath'].mean().astype(float).round().astype(int)

# Replace blank values in 'bath' with grouped average
def fill_blank_bath(row):
    if pd.isnull(row['bath']):  # Check if 'bath' is null
        return grouped_avg_bath.get(row['size'], row['bath'])  # Get the average for the group or keep as is
    return row['bath']
trained_data['bath'] = trained_data.apply(fill_blank_bath, axis=1)


In [114]:

grouped_balcony_data = trained_data.groupby('balcony').size()
filtered_balcony_data = trained_data[trained_data['balcony'].notnull()]
grouped_avg_balcony = filtered_balcony_data.groupby('size')['balcony'].mean().astype(float).round().astype(int)

# Replace blank values in 'bath' with grouped average
def fill_blank_balcony(row):
    if pd.isnull(row['balcony']):  # Check if 'bath' is null
        return grouped_avg_balcony.get(row['size'], row['balcony'])  # Get the average for the group or keep as is
    return row['balcony']
trained_data['balcony'] = trained_data.apply(fill_blank_balcony, axis=1)
average_balcony = round(trained_data['balcony'].mean())
trained_data['balcony'] = trained_data['balcony'].fillna(average_balcony)

In [116]:
missing_count = trained_data.isnull().sum()
print(missing_count)

ID              0
area_type       0
availability    0
location        0
size            0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64


### Merge Primary and Secondary data sets to train_data and process

In [118]:
dist_data = pd.read_csv('dist_from_city_centre.csv')
rent_data = pd.read_csv('avg_rent.csv')

In [120]:
dist_data.head(3)

Unnamed: 0,location,dist_from_city
0,Whitefield,17.3
1,Sarjapur Road,17.2
2,Electronic City,18.1


In [122]:
rent_data.head(3)

Unnamed: 0,location,avg_2bhk_rent
0,Krishnarajapura,11954
1,Sarjapur,45000
2,Whitefield Hope Farm Junction,26370


In [124]:
train_data = trained_data.merge(dist_data, on='location', how='left')
train_data = train_data.merge(rent_data, on='location', how='left')
train_data.head(3)

Unnamed: 0,ID,area_type,availability,location,size,total_sqft,bath,balcony,price,dist_from_city,avg_2bhk_rent
0,0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,19.3,11500.0
1,1,Plot Area,Ready To Move,Chikka Tirupathi,4 BHK,2600,5.0,3.0,120.0,34.6,
2,2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,12.9,19750.0


In [126]:
train_missing_count = train_data.isnull().sum()
print(train_missing_count)

ID                   0
area_type            0
availability         0
location             0
size                 0
total_sqft           0
bath                 0
balcony              0
price                0
dist_from_city    1023
avg_2bhk_rent     6983
dtype: int64


In [128]:
train_data.fillna(train_data['dist_from_city'].mean(), inplace=True)
train_data.fillna(train_data['avg_2bhk_rent'].mean(), inplace=True)

In [130]:
train_missing_count = train_data.isnull().sum()
print(train_missing_count)

ID                0
area_type         0
availability      0
location          0
size              0
total_sqft        0
bath              0
balcony           0
price             0
dist_from_city    0
avg_2bhk_rent     0
dtype: int64


In [132]:
train_data['price_per_sqft'] = train_data['price'] / train_data['total_sqft']
for column in ['area_type', 'availability', 'location', 'size']:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column])

In [134]:
train_data['price_per_sqft'] = train_data['price'] / train_data['total_sqft']
train_data['rent_to_price_ratio'] = train_data['avg_2bhk_rent'] / train_data['price']
# Separate features and target in training data
X_train = train_data.drop(columns=['price'])
y_train = train_data['price']

In [136]:
train_data.to_csv('trained.csv', index=False)

### Data Preprocessing of test_data

In [139]:
test_data = pd.read_csv("test.csv")
print(test_data.info())
test_data.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2664 entries, 0 to 2663
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID            2664 non-null   int64  
 1   area_type     2664 non-null   object 
 2   availability  2664 non-null   object 
 3   location      2664 non-null   object 
 4   size          2662 non-null   object 
 5   society       1590 non-null   object 
 6   total_sqft    2664 non-null   object 
 7   bath          2656 non-null   float64
 8   balcony       2559 non-null   float64
dtypes: float64(2), int64(1), object(6)
memory usage: 187.4+ KB
None


Unnamed: 0,ID,area_type,availability,location,size,society,total_sqft,bath,balcony
0,0,Super built-up Area,Ready To Move,Chamrajpet,2 BHK,,650,1.0,1.0
1,1,Super built-up Area,Ready To Move,7th Phase JP Nagar,3 BHK,SrncyRe,1370,2.0,1.0
2,2,Super built-up Area,Ready To Move,Whitefield,3 BHK,AjhalNa,1725,3.0,2.0


In [141]:
missing_count = test_data.isnull().sum()
print(missing_count)

ID                 0
area_type          0
availability       0
location           0
size               2
society         1074
total_sqft         0
bath               8
balcony          105
dtype: int64


In [143]:
test_data = test_data.dropna(subset=['location'])
test_data = test_data.dropna(subset=['size'])
test_data['size'] = test_data['size'].str.replace('Bedroom', 'BHK')
test_data['size'].unique()

array(['2 BHK', '3 BHK', '1 BHK', '4 BHK', '5 BHK', '9 BHK', '6 BHK',
       '8 BHK', '7 BHK', '10 BHK', '18 BHK'], dtype=object)

In [145]:
test_data = test_data.drop('society', axis=1)

In [34]:
unique_total_sqft = test_data['total_sqft'].unique()
print(unique_total_sqft)

['650' '1370' '1725' '1000' '1350' '3200' '1717' '700' '800' '2367' '1415'
 '1976' '883' '1445' '2082' '1140' '933' '1155' '1130' '1200' '1055'
 '1555' '1420' '2214' '658' '896.9' '4000' '1035' '2064' '1839' '488'
 '2760' '993' '1225' '900' '825' '3596' '1800' '925' '1167' '1150' '675'
 '2260' '660 - 780' '4800' '1100' '550' '1330' '1320' '1685' '2400' '1770'
 '2100' '600' '2916' '1419.59' '1400' '750' '1070' '1536' '1710'
 '1618 - 1929' '1342' '1600' '938' '1820' '2439' '1665' '1850' '1091'
 '1346' '1720' '3675' '1060' '1059' '1299' '1491' '2088' '581' '1296'
 '1125' '1275' '1095' '10000' '1135' '516' '1194' '2250' '770' '3930'
 '1355' '3000' '2569' '3758' '1116' '1707' '6000' '1240' '1300' '1700'
 '1383' '935' '2072' '2200' '615' '1639' '625' '2289' '2316' '1005' '1080'
 '910' '1110' '1142' '1626.6' '1074' '1230' '660' '1007' '1050' '1348'
 '1176' '1590' '1298' '1161' '1285' '1644' '1819' '1205' '2350' '1270'
 '1221' '840' '1339' '4303' '1315' '1250' '850' '1312' '1990' '760' '5000'


In [149]:
non_integer_values = test_data[test_data['total_sqft'].apply(lambda x: not str(x).isdigit())]
print(non_integer_values['total_sqft'])
print(non_integer_values.shape[0])

27            896.9
50        660 - 780
63          1419.59
70      1618 - 1929
138          1626.6
           ...     
2430        5665.84
2467        1331.95
2584    1020 - 1130
2609    1133 - 1384
2643    2830 - 2882
Name: total_sqft, Length: 63, dtype: object
63


In [151]:
test_data['total_sqft'] = test_data['total_sqft'].apply(process_total_sqft)
test_data['total_sqft'] = test_data['total_sqft'].apply(process_total_sqft)
test_data['total_sqft'] = test_data['total_sqft'].astype(float).round().astype(int)

In [153]:
unique_total_sqft = test_data['area_type'].unique()
print(unique_total_sqft)

['Super built-up  Area' 'Built-up  Area' 'Plot  Area' 'Carpet  Area']


In [155]:
grouped_size_data = test_data.groupby('size').size()
filtered_bath_data = test_data[test_data['bath'].notnull()]
grouped_avg_bath = filtered_bath_data.groupby('size')['bath'].mean().astype(float).round().astype(int)

# Replace blank values in 'bath' with grouped average
def fill_blank_bath(row):
    if pd.isnull(row['bath']):  # Check if 'bath' is null
        return grouped_avg_bath.get(row['size'], row['bath'])  # Get the average for the group or keep as is
    return row['bath']
test_data['bath'] = test_data.apply(fill_blank_bath, axis=1)

In [157]:
grouped_balcony_data = test_data.groupby('balcony').size()
filtered_balcony_data = test_data[test_data['balcony'].notnull()]
grouped_avg_balcony = filtered_balcony_data.groupby('size')['balcony'].mean().astype(float).round().astype(int)

# Replace blank values in 'bath' with grouped average
def fill_blank_balcony(row):
    if pd.isnull(row['balcony']):  # Check if 'bath' is null
        return grouped_avg_balcony.get(row['size'], row['balcony'])  # Get the average for the group or keep as is
    return row['balcony']
test_data['balcony'] = test_data.apply(fill_blank_balcony, axis=1)
average_balcony = round(test_data['balcony'].mean())
test_data['balcony'] = test_data['balcony'].fillna(average_balcony)

In [159]:
missing_count = test_data.isnull().sum()
print(missing_count)

ID              0
area_type       0
availability    0
location        0
size            0
total_sqft      0
bath            0
balcony         0
dtype: int64


### Merge Primary and Secondary data sets to test_data and process

In [161]:
test_data = test_data.merge(dist_data, on='location', how='left')
test_data = test_data.merge(rent_data, on='location', how='left')
test_data.head(3)

Unnamed: 0,ID,area_type,availability,location,size,total_sqft,bath,balcony,dist_from_city,avg_2bhk_rent
0,0,Super built-up Area,Ready To Move,Chamrajpet,2 BHK,650,1.0,1.0,6.7,15875.0
1,1,Super built-up Area,Ready To Move,7th Phase JP Nagar,3 BHK,1370,2.0,1.0,11.0,
2,2,Super built-up Area,Ready To Move,Whitefield,3 BHK,1725,3.0,2.0,17.3,14981.0


In [163]:
test_missing_count = test_data.isnull().sum()
print(test_missing_count)

ID                   0
area_type            0
availability         0
location             0
size                 0
total_sqft           0
bath                 0
balcony              0
dist_from_city     274
avg_2bhk_rent     1751
dtype: int64


In [165]:
test_data.fillna(test_data['dist_from_city'].mean(), inplace=True)
test_data.fillna(test_data['avg_2bhk_rent'].mean(), inplace=True)

In [169]:
for column in ['area_type', 'availability', 'location', 'size']:
        le = LabelEncoder()
        test_data[column] = le.fit_transform(test_data[column])

In [171]:
test_missing_count = test_data.isnull().sum()
print(test_missing_count)

ID                0
area_type         0
availability      0
location          0
size              0
total_sqft        0
bath              0
balcony           0
dist_from_city    0
avg_2bhk_rent     0
dtype: int64


In [173]:
X_test = test_data
test_data['price'] = 0
y_test = test_data['price']

In [None]:
'test2'

In [175]:
test_data.to_csv('tested_data.csv', index=False)

### Feature engineering

In [90]:
# Encode categorical variables for both training and test data
scaler = StandardScaler()
numeric_features = ['area_type', 'availability', 'location', 'size', 'total_sqft', 'bath', 'balcony', 'price', 'dist_from_city', 'avg_2bhk_rent', 'price_per_sqft', 'rent_to_price_ratio']
X_train_scaled = scaler.fit_transform(train_data[numeric_features])

### Model selection and evaluation

In [82]:
# Instantiate and train a Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_scaled, y_train)

In [83]:
# Predict on test data
y_pred = rf_model.predict(X_test_scaled)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE on test data: {rmse}")

RMSE on test data: 177.53237631434916


### Hyper Parameter Tuning

In [92]:
# Define parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# Use GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Evaluate best model
y_pred_best = best_model.predict(X_test)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
print(f"Optimized RMSE: {rmse_best}")


Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time=   6.0s
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time=   6.0s
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time=   6.3s
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time=   6.3s
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time=   6.4s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=  12.9s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=  12.7s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=  12.0s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=  11.6s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=  12.2s
[CV] END max_depth=10, min_samples_split=2, n_estimators=300; total time=  18.0s
[CV] END max_depth=10, min_samples_split=2, n_e

### Save and load Model

In [None]:
# Save the best model
joblib.dump(best_model, 'house_price_model.pkl')

# Save the scaler for deployment
joblib.dump(scaler, 'scaler.pkl')


In [None]:
# Load the model and scaler
loaded_model = joblib.load('house_price_model.pkl')
loaded_scaler = joblib.load('scaler.pkl')

# Predict for a new dataset
new_data = pd.DataFrame({
    'total_sqrt': [1200],
    'size': [3],
    'Distance_from_City_Center': [5],
    'Average_Rent': [1500]
})

# Preprocess the new data
new_data_scaled = loaded_scaler.transform(new_data)
predicted_price = loaded_model.predict(new_data_scaled)
print(f"Predicted House Price: {predicted_price[0]}")


In [None]:
'test11'

In [None]:
""" label_encoders = {}
for column in ['area_type', 'availability', 'location', 'size']:
    le = LabelEncoder()
    trained_data[column] = le.fit_transform(trained_data[column])
    label_encoders[column] = le
"""