# Importing libraries

In [1]:
! pip install xgboost

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Data Wrangling
import pandas as pd
import numpy as np

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor


# Metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler


pd.set_option('display.max_columns', None)


# Overviwing the data

In [3]:
data = pd.read_csv('./Bengaluru_House_Data.csv')
data

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [4]:
data.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

# Cleaning the data

In [5]:
data.shape

(13320, 9)

In [6]:
# let's see how many area we have and the freq of each one of them
data.groupby('area_type')['area_type'].agg('count')

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

In [7]:
# dropping the columns that will not be used 
data = data.drop(['area_type','society','availability'],axis=1)
data

Unnamed: 0,location,size,total_sqft,bath,balcony,price
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.00
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.00
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.00
4,Kothanur,2 BHK,1200,2.0,1.0,51.00
...,...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453,4.0,0.0,231.00
13316,Richards Town,4 BHK,3600,5.0,,400.00
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,1.0,60.00
13318,Padmanabhanagar,4 BHK,4689,4.0,1.0,488.00


In [8]:
# how many nan values are there
data.isna().sum()

location        1
size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

In [9]:
# droping nan values
data.dropna(inplace=True)


In [10]:
data.isna().sum()

location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [11]:
data.shape

(12710, 6)

In [12]:
# how many unique sizes are there
data['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '3 Bedroom', '1 BHK', '1 RK',
       '4 BHK', '1 Bedroom', '2 Bedroom', '6 Bedroom', '8 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '11 Bedroom', '43 Bedroom',
       '14 BHK', '8 BHK', '12 Bedroom', '10 Bedroom', '13 BHK'],
      dtype=object)

In [13]:
# creating new column for the size
data['bhk'] = data['size'].apply(lambda x : int(x.split(' ')[0]) )


In [14]:
data

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.00,4
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.00,3
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.00,3
4,Kothanur,2 BHK,1200,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...
13314,Green Glen Layout,3 BHK,1715,3.0,3.0,112.00,3
13315,Whitefield,5 Bedroom,3453,4.0,0.0,231.00,5
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,1.0,60.00,2
13318,Padmanabhanagar,4 BHK,4689,4.0,1.0,488.00,4


In [15]:
data[data['bhk'] > 20]

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,0.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,0.0,660.0,43


In [16]:
data.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [17]:
def is_it_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [18]:
data[~data.total_sqft.apply(is_it_float)].head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,0.0,186.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,0.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,0.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,0.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,0.0,56.8,2


In [19]:

def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return np.average([float(tokens[0]), float(tokens[1])])
    try:
        return float(x)
    except:
        return None


In [20]:
# testing the function
convert_sqft_to_num('2100 - 2850')

2475.0

In [21]:
data['total_sqft'] = data['total_sqft'].apply(convert_sqft_to_num)


In [22]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2


In [23]:
# how many data available per location
data.location = data.location.apply(lambda x : x.strip())
stats_of_location = data.groupby('location')['location'].agg('count')
stats_of_location.sort_values(ascending=False)

location
Whitefield             515
Sarjapur  Road         372
Electronic City        302
Kanakpura Road         261
Thanisandra            234
                      ... 
Whietfield,              1
Whitefield ECC Road      1
Williams Town            1
Xavier Layout            1
Viviani Road             1
Name: location, Length: 1254, dtype: int64

In [24]:
# if we have a location has less than 10 let's call it other
len(stats_of_location[stats_of_location<=10])

1017

In [25]:
# stats_location_less_than_10
stats_of_location_less_than_10 = stats_of_location[stats_of_location<=10]
stats_of_location_less_than_10.sort_values(ascending=False)

location
HAL 2nd Stage                                      10
Gunjur Palya                                       10
Ganga Nagar                                        10
Naganathapura                                      10
Nagappa Reddy Layout                               10
                                                   ..
12th cross srinivas nagar banshankari 3rd stage     1
1 Ramamurthy Nagar                                  1
1 Giri Nagar                                        1
5 Bedroom Farm House in Lakshmipura                 1
1 Annasandrapalya                                   1
Name: location, Length: 1017, dtype: int64

In [26]:
data.location = data.location.apply(lambda x: 'other' if x in stats_of_location_less_than_10 else x)

In [27]:
len(data.location.unique())

238

In [28]:
data

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.00,4
2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.00,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.00,3
4,Kothanur,2 BHK,1200.0,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...
13314,Green Glen Layout,3 BHK,1715.0,3.0,3.0,112.00,3
13315,Whitefield,5 Bedroom,3453.0,4.0,0.0,231.00,5
13317,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,1.0,60.00,2
13318,Padmanabhanagar,4 BHK,4689.0,4.0,1.0,488.00,4


# Outlier detection and removal 

In [29]:
data[data.total_sqft/data.bhk < 300]


Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
58,Murugeshpalya,6 Bedroom,1407.0,4.0,1.0,150.0,6
68,Devarachikkanahalli,8 Bedroom,1350.0,7.0,0.0,85.0,8
70,other,3 Bedroom,500.0,3.0,2.0,100.0,3
78,Kaval Byrasandra,2 BHK,460.0,1.0,0.0,22.0,2
89,Rajaji Nagar,6 Bedroom,710.0,6.0,3.0,160.0,6
...,...,...,...,...,...,...,...
13219,Laggere,7 Bedroom,1590.0,9.0,3.0,132.0,7
13221,other,9 Bedroom,1178.0,9.0,1.0,75.0,9
13281,Margondanahalli,5 Bedroom,1375.0,5.0,1.0,125.0,5
13303,Vidyaranyapura,5 Bedroom,774.0,5.0,3.0,70.0,5


In [30]:
data = data[~(data.total_sqft/data.bhk < 300)]
data.shape


(12055, 7)

In [31]:
data.describe()

Unnamed: 0,total_sqft,bath,balcony,price,bhk
count,12013.0,12055.0,12055.0,12055.0,12055.0
mean,1542.315982,2.513231,1.58623,105.120959,2.608461
std,1181.094228,1.009891,0.809461,134.149494,0.927608
min,300.0,1.0,0.0,9.0,1.0
25%,1107.0,2.0,1.0,48.45,2.0
50%,1285.0,2.0,2.0,68.0,2.0
75%,1660.0,3.0,2.0,110.0,3.0
max,52272.0,13.0,3.0,2912.0,13.0


In [32]:
data = data[data.bath < data.bhk+2]
data.shape

(11926, 7)

In [33]:
data


Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.00,4
2,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.00,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.00,3
4,Kothanur,2 BHK,1200.0,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...
13314,Green Glen Layout,3 BHK,1715.0,3.0,3.0,112.00,3
13315,Whitefield,5 Bedroom,3453.0,4.0,0.0,231.00,5
13317,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,1.0,60.00,2
13318,Padmanabhanagar,4 BHK,4689.0,4.0,1.0,488.00,4


In [34]:
### taking some examples to test the model after we'll train it on all data
unseen_data = data.sample(4,random_state=42)
print("data shape:", data.shape)
unseen_data

data shape: (11926, 7)


Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
11641,Whitefield,3 BHK,3758.0,3.0,1.0,300.0,3
2294,Kengeri,3 BHK,1250.0,3.0,3.0,48.0,3
7335,other,1 Bedroom,600.0,1.0,0.0,52.0,1
391,Sompura,2 BHK,825.0,2.0,1.0,33.0,2


In [35]:
data = data.drop(unseen_data.index)
print(data.shape)

(11922, 7)


# Encoding time

In [36]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11922 entries, 0 to 13319
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    11922 non-null  object 
 1   size        11922 non-null  object 
 2   total_sqft  11880 non-null  float64
 3   bath        11922 non-null  float64
 4   balcony     11922 non-null  float64
 5   price       11922 non-null  float64
 6   bhk         11922 non-null  int64  
dtypes: float64(4), int64(1), object(2)
memory usage: 745.1+ KB


In [37]:
data.columns

Index(['location', 'size', 'total_sqft', 'bath', 'balcony', 'price', 'bhk'], dtype='object')

In [38]:
from sklearn.preprocessing import LabelEncoder
label_encoder_location = LabelEncoder()


In [39]:
data['location'] = label_encoder_location.fit_transform(data['location'])

In [40]:
data

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
0,78,2 BHK,1056.0,2.0,1.0,39.07,2
1,59,4 Bedroom,2600.0,5.0,3.0,120.00,4
2,222,3 BHK,1440.0,2.0,3.0,62.00,3
3,156,3 BHK,1521.0,3.0,1.0,95.00,3
4,148,2 BHK,1200.0,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...
13314,86,3 BHK,1715.0,3.0,3.0,112.00,3
13315,231,5 Bedroom,3453.0,4.0,0.0,231.00,5
13317,190,2 BHK,1141.0,2.0,1.0,60.00,2
13318,181,4 BHK,4689.0,4.0,1.0,488.00,4


In [41]:
data.columns

Index(['location', 'size', 'total_sqft', 'bath', 'balcony', 'price', 'bhk'], dtype='object')

# Scaling time

In [42]:
data.dropna(inplace=True)

In [43]:
X = data[['location', 'total_sqft', 'bath', 'balcony','bhk']]
y = data[['price']]

In [44]:
X

Unnamed: 0,location,total_sqft,bath,balcony,bhk
0,78,1056.0,2.0,1.0,2
1,59,2600.0,5.0,3.0,4
2,222,1440.0,2.0,3.0,3
3,156,1521.0,3.0,1.0,3
4,148,1200.0,2.0,1.0,2
...,...,...,...,...,...
13314,86,1715.0,3.0,3.0,3
13315,231,3453.0,4.0,0.0,5
13317,190,1141.0,2.0,1.0,2
13318,181,4689.0,4.0,1.0,4


In [45]:
y

Unnamed: 0,price
0,39.07
1,120.00
2,62.00
3,95.00
4,51.00
...,...
13314,112.00
13315,231.00
13317,60.00
13318,488.00


In [46]:
scaler = StandardScaler()

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [48]:
num_columns = X_train.columns


In [49]:
num_columns

Index(['location', 'total_sqft', 'bath', 'balcony', 'bhk'], dtype='object')

In [50]:
# Select numerical columns from X_train
X_train_num = X_train[num_columns]

# Apply StandardScaler to numerical columns
X_train_scaled = scaler.fit_transform(X_train_num)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=num_columns)

In [51]:
X_train_scaled

Unnamed: 0,location,total_sqft,bath,balcony,bhk
0,1.129502,-0.450860,-0.497199,-1.955249,-0.642906
1,-1.528422,-0.126335,0.555223,-1.955249,0.439116
2,-1.910005,0.146336,0.555223,1.763519,0.439116
3,0.918974,-0.014585,-0.497199,0.523930,0.439116
4,-1.238945,-0.989051,-1.549620,-0.715659,-1.724929
...,...,...,...,...,...
9499,-0.949468,-0.783430,-1.549620,-0.715659,-1.724929
9500,0.642655,-0.127229,0.555223,0.523930,0.439116
9501,0.563707,0.074816,0.555223,-1.955249,0.439116
9502,0.511074,-0.190704,-0.497199,0.523930,-0.642906


In [52]:
X_test_scaled = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=num_columns)
X_test_scaled

Unnamed: 0,location,total_sqft,bath,balcony,bhk
0,1.129502,-0.439238,-0.497199,0.523930,-0.642906
1,-1.186313,-0.135276,-0.497199,0.523930,0.439116
2,1.050554,-0.291727,-0.497199,0.523930,-0.642906
3,1.129502,0.790021,4.764911,1.763519,4.767206
4,0.168965,-0.282787,-0.497199,-1.955249,-0.642906
...,...,...,...,...,...
2371,-0.383672,-0.506288,-0.497199,-0.715659,-0.642906
2372,1.129502,-0.372187,-0.497199,0.523930,-0.642906
2373,0.840025,-0.297985,-0.497199,-0.715659,-0.642906
2374,0.392652,-0.401689,-0.497199,0.523930,-0.642906


# Modeling time

In [53]:
y_train.shape

(9504, 1)

In [54]:
# Convert y_train to a 1D array
y_train = np.ravel(y_train)
y_train.shape

(9504,)

### Let's find out which model is good

In [55]:
# Define models and their parameter grids
models = {
#     'SVR': (SVR(), {'model__C': [0.1,  1.0,  10.0], 'model__kernel': ['linear', 'rbf']}),
    'Random Forest': (RandomForestRegressor(), {'model__n_estimators': [100,  200,  300,500,1500,2000,2500,3000]}),
    'Linear Regression': (LinearRegression(),{'model__n_jobs':[1,2,3]}),
    'Gradient Boosting': (GradientBoostingRegressor(), {'model__n_estimators': [100,  200,  300,500,1500,2000,2500,3000], 'model__learning_rate': [0.01,0.001,  0.1, 1.0]}),
    'Decision Tree': (DecisionTreeRegressor(), {'model__max_depth': [None,  5,  10,  15,20,30,50,100]}),
    'KNN': (KNeighborsRegressor(), {'model__n_neighbors': range(1,  10)}),
}

# Apply GridSearchCV to each model
results = {}
for name, (model, param_grid) in models.items():
    pipe = Pipeline([('model', model)])
    grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1, scoring='r2').fit(X_train_scaled, y_train)
    results[name] = {
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_
    }

In [56]:
# Find the best model
best_model_name = max(results, key=lambda x: results[x]['best_score'])
best_model_params = results[best_model_name]['best_params']
best_model_score = results[best_model_name]['best_score']

In [57]:
print(f"Best model: {best_model_name}")
print(f"Best parameters: {best_model_params}")
print(f"Best score: {best_model_score}")

Best model: Gradient Boosting
Best parameters: {'model__learning_rate': 0.1, 'model__n_estimators': 500}
Best score: 0.6125193278040191


In [58]:

gbr_model = GradientBoostingRegressor(learning_rate= 0.1, n_estimators= 500)
gbr_model.fit(X_train_scaled,y_train)


In [59]:
y_pred = gbr_model.predict(X_test_scaled)

In [60]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 83.8161935393295


In [61]:
r2_score(y_pred,y_test)

0.46761125380241064

# Now let's train the model on all the data

In [62]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=num_columns)

In [63]:
y = np.ravel(y)
gbr_model_best = GradientBoostingRegressor(learning_rate= 0.001, n_estimators= 3000)
gbr_model_best.fit(X_scaled,y)

# Saving the trained model, scaler, and encoders

In [64]:
import joblib

In [65]:
# Save the scaler to a file
joblib.dump(scaler, 'scaler.pkl')

# Save the location encoder to a file
joblib.dump(label_encoder_location, 'encoder_location.pkl')

# Save the model to a file
joblib.dump(gbr_model_best, 'best_model.pkl')

['best_model.pkl']

# let's test it on the unseen data.

### Preprocessing the unseen data

In [66]:
unseen_data

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
11641,Whitefield,3 BHK,3758.0,3.0,1.0,300.0,3
2294,Kengeri,3 BHK,1250.0,3.0,3.0,48.0,3
7335,other,1 Bedroom,600.0,1.0,0.0,52.0,1
391,Sompura,2 BHK,825.0,2.0,1.0,33.0,2


In [67]:
# saving the price column
unseen_data_price = unseen_data['price']

#droping size, and price columns
unseen_data.drop(columns=['price','size'],inplace=True)

# encoding location 
unseen_data['location'] = label_encoder_location.transform(unseen_data['location'])
unseen_data

Unnamed: 0,location,total_sqft,bath,balcony,bhk
11641,231,3758.0,3.0,1.0,3
2294,138,1250.0,3.0,3.0,3
7335,237,600.0,1.0,0.0,1
391,209,825.0,2.0,1.0,2


In [68]:
# scaling the data
unseen_data = scaler.transform(unseen_data)
unseen_data_price

11641    300.0
2294      48.0
7335      52.0
391       33.0
Name: price, dtype: float64

### Testing the model

In [69]:
gbr_model_best.predict(unseen_data)



array([350.65094076,  78.27604088,  48.75866212,  49.94943106])