# Importing Neccesary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Importing Data

In [2]:
data=pd.read_csv('Bengaluru_House_Data.csv')
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


# Checking Basic Information about data

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [4]:
data.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [5]:
for column in data.columns:
    print(data[column].value_counts())
    print("*"*20)

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64
********************
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: availability, Length: 81, dtype: int64
********************
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int64
********************
2 BHK         5199
3 BHK        

# Data Preprocessing

In [6]:
def availability(text):
    if text =='Ready To Move':
        return text
    else:
        return 'Upcoming'

In [7]:
data['availability']=data['availability'].apply(availability)

In [8]:
data['availability'].value_counts()

Ready To Move    10581
Upcoming          2739
Name: availability, dtype: int64

#### Missing Value Imputation

In [9]:
data.drop(columns=['society'],inplace=True)

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   total_sqft    13320 non-null  object 
 5   bath          13247 non-null  float64
 6   balcony       12711 non-null  float64
 7   price         13320 non-null  float64
dtypes: float64(3), object(5)
memory usage: 832.6+ KB


In [11]:
data['balcony']=data['balcony'].fillna(data['balcony'].median())

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   total_sqft    13320 non-null  object 
 5   bath          13247 non-null  float64
 6   balcony       13320 non-null  float64
 7   price         13320 non-null  float64
dtypes: float64(3), object(5)
memory usage: 832.6+ KB


In [13]:
data=data.dropna()

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13246 entries, 0 to 13319
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13246 non-null  object 
 1   availability  13246 non-null  object 
 2   location      13246 non-null  object 
 3   size          13246 non-null  object 
 4   total_sqft    13246 non-null  object 
 5   bath          13246 non-null  float64
 6   balcony       13246 non-null  float64
 7   price         13246 non-null  float64
dtypes: float64(3), object(5)
memory usage: 931.4+ KB


In [15]:
#data['bhk']=data['size'].str.split().str.get(0).astype(int)

In [16]:
data['size']=data['size'].astype(str)
data['beds']=data['size'].apply(lambda x:x.split()[0])
data['beds']=data['beds'].astype(int)

In [17]:
data.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price,beds
0,Super built-up Area,Upcoming,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.0,2


In [18]:
data['beds'].value_counts()

2     5527
3     4832
4     1395
1      649
5      353
6      221
7      100
8       89
9       54
10      14
11       4
27       1
19       1
16       1
43       1
14       1
12       1
13       1
18       1
Name: beds, dtype: int64

In [19]:
data=data[data['beds']<8]
data

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price,beds
0,Super built-up Area,Upcoming,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.00,4
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.00,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.00,3
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,3453,4.0,0.0,231.00,5
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,3600,5.0,2.0,400.00,4
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,1141,2.0,1.0,60.00,2
13318,Super built-up Area,Upcoming,Padmanabhanagar,4 BHK,4689,4.0,1.0,488.00,4


In [20]:
data['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [21]:
def convertrange(x):
    temp=x.split('-')
    if len(temp)==2:
        return (float(temp[0])+float(temp[1]))/2
    try:
        return float(x)
    except:
        return None

In [22]:
data['total_sqft']=data['total_sqft'].apply(convertrange)

In [23]:
data['total_sqft'].value_counts()

1200.0    797
1100.0    221
1500.0    199
2400.0    189
600.0     173
         ... 
1922.0      1
2370.0      1
5.0         1
1004.0      1
4689.0      1
Name: total_sqft, Length: 1961, dtype: int64

#### Feature Construction

In [24]:
data['price_per_sqft']=data['price']*100000/data['total_sqft']

In [25]:
data.describe()

Unnamed: 0,total_sqft,bath,balcony,price,beds,price_per_sqft
count,13034.0,13077.0,13077.0,13077.0,13077.0,13034.0
mean,1543.264365,2.616732,1.598149,110.582923,2.720043,7852.883
std,1170.498553,1.096417,0.802417,144.468132,1.015861,107399.3
min,1.0,1.0,0.0,8.0,1.0,267.8298
25%,1100.0,2.0,1.0,50.0,2.0,4260.474
50%,1275.0,2.0,2.0,70.0,3.0,5416.667
75%,1665.0,3.0,2.0,119.0,3.0,7239.955
max,52272.0,9.0,3.0,2912.0,7.0,12000000.0


### Outlier detection and removing

In [26]:
ulm=data['price_per_sqft'].quantile(0.95)
llm=data['price_per_sqft'].quantile(0.05)
data=data[(data['price_per_sqft']>llm) & (data['price_per_sqft']<ulm)]

In [27]:
data.describe()

Unnamed: 0,total_sqft,bath,balcony,price,beds,price_per_sqft
count,11713.0,11713.0,11713.0,11713.0,11713.0,11713.0
mean,1519.41404,2.574575,1.61214,97.311312,2.671391,6057.64957
std,883.945949,1.020332,0.798038,87.271905,0.938794,2389.841336
min,276.0,1.0,0.0,13.5,1.0,3105.990783
25%,1107.83,2.0,1.0,51.0,2.0,4373.848987
50%,1290.0,2.0,2.0,70.0,3.0,5416.666667
75%,1664.0,3.0,2.0,110.0,3.0,6961.325967
max,30400.0,9.0,3.0,2100.0,7.0,14981.273408


In [28]:
data[data['bath']>10]

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price,beds,price_per_sqft


In [29]:
data.drop('size',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop('size',axis=1,inplace=True)


In [30]:
data['location']=data['location'].apply(lambda x:x.strip())
location_count=data['location'].value_counts()
location_count_10=location_count[location_count<=10]
data['location']=data['location'].apply(lambda x:'other' if x in location_count_10 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['location']=data['location'].apply(lambda x:x.strip())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['location']=data['location'].apply(lambda x:'other' if x in location_count_10 else x)


In [31]:
data['location'].value_counts()

other                    2598
Whitefield                516
Sarjapur  Road            379
Kanakpura Road            262
Thanisandra               230
                         ... 
Marsur                     11
Banashankari Stage V       11
Tindlu                     11
Judicial Layout            11
Banashankari Stage II      11
Name: location, Length: 217, dtype: int64

In [32]:
data.drop(columns=['area_type','price_per_sqft'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=['area_type','price_per_sqft'],inplace=True)


# Seperation of features and label for training and Validation 

In [33]:
X=data.drop('price',axis=1)
y=data['price']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)

# Column Transformer (Linear Regression)

In [34]:
from sklearn.compose import ColumnTransformer

In [35]:
from sklearn.preprocessing import OneHotEncoder

In [36]:
trf1=ColumnTransformer([
    ('onehotenc',OneHotEncoder(sparse=False),[0,1])
],remainder='passthrough')

In [37]:
trf2=ColumnTransformer([
    ('scaler',StandardScaler(),slice(0,227))
])

In [38]:
trf3=LinearRegression()

# Pipeline

In [39]:
from sklearn.pipeline import Pipeline,make_pipeline

In [40]:
pipe=Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3)
])

In [41]:
pipe.fit(X_train,y_train)



In [42]:
y_pred=pipe.predict(X_test)

In [43]:
y_pred

array([136.20786148,  59.93196858, 214.67137219, ..., 196.81578804,
       104.22022788, 362.46643703])

In [44]:
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

In [45]:
mae=mean_absolute_error(y_test,y_pred)
mae

25.25859718778112

In [46]:
acc=r2_score(y_test,y_pred)

In [47]:
acc

0.7577218385256188

# Column Tranformer GradientBoosting Regressor

In [48]:
from sklearn.ensemble import GradientBoostingRegressor

In [49]:
trf3g=GradientBoostingRegressor()

In [50]:
pipegbr=Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3g',trf3g)
])

In [51]:
pipegbr.fit(X_train,y_train)



In [52]:
y_predgbr=pipegbr.predict(X_test)
maegbr=mean_absolute_error(y_test,y_predgbr)
maegbr

23.367741481374697

In [53]:
r2_gbr=r2_score(y_test,y_predgbr)
r2_gbr

0.7962296801314525

# Hyperparameter Tuning Using GridSearchCV

In [54]:
from sklearn.model_selection import GridSearchCV

In [55]:
paramgbr={
    'trf3g__n_estimators':[50,100,200],
    'trf3g__min_samples_split':[5,10,20],
    'trf3g__max_depth':[2,5,10]
    
}

In [56]:
gridgbr = GridSearchCV(pipegbr, paramgbr, cv=3, scoring='r2')
gridgbr.fit(X_train, y_train)







In [57]:
gridgbr.best_score_

0.8133497803283843

In [58]:
gridgbr.best_params_

{'trf3g__max_depth': 5,
 'trf3g__min_samples_split': 5,
 'trf3g__n_estimators': 200}

In [59]:
trf3gg=GradientBoostingRegressor(max_depth=5,min_samples_split=5,n_estimators=200)

In [60]:
pipegbrg=Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3gg',trf3gg)
])

In [61]:
pipegbrg.fit(X_train,y_train)
y_predgbrg=pipegbrg.predict(X_test)
maegbrgg=mean_absolute_error(y_test,y_predgbrg)
maegbrgg



22.187913462354167

### Best r2_score using above parameters is 0.8134 which is almost 2% higher than the gradient boosting without hyper parameter tuning

# Column Transformer SVR

In [62]:
from sklearn.svm import SVR

In [63]:
trf3s=SVR()

In [64]:
pipesvr=Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3s',trf3s)
])

In [65]:
pipesvr.fit(X_train,y_train)
y_predsvr=pipesvr.predict(X_test)
maesvr=mean_absolute_error(y_test,y_predsvr)
maesvr



36.29097486774229

In [66]:
r2svr=r2_score(y_test,y_predsvr)
r2svr

0.2478693510475778

# GridSearchCV-SVR

In [67]:
paramsvr={
    'trf3s__kernel':['linear','poly','rbf'],
    'trf3s__degree':[2,3,4,5],
    'trf3s__C':[0.1,0.3,0.5,1]
}

In [68]:
gridsvr=GridSearchCV(pipesvr, paramsvr, cv=3, scoring='r2')
gridsvr.fit(X_train,y_train)













In [69]:
gridsvr.best_score_

0.7595425257493718

In [70]:
gridsvr.best_params_

{'trf3s__C': 1, 'trf3s__degree': 2, 'trf3s__kernel': 'linear'}

In [71]:
conclusion_df=pd.DataFrame(
    {'Algorithm':['LinearRegressor','GradientBoosting','GradientBoosting_With_Parameter','SVM','SVM_with_Parameter'],
    'R2_Score':[0.75,0.79,0.813,0.24,0.75],
    'MAE':[25.25,23.35,22.179,36.29,None]}
)
conclusion_df

Unnamed: 0,Algorithm,R2_Score,MAE
0,LinearRegressor,0.75,25.25
1,GradientBoosting,0.79,23.35
2,GradientBoosting_With_Parameter,0.813,22.179
3,SVM,0.24,36.29
4,SVM_with_Parameter,0.75,


## Conclusion:
#### Gradient Boosting Regressor has best R2_score and MAE out of three algorithms

### GradientBoostingwith HyperParameter has given best results So we will use GradientBoosting with above best Parameters for the Production 

# Pickling

In [72]:
import pickle

In [73]:
pickle.dump(pipegbrg,open('GBR_with_para.pkl','wb'))