### House Price Prediction 

### Import Modules

In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error,make_scorer
from sklearn.model_selection import train_test_split,RandomizedSearchCV, GridSearchCV,cross_val_score

### Import the data set 

In [2]:
df = pd.read_csv(r"C:\Users\rkeer\Downloads\bengaluru_house_prices.csv")
df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [3]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


### Shape of a dataset

In [4]:
df.shape

(13320, 9)

### Check the Missing values

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [6]:
for column in df.columns:
    print(df[column].value_counts())
    print("*"*20)

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
********************
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
********************
location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64
********************
size
2 BHK    

### Drop Unwanted columns

In [7]:
df.drop(columns=['area_type','availability','balcony','society'], inplace = True)

In [8]:
df.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


### Fill the Missing Values

In [10]:
df['bath']= df['bath'].fillna(df['bath'].median())
df['size'].fillna('Unknown',inplace=True)
df['location'] = df['location'].fillna('Sarjapur Road')

In [11]:
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


### Change the datatype total_sqft object to float64

In [13]:
def convertRange(x):
    temp = x.split('-')
    if len(temp) == 2:
        return (float(temp[0]) + float(temp[1]))/2
    try:
        return float(x)
    except:
        return None

In [14]:
df['Total_sqft'] = df['total_sqft'].apply(convertRange)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
 5   Total_sqft  13274 non-null  float64
dtypes: float64(3), object(3)
memory usage: 624.5+ KB


In [16]:
df['Total_sqft'].isnull().sum()

46

In [17]:
df['Total_sqft']= df['Total_sqft'].fillna(df['Total_sqft'].median())

In [18]:
df.describe()

Unnamed: 0,bath,price,Total_sqft
count,13320.0,13320.0,13320.0
mean,2.688814,112.565627,1558.647202
std,1.338754,148.971674,1236.376834
min,1.0,8.0,1.0
25%,2.0,50.0,1100.0
50%,2.0,72.0,1276.0
75%,3.0,120.0,1678.0
max,40.0,3600.0,52272.0


### Splitting the data

In [19]:
X = df['Total_sqft']
Y = df['price']

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state = 42)

### Check the shape of a X_train 

In [23]:
X_train.shape

(10656, 1)

### Reshape a X_train

In [22]:
X_train = X_train.values.reshape(-1,1)

### Check the shape of a X_test 

In [26]:
X_test.shape

(2664, 1)

### Reshape a X_test

In [25]:
X_test = X_test.values.reshape(-1,1)

### Create a Model

In [27]:
Model = LinearRegression()

### Fit the Model

In [28]:
Model.fit(X_train,Y_train)

### Predict the model

In [29]:
pred = Model.predict(X_test)        

In [30]:
r2_score(Y_test,pred)

0.46308281979912524

In [31]:
mean_absolute_error(Y_test,pred)

50.16412582139308

In [32]:
mean_squared_error(Y_test,pred)

11431.255686427681

In [33]:
X.shape

(13320,)

In [34]:
X = X.values.reshape(-1,1)

In [35]:
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
cv_scores = cross_val_score(Model, X, Y, scoring=mse_scorer, cv=5)
cv_scores = -cv_scores

print("Cross-Validation MSE Scores:", cv_scores)
print("Mean MSE:", np.mean(cv_scores))
print("Standard Deviation of MSE:", np.std(cv_scores))

Cross-Validation MSE Scores: [17120.41632613 10204.91163919 18706.31838923 14619.83425292
 19254.46290887]
Mean MSE: 15981.188703267084
Standard Deviation of MSE: 3306.271557739117
