In [79]:
import numpy as np
import pandas as pd
import seaborn as sns

In [80]:
df = pd.read_csv("Bengaluru_House_Data.csv")

In [81]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [82]:
columns_to_drop = ['area_type', 'availability', 'location', 'society']

In [83]:
df = df.drop(columns = columns_to_drop, errors = "ignore")

In [84]:
df.head()

Unnamed: 0,size,total_sqft,bath,balcony,price
0,2 BHK,1056,2.0,1.0,39.07
1,4 Bedroom,2600,5.0,3.0,120.0
2,3 BHK,1440,2.0,3.0,62.0
3,3 BHK,1521,3.0,1.0,95.0
4,2 BHK,1200,2.0,1.0,51.0


In [85]:
df1 = df.dropna()

In [86]:
df1.shape

(12711, 5)

In [87]:
df1.isnull().sum()

size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [88]:
#coverting the total_sqft's datatype into float
#here we observe that 'total_sqft' contain string value in diff format
#float, int like value 1689.28,817 
# range value: 540 - 740 
# number and string: 142.84Sq. Meter, 117Sq. Yards, 1Grounds
# best strategy is to convert it into number by spliting it

total_sqft_int = []
for str_val in df1['total_sqft']:
    try:
        total_sqft_int.append(float(str_val)) # if '123.4' like this value in str then conver in float
    except:
        try:
            temp = []
            temp = str_val.split('-')
            total_sqft_int.append((float(temp[0])+float(temp[-1]))/2) # '123 - 534' this str value split and take mean
        except:
            total_sqft_int.append(np.nan) # if value not contain in above format then consider as nan


In [89]:
df2 = df1.join(pd.DataFrame({'total_sqft_int':total_sqft_int}))
df2.head()

Unnamed: 0,size,total_sqft,bath,balcony,price,total_sqft_int
0,2 BHK,1056,2.0,1.0,39.07,1056.0
1,4 Bedroom,2600,5.0,3.0,120.0,2600.0
2,3 BHK,1440,2.0,3.0,62.0,1440.0
3,3 BHK,1521,3.0,1.0,95.0,1521.0
4,2 BHK,1200,2.0,1.0,51.0,1200.0


In [90]:
#Changing the size feature to show the numeric values only
size_int = []
for str_val in df2['size']:
    temp=[]
    temp = str_val.split(" ")
    try:
        size_int.append(int(temp[0]))
    except:
        size_int.append(np.nan)

In [91]:
df3 = df2.join(pd.DataFrame({'bhk':size_int}))
df3.shape

(12711, 7)

In [92]:
df3.head()

Unnamed: 0,size,total_sqft,bath,balcony,price,total_sqft_int,bhk
0,2 BHK,1056,2.0,1.0,39.07,1056.0,2.0
1,4 Bedroom,2600,5.0,3.0,120.0,2600.0,4.0
2,3 BHK,1440,2.0,3.0,62.0,1440.0,3.0
3,3 BHK,1521,3.0,1.0,95.0,1521.0,3.0
4,2 BHK,1200,2.0,1.0,51.0,1200.0,2.0


In [93]:
columns_to_drop_df3 = ['size', 'total_sqft']

In [94]:
df4 = df3. drop(columns = columns_to_drop_df3, errors = 'ignore')

In [95]:
df4.head()

Unnamed: 0,bath,balcony,price,total_sqft_int,bhk
0,2.0,1.0,39.07,1056.0,2.0
1,5.0,3.0,120.0,2600.0,4.0
2,2.0,3.0,62.0,1440.0,3.0
3,3.0,1.0,95.0,1521.0,3.0
4,2.0,1.0,51.0,1200.0,2.0


creating a new feature - "Price per sqaure feet"

In [59]:
# df4['price_per_squarefeet'] = df['price'] * 100000 / df4['total_sqft_int']  

In [60]:
# df4.head()

Unnamed: 0,bath,balcony,price,total_sqft_int,bhk,price_per_squarefeet
0,2.0,1.0,39.07,1056.0,2.0,3699.810606
1,5.0,3.0,120.0,2600.0,4.0,4615.384615
2,2.0,3.0,62.0,1440.0,3.0,4305.555556
3,3.0,1.0,95.0,1521.0,3.0,6245.890861
4,2.0,1.0,51.0,1200.0,2.0,4250.0


In [96]:
df5=df4.dropna()

Defining a function to remove outliers using z-score

In [97]:
lower_threshold = df5['price'].quantile(0.05)
upper_threshold = df5['price'].quantile(0.95)

In [98]:
filtered_data = df5[(df5['price'] >= lower_threshold) & (df5['price'] <= upper_threshold)]

In [99]:
filtered_data.shape

(10932, 5)

In [100]:
df6 = filtered_data

In [101]:
df6.head()

Unnamed: 0,bath,balcony,price,total_sqft_int,bhk
0,2.0,1.0,39.07,1056.0,2.0
1,5.0,3.0,120.0,2600.0,4.0
2,2.0,3.0,62.0,1440.0,3.0
3,3.0,1.0,95.0,1521.0,3.0
4,2.0,1.0,51.0,1200.0,2.0


# Performing zscore normalization - standardization

In [218]:
# from sklearn.preprocessing import MinMaxScaler

In [221]:
# data = df6[['bath', 'balcony', 'total_sqft_int', 'bhk']].values

In [222]:
# scaler = MinMaxScaler()
# normalized_data = scaler.fit_transform(data)

In [223]:
# normalized_df = pd.DataFrame(normalized_data, columns=['bath', 'balcony', 'total_sqft_int', 'bhk'])

In [226]:
# df7=normalized_df

In [276]:
# normalized_df.head()

In [228]:
# y1 = df6['price_per_squarefeet']
# x1 = df6.drop(['price','price_per_squarefeet'], axis = 1)

In [240]:
# df7['price'] = df6['price']

In [275]:
# df7.head()

In [254]:
# df8=df7.dropna()

# Training and Prediction

In [265]:
y1 = df6['price']
x1 = df6.drop(['price'], axis=1 )

In [266]:
print(y1)

0         39.07
1        120.00
2         62.00
3         95.00
4         51.00
          ...  
12706    148.00
12707     56.00
12708    200.00
12709     44.73
12710     45.00
Name: price, Length: 10932, dtype: float64


In [267]:
print(x1)

       bath  balcony  total_sqft_int  bhk
0       2.0      1.0          1056.0  2.0
1       5.0      3.0          2600.0  4.0
2       2.0      3.0          1440.0  3.0
3       3.0      1.0          1521.0  3.0
4       2.0      1.0          1200.0  2.0
...     ...      ...             ...  ...
12706   3.0      2.0          1715.0  3.0
12707   3.0      2.0          3453.0  5.0
12708   3.0      2.0          1141.0  2.0
12709   2.0      2.0          4689.0  4.0
12710   2.0      1.0           550.0  1.0

[10932 rows x 4 columns]


In [268]:
from sklearn.model_selection import train_test_split
X1_train, X1_test, y1_train, y1_test = train_test_split(
    x1, y1, test_size=0.2, random_state=42
)

In [269]:
# Fitting the Multiple Linear Regression model
from sklearn.linear_model import LinearRegression
mlr = LinearRegression()  
mlr.fit(X1_train, y1_train)

In [270]:
print("Intercept: ", mlr.intercept_)
print("Coefficients:")
list(zip(x1, mlr.coef_))

Intercept:  13.622903227412635
Coefficients:


[('bath', 27.986207063224466),
 ('balcony', 2.124455705992556),
 ('total_sqft_int', -0.0011265482124259852),
 ('bhk', 0.47043760557007147)]

In [271]:
#Prediction of test set
y_pred_mlr = mlr.predict(X1_test)
#Predicted values
print("Prediction for test set: {}".format(y_pred_mlr))

Prediction for test set: [104.0143465   97.17054177 243.72345182 ...  99.76543508 185.35554341
 131.21468246]


In [272]:
#Actual value and the predicted value
mlr_diff = pd.DataFrame({'Actual value': y1_test, 'Predicted value': y_pred_mlr})
mlr_diff.head()

Unnamed: 0,Actual value,Predicted value
625,100.0,104.014346
4525,85.0,97.170542
1105,145.0,243.723452
8087,45.4,41.448681
5163,80.99,71.297525


In [273]:
from sklearn.metrics import mean_squared_error

In [274]:
mse = mean_squared_error(y1_test, y_pred_mlr)
print("Mean Sqaured Error: ", mse)

Mean Sqaured Error:  1833.3886936288714


# Doing with Ridge and Lasso Regression

In [201]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha = 0.4)

In [202]:
lasso.fit(X1_train, y1_train)

In [203]:
y_pred_lasso = lasso.predict(X1_test)

In [204]:
#Actual value and the predicted value
lasso_diff = pd.DataFrame({'Actual value': y1_test, 'Predicted value': y_pred_lasso})
lasso_diff.head()

Unnamed: 0,Actual value,Predicted value
625,100.0,102.959225
4525,85.0,98.190521
1105,145.0,241.423736
8087,45.4,43.250466
5163,80.99,72.012309


In [205]:
#finding mse for this ridge regression
mse_lasso = mean_squared_error(y1_test, y_pred_lasso)
print("Mean Sqaured Error: ", mse_lasso)

Mean Sqaured Error:  1832.9009941979252


# Random Forest 

In [116]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

In [117]:
model = RandomForestRegressor()

In [118]:
model.fit(X1_train, y1_train)

In [119]:
y_pred = model.predict(X1_test)

In [120]:
#evaluate the model's performance
mse = mean_squared_error(y1_test,y_pred)

In [121]:
print(mse)

2150.9951965736363


In [122]:
model_diff = pd.DataFrame({'Actual value': y1_test, 'Predicted value': y_pred})
model_diff.head()

Unnamed: 0,Actual value,Predicted value
625,100.0,80.592734
4525,85.0,123.225357
1105,145.0,166.42
8087,45.4,60.6566
5163,80.99,51.22257


# Saving it in a file

In [114]:
import pickle

In [115]:
with open('linear_regression_model.pkl', 'wb') as file:
    pickle.dump(mlr, file)