In [63]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

## 1. Data Preparation

In [64]:
# Read clean_data.csv (path is relative to the notebook's working directory)
# into data_og, the frame every later preparation step starts from.
data_og = pd.read_csv('clean_data.csv')

In [65]:
data_og.head()

Unnamed: 0,price,area,status,bhk,bathroom,age,location,builder
0,37.49,872,Ready to move,2,,1.0,Sembakkam,MP Developers
1,93.54,1346,Under Construction,3,2.0,,Selaiyur,DAC Promoters
2,151.0,2225,Under Construction,3,,0.0,Mogappair,Casagrand Builder Private Limited
3,49.0,1028,Ready to move,2,2.0,3.0,Ambattur,Dugar Housing Builders
4,42.28,588,Under Construction,2,1.0,0.0,Pallavaram,Radiance Realty Developers India Ltd


In [66]:
data_og.dtypes

price       float64
area          int64
status       object
bhk           int64
bathroom    float64
age         float64
location     object
builder      object
dtype: object

In [67]:
# NOTE: plain assignment does not copy. `data` and `data_og` are two names for
# the same DataFrame, so the column edits made through `data` in the following
# cells also modify `data_og`. The later pd.get_dummies(data_og, ...) call
# relies on this aliasing to pick up those edits.
data = data_og

In [68]:
# Number of rows per builder; value_counts() sorts most frequent first and
# skips missing values by default.
builder_data = data['builder'].value_counts()

In [69]:
builder_data

builder
seller                               484
MC Foundation                        232
Appaswamy Real Estate                109
Propsource Realty Private Limited     79
Radiance Realty Developers            62
                                    ... 
Prop Mart Technologies                 1
S Suresh Kumar                         1
Yadhav constructions real estates      1
MAXWORTH PROPERTIES                    1
Evrostos Properties                    1
Name: count, Length: 135, dtype: int64

In [70]:
# Builders that appear in at most 10 rows; the result is still indexed by
# builder name, which the grouping step in the next cells depends on.
remove_builder = builder_data.loc[builder_data.le(10)]

In [71]:
remove_builder

builder
SP Homes Pvt Ltd                     10
Prasanna                             10
EK Realtors                          10
Baashyaam Group                       9
GJ ESTATES                            9
                                     ..
Prop Mart Technologies                1
S Suresh Kumar                        1
Yadhav constructions real estates     1
MAXWORTH PROPERTIES                   1
Evrostos Properties                   1
Name: count, Length: 79, dtype: int64

In [72]:
# Collapse every rare builder (those listed in remove_builder) into a single
# 'other' category.
# The builder names live in remove_builder's *index* (its values are counts),
# so membership is tested against the index explicitly. This is the vectorised
# equivalent of the previous per-row `x in remove_builder` lambda, which relied
# on `in` checking a Series' index rather than its values.
# Missing builders are not in the index and therefore stay missing.
# Because `data` is an alias of `data_og`, this also updates `data_og`.
data['builder'] = data['builder'].mask(data['builder'].isin(remove_builder.index), 'other')

In [73]:
data['builder'].value_counts().head()

builder
seller                               484
other                                359
MC Foundation                        232
Appaswamy Real Estate                109
Propsource Realty Private Limited     79
Name: count, dtype: int64

In [74]:
# Number of rows per location, most frequent first.
location_data = data['location'].value_counts()

# Locations that appear in fewer than 10 rows.
# NOTE(review): builders are grouped with `<= 10` but locations with `< 10`;
# confirm the different cut-offs are intentional.
remove_location = location_data[location_data < 10]

# Collapse the rare locations into a single 'other' category. The location
# names are remove_location's index (its values are counts), so membership is
# tested against the index explicitly instead of through a per-row
# `x in remove_location` lambda. Missing locations stay missing.
data['location'] = data['location'].mask(data['location'].isin(remove_location.index), 'other')

# Show the resulting category sizes.
data['location'].value_counts()

location
other                         398
Veppampattu                   149
Pammal                        139
Medavakkam                    111
Sholinganallur                 99
                             ... 
Alwarpet                       11
Vengaivasal                    11
Urapakkam                      10
Thandalam                      10
Ambattur INDUSTRIAL ESTATE     10
Name: count, Length: 63, dtype: int64

In [75]:
# One-hot encode the object-typed columns (status, location, builder);
# drop_first=True omits the first level of each so the dummies are not redundant.
# data_og already carries the 'other' groupings because `data` was an alias of
# it when those edits were made. From this line on, `data` is rebound to a new
# frame and no longer shares state with data_og.
data = pd.get_dummies(data_og, drop_first = True)
# Preview the first rows of the encoded frame.
data.head()

Unnamed: 0,price,area,bhk,bathroom,age,status_Under Construction,location_Alwarpet,location_Ambattur,location_Ambattur INDUSTRIAL ESTATE,location_Anna Nagar,...,builder_The Nest Builder,builder_Urban Tree Infrastructures,builder_VGK Builders Pvt Ltd,builder_VNR Homes,builder_Velan Housing Properties,builder_Vinay Asrani,builder_other,builder_seller,builder_smartassetsindia,builder_viswaraj
0,37.49,872,2,,1.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,93.54,1346,3,2.0,,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,151.0,2225,3,,0.0,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,49.0,1028,2,2.0,3.0,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
4,42.28,588,2,1.0,0.0,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [76]:
data.isna().sum()

price                          0
area                           0
bhk                            0
bathroom                    1217
age                          891
                            ... 
builder_Vinay Asrani           0
builder_other                  0
builder_seller                 0
builder_smartassetsindia       0
builder_viswaraj               0
Length: 124, dtype: int64

In [77]:
from sklearn.impute import SimpleImputer

# Imputer that replaces every missing value with the constant 0.
# NOTE(review): the isna() summary above reports missing values in bathroom
# (1217) and age (891), so after imputation a 0 in those columns means
# "unknown" rather than a true zero. Confirm this is the intended encoding.
imputer = SimpleImputer(strategy='constant',fill_value=0)

In [78]:
# Fill the missing values and rebuild a labelled DataFrame.
# fit_transform returns an unlabelled array by default, so the column names
# and the row index are both re-attached. Passing index=data.index keeps each
# row's original label even if rows were filtered before this step; without it
# the frame would be silently renumbered from 0.
# Every column comes back as float, including the boolean dummy columns.
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns, index=data.index)

In [79]:
data.isna().sum()

price                       0
area                        0
bhk                         0
bathroom                    0
age                         0
                           ..
builder_Vinay Asrani        0
builder_other               0
builder_seller              0
builder_smartassetsindia    0
builder_viswaraj            0
Length: 124, dtype: int64

In [80]:
from scipy import stats

# Number of standard deviations beyond which a price counts as an outlier.
threshold = 3

# Standardised price of every row (mean 0, unit variance).
z_score = stats.zscore(data['price'])

# Keep only rows whose price z-score lies strictly inside (-threshold, threshold).
data = data[np.abs(z_score) < threshold]

In [81]:
data

Unnamed: 0,price,area,bhk,bathroom,age,status_Under Construction,location_Alwarpet,location_Ambattur,location_Ambattur INDUSTRIAL ESTATE,location_Anna Nagar,...,builder_The Nest Builder,builder_Urban Tree Infrastructures,builder_VGK Builders Pvt Ltd,builder_VNR Homes,builder_Velan Housing Properties,builder_Vinay Asrani,builder_other,builder_seller,builder_smartassetsindia,builder_viswaraj
0,37.49,872.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,93.54,1346.0,3.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,151.00,2225.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,49.00,1028.0,2.0,2.0,3.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,42.28,588.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2615,27.50,750.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2616,27.50,750.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2617,30.00,800.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2618,31.50,850.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## 2. EDA

In [82]:
# NOTE(review): this pairplot is disabled. `data` has 124 columns after one-hot
# encoding, so plotting every pair is presumably impractical. Confirm, and
# consider restricting it to the few numeric columns if it is re-enabled.
#plt.figure(figsize=(8,6))
#sns.pairplot(data)
#plt.show()

In [83]:
# NOTE(review): this correlation heatmap is disabled. With annot=True over all
# of `data`'s columns the cell annotations would presumably be unreadable;
# confirm before re-enabling.
#correlation = data.corr()
#sns.heatmap(correlation, annot=True)


## 3. Model Selection

In [84]:
from sklearn.linear_model import RANSACRegressor

In [85]:
# Linear regression estimator with scikit-learn's default settings; it is
# fitted in the Training section below.
model = LinearRegression()

## 4. Training

In [86]:
# Features are every column except the target; the target is the price column.
X = data.drop(columns='price')
y = data['price']

# Shuffled 70/30 train/test split. random_state is fixed so the split, and
# therefore every metric reported below, is identical on each
# Restart-and-Run-All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)

In [87]:
# Fit the regression on the training split only; the test split is kept aside
# for the evaluation in the Testing section.
model.fit(X_train, y_train)

## 5. Testing

In [88]:
X

Unnamed: 0,area,bhk,bathroom,age,status_Under Construction,location_Alwarpet,location_Ambattur,location_Ambattur INDUSTRIAL ESTATE,location_Anna Nagar,location_Ayanambakkam,...,builder_The Nest Builder,builder_Urban Tree Infrastructures,builder_VGK Builders Pvt Ltd,builder_VNR Homes,builder_Velan Housing Properties,builder_Vinay Asrani,builder_other,builder_seller,builder_smartassetsindia,builder_viswaraj
0,872.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1346.0,3.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2225.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1028.0,2.0,2.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,588.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2615,750.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2616,750.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2617,800.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2618,850.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [89]:
y

0        37.49
1        93.54
2       151.00
3        49.00
4        42.28
         ...  
2615     27.50
2616     27.50
2617     30.00
2618     31.50
2619     34.00
Name: price, Length: 2563, dtype: float64

In [90]:
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

# Predictions on the training split (suffix _1) and on the held-out test
# split (suffix _2).
y_preds_1, y_preds_2 = model.predict(X_train), model.predict(X_test)

# Mean absolute error, mean squared error and R^2 on the training split.
MAE_1, MSE_1, r2_1 = (
    mean_absolute_error(y_train, y_preds_1),
    mean_squared_error(y_train, y_preds_1),
    r2_score(y_train, y_preds_1),
)

# The same three metrics on the test split.
MAE_2, MSE_2, r2_2 = (
    mean_absolute_error(y_test, y_preds_2),
    mean_squared_error(y_test, y_preds_2),
    r2_score(y_test, y_preds_2),
)

# Report both splits in the same layout so they can be compared line by line.
print(f"Train Data: \nMAE: {MAE_1} ||  MSE: {MSE_1} || r2 score: {r2_1}")
print(f"Test Data: \nMAE: {MAE_2} ||  MSE: {MSE_2} || r2 score: {r2_2}")

Train Data: 
MAE: 13.491147405624005 ||  MSE: 588.6307474557208 || r2 score: 0.8827046582219437
Test Data: 
MAE: 12.854498705630204 ||  MSE: 545.3135933542557 || r2 score: 0.912481550717495


In [91]:
X_test

Unnamed: 0,area,bhk,bathroom,age,status_Under Construction,location_Alwarpet,location_Ambattur,location_Ambattur INDUSTRIAL ESTATE,location_Anna Nagar,location_Ayanambakkam,...,builder_The Nest Builder,builder_Urban Tree Infrastructures,builder_VGK Builders Pvt Ltd,builder_VNR Homes,builder_Velan Housing Properties,builder_Vinay Asrani,builder_other,builder_seller,builder_smartassetsindia,builder_viswaraj
1988,655.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
756,1154.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2009,1030.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
945,1353.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1540,1027.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1251,891.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1323,2133.0,3.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1070,1242.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1656,960.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## 6. Save

In [92]:
import pickle
# Persist the fitted estimator to reg_model.pkl in the current working
# directory; the `with` block closes the file even if pickling fails.
# NOTE: unpickling can execute arbitrary code, so only load this file from a
# trusted location.
with open('reg_model.pkl', 'wb') as file:
    pickle.dump(model, file)

## 7. Deploy