In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics

In [62]:
# Read the real-estate dataset from the CSV that sits next to the notebook.
houses = pd.read_csv('Real estate.csv')
# Preview five randomly chosen rows (unseeded, so the rows differ between runs).
houses.sample(n=5)


Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
247,248,2013.333,21.7,1055.067,0,24.96211,121.54928,23.1
236,237,2013.167,3.6,373.8389,10,24.98322,121.53765,61.9
150,151,2013.25,35.8,170.7311,7,24.96719,121.54269,48.5
176,177,2012.833,13.9,4573.779,0,24.94867,121.49507,19.2
307,308,2012.833,10.3,3079.89,0,24.9546,121.56627,24.7


In [63]:
# Count missing values per column (`isna` is the canonical name; `isnull` is its alias).
houses.isna().sum()

No                                        0
X1 transaction date                       0
X2 house age                              0
X3 distance to the nearest MRT station    0
X4 number of convenience stores           0
X5 latitude                               0
X6 longitude                              0
Y house price of unit area                0
dtype: int64

In [64]:
# Create the linear-regression estimator with default settings; it is fitted in later cells.
lr = LinearRegression()

In [65]:
# Hold out 30% of the rows for testing.
# Features are every column except the last; the target is the last column.
# NOTE(review): the first feature column, `No`, looks like a sequential row
# number rather than a real predictor -- consider dropping it before modelling.
x_train, x_test, y_train, y_test = train_test_split(
    houses.iloc[:, :-1],
    houses.iloc[:, -1],
    test_size=0.3,
    # Integer seed for a reproducible split. The original passed `False`,
    # which only worked because bool is coerced to the integer 0.
    random_state=0,
)

In [66]:
# Sanity-check the split sizes: (rows, columns) for features, (rows,) for targets.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((289, 7), (125, 7), (289,), (125,))

In [68]:
# Preview the standardised training features with their column labels restored.
# The scaling is computed here rather than read from `x_train_scaled`, which is
# only assigned in a later cell; this keeps the cell working on a fresh kernel.
pd.DataFrame(StandardScaler().fit_transform(x_train), columns=x_train.columns)

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude
0,0.718968,1.242935,0.799165,-0.607432,0.671889,0.822989,0.542723
1,-0.957715,0.948458,-0.968263,-0.800623,1.701089,0.411624,0.654020
2,1.421819,0.650434,-1.189192,-0.832782,1.358023,-0.834960,0.291322
3,-1.016992,1.242935,0.498702,2.240260,-1.386513,-1.656909,-2.008608
4,0.202414,0.650434,1.974505,-0.609134,1.701089,0.866702,0.406547
...,...,...,...,...,...,...,...
284,0.964542,0.948458,0.993582,-0.716529,0.671889,0.566179,0.736512
285,-0.144778,0.061481,2.336827,-0.826532,1.014956,-0.121511,0.496240
286,-0.779885,-0.531020,-0.331989,2.436867,-1.386513,-2.357869,-1.916951
287,-1.372652,1.537411,1.638693,-0.366832,-0.357312,0.513100,0.264479


In [69]:
# Show the raw (unscaled) training features produced by the train/test split.
x_train

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude
294,295,2013.500,26.4,335.52730,6,24.97960,121.54140
96,97,2013.417,6.4,90.45606,9,24.97433,121.54310
377,378,2013.333,3.9,49.66105,8,24.95836,121.53756
89,90,2013.500,23.0,3947.94500,0,24.94783,121.50243
233,234,2013.333,39.7,333.36790,9,24.98016,121.53932
...,...,...,...,...,...,...,...
323,324,2013.417,28.6,197.13380,6,24.97631,121.54436
192,193,2013.167,43.8,57.58945,7,24.96750,121.54069
117,118,2013.000,13.6,4197.34900,0,24.93885,121.50383
47,48,2013.583,35.9,640.73910,3,24.97563,121.53715


In [73]:
# Fit the regression on the raw (unscaled) training features.
lr.fit(X=x_train, y=y_train)

In [74]:
# Predict the target for the held-out rows using the raw test features.
y_predicted = lr.predict(X=x_test)

In [75]:


# Score the current test-set predictions: mean absolute error,
# mean squared error and the R^2 coefficient of determination.
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_predicted)
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_predicted)
r2 = metrics.r2_score(y_true=y_test, y_pred=y_predicted)

# Print the report in one call; `sep` puts each entry on its own line.
print(
    "The model performance for testing set",
    "--------------------------------------",
    'MAE is {}'.format(mae),
    'MSE is {}'.format(mse),
    'R2 score is {}'.format(r2),
    sep="\n",
)

The model performance for testing set
--------------------------------------
MAE is 6.1163460929116615
MSE is 73.10682904135055
R2 score is 0.5710302953097175


In [76]:
# Learn the per-column scaling statistics from the training split only,
# then apply that same fitted transform to both splits so the test rows
# do not influence the scaling.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [77]:
# Refit the same estimator on the standardised training features.
# This replaces the coefficients learned from the raw features.
lr.fit(X=x_train_scaled, y=y_train)

In [81]:
# Predict on the standardised test features; this overwrites the earlier `y_predicted`.
y_predicted = lr.predict(X=x_test_scaled)

In [82]:


# Re-score the test set with whatever `y_predicted` currently holds:
# mean absolute error, mean squared error and R^2.
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=y_predicted)
mse = metrics.mean_squared_error(y_true=y_test, y_pred=y_predicted)
r2 = metrics.r2_score(y_true=y_test, y_pred=y_predicted)

# Print the report in one call; `sep` puts each entry on its own line.
print(
    "The model performance for testing set",
    "--------------------------------------",
    'MAE is {}'.format(mae),
    'MSE is {}'.format(mse),
    'R2 score is {}'.format(r2),
    sep="\n",
)

The model performance for testing set
--------------------------------------
MAE is 6.116346092911044
MSE is 73.10682904134829
R2 score is 0.5710302953097308


In [83]:
# Display the predictions currently stored in y_predicted.
y_predicted

array([39.94292224, 13.18485718, 42.04732032, 11.86760772, 42.63287394,
       38.89551748, 40.99138464, 35.80458122, 50.97836212, 45.87299229,
       45.37340388, 42.0888666 , 39.0139959 , 40.16548187, 46.59014898,
       39.42417847, 39.70359128, 42.49295231, 42.58874763, 42.47939411,
       48.95584591, 32.81712211, 39.00042724, 47.02923203, 47.14603912,
       43.10368488, 46.51362552, 26.19190059, 47.52448319, 21.14931617,
       44.19956621, 35.44660237, 44.45246336, 40.17013801, 43.6170064 ,
       31.83625187, 46.84006051, 40.80230864, 53.56831243,  6.98328976,
       51.72824206, 34.44853365, 33.81714307, 48.39341082, 14.95719662,
       44.12067009, 42.31205951, 11.96983065, 35.5242225 , 48.0801493 ,
       52.46920202, 40.11833064, 49.20711533, 25.23017992, 31.67810591,
       33.8762666 , 48.4493841 , 41.42949488, 43.50820576, 33.30331109,
       43.91219907, 44.55600001, 40.56570575, 47.89460143, 37.4677299 ,
       30.42166848, 10.67812145, 36.31252029, 47.96173189, 40.83