#### Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn import metrics

from joblib import dump, load

%matplotlib inline

#### Checkout the data

In [4]:
df = pd.read_csv('Real_estate.csv')

print(df.head())
print(df.info())

print("The Dataset has",df.shape[0],"Rows")

print("The Dataset has",df.shape[1],"Columns")

   No  X1 transaction date  X2 house age  \
0   1             2012.917          32.0   
1   2             2012.917          19.5   
2   3             2013.583          13.3   
3   4             2013.500          13.3   
4   5             2012.833           5.0   

   X3 distance to the nearest MRT station  X4 number of convenience stores  \
0                                84.87882                               10   
1                               306.59470                                9   
2                               561.98450                                5   
3                               561.98450                                5   
4                               390.56840                                5   

   X5 latitude  X6 longitude  Y house price of unit area  
0     24.98298     121.54024                        37.9  
1     24.98034     121.53951                        42.2  
2     24.98746     121.54391                        47.3  
3     24.98746     121.54391  

In [5]:
df.corr()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
No,1.0,-0.048658,-0.032808,-0.013573,-0.012699,-0.01011,-0.011059,-0.028587
X1 transaction date,-0.048658,1.0,0.017549,0.06088,0.009635,0.035058,-0.041082,0.087491
X2 house age,-0.032808,0.017549,1.0,0.025622,0.049593,0.05442,-0.04852,-0.210567
X3 distance to the nearest MRT station,-0.013573,0.06088,0.025622,1.0,-0.602519,-0.591067,-0.806317,-0.673613
X4 number of convenience stores,-0.012699,0.009635,0.049593,-0.602519,1.0,0.444143,0.449099,0.571005
X5 latitude,-0.01011,0.035058,0.05442,-0.591067,0.444143,1.0,0.412924,0.546307
X6 longitude,-0.011059,-0.041082,-0.04852,-0.806317,0.449099,0.412924,1.0,0.523287
Y house price of unit area,-0.028587,0.087491,-0.210567,-0.673613,0.571005,0.546307,0.523287,1.0


#### Định nghĩa X và y

In [6]:
X=df.drop('Y house price of unit area', axis=1)

y=df['Y house price of unit area']

#### Preprocessing

In [7]:
polynomial_converter=PolynomialFeatures(degree=2, include_bias=False)

In [8]:
poly_features=polynomial_converter.fit(X)

In [9]:
poly_features=polynomial_converter.transform(X)

In [10]:
poly_features.shape

(414, 35)

In [11]:
X.shape

(414, 7)

#### Train Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    poly_features, y, test_size=0.3, random_state=101)

#### Training a Polynomial Regression Model

In [13]:
polymodel=LinearRegression()

In [14]:
polymodel.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


#### Predicting Test Data

In [15]:
y_pred=polymodel.predict(X_test)

In [16]:
pd.DataFrame({'Y_Test': y_test,'Y_Pred':y_pred, 'Residuals':(y_test-y_pred) }).head

<bound method NDFrame.head of      Y_Test     Y_Pred  Residuals
176    19.2  16.682017   2.517983
347    11.2   5.055974   6.144026
307    24.7  20.453663   4.246337
299    46.1  47.809925  -1.709925
391    31.3  28.199328   3.100672
..      ...        ...        ...
16     70.1  62.142490   7.957510
31     25.0  36.275178 -11.275178
36     22.9  22.670020   0.229980
41     18.2  20.074401  -1.874401
24     38.8  38.557545   0.242455

[125 rows x 3 columns]>

#### Evaluating the Model

In [17]:
MAE_Poly = metrics.mean_absolute_error(y_test,y_pred)
MSE_Poly = metrics.mean_squared_error(y_test,y_pred)
RMSE_Poly = np.sqrt(MSE_Poly)

pd.DataFrame([MAE_Poly, MSE_Poly, RMSE_Poly],
             index=['MAE', 'MSE', 'RMSE'], columns=['metrics'])

Unnamed: 0,metrics
MAE,4.465777
MSE,31.56827
RMSE,5.618565


#### Compare to the simple linear regression

In [18]:
XS_train, XS_test, ys_train, ys_test = train_test_split(X, y, test_size=0.3, random_state=101)
simplemodel=LinearRegression()
simplemodel.fit(XS_train, ys_train)
ys_pred=simplemodel.predict(XS_test)

MAE_simple = metrics.mean_absolute_error(ys_test,ys_pred)
MSE_simple = metrics.mean_squared_error(ys_test,ys_pred)
RMSE_simple = np.sqrt(MSE_simple)

In [19]:
pd.DataFrame({'Poly Metrics': [MAE_Poly, MSE_Poly, RMSE_Poly], 'Simple Metrics':[MAE_simple, MSE_simple,
                                                                                 RMSE_simple]}, index=['MAE', 'MSE', 'RMSE'])

Unnamed: 0,Poly Metrics,Simple Metrics
MAE,4.465777,5.373025
MSE,31.56827,45.880307
RMSE,5.618565,6.7735
