In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

<h1 style='color:blue;'><b>TRAIN AND TEST DATASET PREPARATION</b></h1>

In [2]:
df = pd.read_csv('cleaned_house_price.csv', index_col=0)
df.head(2)

Unnamed: 0,Gia,Nha_ve_sinh,So_phong,Huong,Chieu_dai,Rong,Giay_to,Dien_tich,Dien_tich_su_dung,Hem_rong,...,Gan_cho,Gan_truong,Gan_congvien,Gan_mat_tien,Di_chuyen,Thoang,an_ninh,2_duong_chinh,2_hem,binned
0,3700,4,3,Đ.Bắc,20.71,5.0,Sổ hồng,104.5,134.7,5.0,...,1,1,1,1,1,1,1,0,0,"(900.0, 4000.0]"
1,3800,2,2,Đ.Bắc,14.35,5.0,Sổ hồng,71.8,137.6,5.0,...,1,1,1,0,1,1,1,0,0,"(900.0, 4000.0]"


In [3]:
y = df['Gia']
X = df[[i for i in df.columns if i != 'Gia']]

In [4]:
categories = [i for i in X.columns if X[i].dtype =='object']
categories

['Huong', 'Giay_to', 'Duong', 'Phuong', 'Quan', 'binned']

In [5]:
# Features
X = pd.get_dummies(X, columns=categories)
X.head(2)

Unnamed: 0,Nha_ve_sinh,So_phong,Chieu_dai,Rong,Dien_tich,Dien_tich_su_dung,Hem_rong,Duong_mat_tien,So_lau,Tang_thuong,...,Quan_Quận Tân Phú,"binned_(10000, 20000]","binned_(10000.0, 20000.0]","binned_(20000, 50000]","binned_(20000.0, 50000.0]","binned_(4000, 10000]","binned_(4000.0, 10000.0]","binned_(50000.0, 101000.0]","binned_(900, 4000]","binned_(900.0, 4000.0]"
0,4,3,20.71,5.0,104.5,134.7,5.0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,2,2,14.35,5.0,71.8,137.6,5.0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
import numpy as np
X_log, X_scale = X, X
y_log, y_scale = y, y

In [34]:
X_log = np.log(X_log)

  """Entry point for launching an IPython kernel.
  """Entry point for launching an IPython kernel.


In [7]:
y_log = np.log(y)

In [9]:
num = ['Chieu_dai', 'Rong', 'Dien_tich', 'Dien_tich_su_dung']
num

['Chieu_dai', 'Rong', 'Dien_tich', 'Dien_tich_su_dung']

In [10]:
rb = RobustScaler()
rb.fit(X_scale[num])
X_scale[num] = rb.transform(X[num])

In [13]:
X_scale = X_scale[['Dien_tich', 'Dien_tich_su_dung', 'Nha_ve_sinh', 'Hem_rong', 'So_lau',
       'Chieu_dai', 'So_phong', 'Quan_Huyện Nhà Bè', 'Rong', 'Duong_mat_tien',
       'Quan_Quận 10']]

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_scale_train, X_scale_test, Y_log_train, Y_log_test =train_test_split(X_scale, y_log, test_size = 0.3, random_state = 42)

In [35]:
X_log_train, X_log_test, Y_log_train, Y_log_test =train_test_split(X_scale, y_log, test_size = 0.3, random_state = 42)

<h1 style='color:blue;'><b>MODELLING</b></h1>

In [18]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

In [29]:
import datetime

### Choose best model

In [31]:
# Chọn model tốt nhất
models = [
    LinearRegression(),
    RidgeCV(),
    LassoCV(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    RandomForestRegressor(n_estimators=200),
    SVR(kernel='linear'),
    XGBRegressor(),
    GaussianProcessRegressor()]

CV = 10
entries=[]
for model in models:
    scores_train = []
    scores_test = []
    abs_scores = []
    time_scores = []
    for j in range(CV):
        t1 = datetime.datetime.now()
        model.fit(X_scale_train, Y_log_train)
        t2 = datetime.datetime.now()
        
        model_name = model.__class__.__name__
        s_train = model.score(X_scale_train, Y_log_train)
        s_test = model.score(X_scale_test, Y_log_test)
        scores_train.append(s_train)
        scores_test.append(s_test)
        abs_scores.append(abs(s_train-s_test))
        time_scores.append(round(((t2-t1).microseconds/1000),1))
    entries.append([model_name, np.array(scores_train).mean(), 
                    np.array(scores_test).mean(), np.array(abs_scores).mean(), 
                    np.array(time_scores).mean()])

cv_df=pd.DataFrame(entries, columns=['model_name', 'scores_train_mean', 'scores_test_mean', 'abs_mean', 'time_mean'])

In [32]:
cv_df

Unnamed: 0,model_name,scores_train_mean,scores_test_mean,abs_mean,time_mean
0,LinearRegression,0.675776,0.644783,0.030993,1.9
1,RidgeCV,0.675773,0.644812,0.030961,3.25
2,LassoCV,0.67556,0.644988,0.030572,45.46
3,KNeighborsRegressor,0.746485,0.619597,0.126888,9.51
4,DecisionTreeRegressor,0.999997,0.44783,0.552168,12.3
5,RandomForestRegressor,0.958709,0.710423,0.248285,750.62
6,RandomForestRegressor,0.959655,0.712931,0.246724,528.81
7,SVR,0.672175,0.63451,0.037665,689.46
8,XGBRegressor,0.961824,0.682292,0.279532,192.97
9,GaussianProcessRegressor,0.999997,-166.762857,167.762855,515.9


In [36]:
# Chọn model tốt nhất
models = [
    LinearRegression(),
    RidgeCV(),
    LassoCV(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    RandomForestRegressor(n_estimators=200),
    SVR(kernel='linear'),
    XGBRegressor(),
    GaussianProcessRegressor()]

CV = 10
entries=[]
for model in models:
    scores_train = []
    scores_test = []
    abs_scores = []
    time_scores = []
    for j in range(CV):
        t1 = datetime.datetime.now()
        model.fit(X_scale_train, Y_log_train)
        t2 = datetime.datetime.now()
        
        model_name = model.__class__.__name__
        s_train = model.score(X_log_train, Y_log_train)
        s_test = model.score(X_log_test, Y_log_test)
        scores_train.append(s_train)
        scores_test.append(s_test)
        abs_scores.append(abs(s_train-s_test))
        time_scores.append(round(((t2-t1).microseconds/1000),1))
    entries.append([model_name, np.array(scores_train).mean(), 
                    np.array(scores_test).mean(), np.array(abs_scores).mean(), 
                    np.array(time_scores).mean()])

cv_df=pd.DataFrame(entries, columns=['model_name', 'scores_train_mean', 'scores_test_mean', 'abs_mean', 'time_mean'])

In [38]:
cv_df

Unnamed: 0,model_name,scores_train_mean,scores_test_mean,abs_mean,time_mean
0,LinearRegression,0.675776,0.644783,0.030993,2.29
1,RidgeCV,0.675773,0.644812,0.030961,5.9
2,LassoCV,0.67556,0.644988,0.030572,46.23
3,KNeighborsRegressor,0.746485,0.619597,0.126888,5.7
4,DecisionTreeRegressor,0.999997,0.453031,0.546966,11.8
5,RandomForestRegressor,0.958892,0.710057,0.248834,765.04
6,RandomForestRegressor,0.959559,0.713107,0.246452,528.67
7,SVR,0.672175,0.63451,0.037665,634.37
8,XGBRegressor,0.961824,0.682292,0.279532,193.95
9,GaussianProcessRegressor,0.999997,-166.762857,167.762855,494.01


### Use gridsearch and pipeline

In [None]:
# def Poly_reg

In [41]:
parameters = {'n_estimators': np.arange(50,400,50),
              'max_depth': np.arange(1,5)}
random_forest_reg = RandomForestRegressor(random_state=42)
rg  = GridSearchCV(random_forest_reg, parameters)
rg.fit(X_scale_train, Y_log_train)

GridSearchCV(estimator=RandomForestRegressor(random_state=42),
             param_grid={'max_depth': array([1, 2, 3, 4]),
                         'n_estimators': array([ 50, 100, 150, 200, 250, 300, 350])})

In [42]:
print('best param:' ,rg.best_params_)
print('best estimator:', rg.best_estimator_)
print('best score:', rg.best_score_)

best param: {'max_depth': 4, 'n_estimators': 150}
best estimator: RandomForestRegressor(max_depth=4, n_estimators=150, random_state=42)
best score: 0.6297594986397943


In [53]:
model = RandomForestRegressor()

In [55]:
from sklearn.metrics import mean_squared_error
model.fit(X_scale_train, Y_log_train)
y_pred = model.predict(X_scale_test)
acc = mean_squared_error(Y_log_test, y_pred)
print(acc)
print(model.score(X_scale_train, Y_log_train))
print(model.score(X_scale_test, Y_log_test))
# print(model.score(X_log, y_log))

0.13140142200802743
0.9597853511582242
0.7113346311575226


In [57]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False)
regress = Ridge()

cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=42)

poly_model = Pipeline(steps=[('poly', poly),('regress', regress)])

param_grid = {'poly__degree': np.arange(10), 'regress__alpha':np.logspace(-2,2,5)}

poly_grid = GridSearchCV(poly_model, param_grid, cv=cv,scoring='explained_variance' )
poly_grid.fit(X_log_train, Y_log_train)

poly_grid.best_params_

Traceback (most recent call last):
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\pipeline.py", line 307, in _fit
    **fit_params_steps[name])
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python36\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Admin\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\base.py", line 

{'poly__degree': 2, 'regress__alpha': 0.01}

In [66]:
lm = LinearRegression()
lm.fit(X_log_train, Y_log_train)
print(lm.score(X_log_train, Y_log_train))
print(lm.score(X_log_test, Y_log_test))

0.6757755606355451
0.6447825385366732


In [68]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_log_train)
X_test_poly = poly.fit_transform(X_log_test)

pm = LinearRegression()
pm.fit(X_train_poly, Y_log_train)

LinearRegression()

In [69]:
poly.get_params

<bound method BaseEstimator.get_params of PolynomialFeatures(include_bias=False)>

In [72]:
pm.score(X_train_poly, Y_log_train)

0.7303889755300451

In [73]:
pm.score(X_test_poly, Y_log_test)

0.6061844986468856

<h1 style='color:blue;'><b>CONCLUSION</b></h1>

We try 3 approaches:
- Run all possible models to compare scores and duration of perfromance. In this experiment, we can see Random Forest models is the best choice with highest scores 0.96 on training set and 0.71 on test set.
- But above experiment have it weakness. It's lack of Polynomial models, linear regression and Ridge have just try in degree of 1. So we try Polynomial models and tuning various alpha and degree. Then, the best params return is degree of 2 and alpha of 0.01. When we apply Linear model with these params, the result is also not good enough compare to Random Forest.
- Finally, We try GridSearch for Random Forest to tuning numbers of estimators and max_depth, the result recommend we use n_estimators of 150 and max_depth of 5. But we try to apply model again and found out that the default Random Forest model is the best choice.