### 1. Import thư viện các thư viện cần thiết:

In [351]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

### 2. Tải bộ dữ liệu:

### 3. Đọc bộ dữ liệu:

In [352]:
data_house = pd.read_csv('./data/Housing.csv')

data_house.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


### 4. Xử lý dữ liệu categorical:

In [353]:
categoy_obj_col = data_house.select_dtypes(['object', 'category']).columns

ordinal_encoded = OrdinalEncoder()
encoded_categorical_cols = ordinal_encoded.fit_transform(
    data_house[categoy_obj_col]
).astype(int)

encoded_df = pd.DataFrame(encoded_categorical_cols, columns=categoy_obj_col)
data_house.drop(columns=categoy_obj_col, inplace=True)

data_house = pd.concat([data_house, encoded_df], axis= 1)
data_house

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,furnishingstatus
0,13300000,7420,4,2,3,2,1,0,0,0,1,1,0
1,12250000,8960,4,4,4,3,1,0,0,0,1,0,0
2,12250000,9960,3,2,2,2,1,0,1,0,0,1,1
3,12215000,7500,4,2,2,3,1,0,1,0,1,1,0
4,11410000,7420,4,1,2,2,1,1,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,2,1,0,1,0,0,0,2
541,1767150,2400,3,1,1,0,0,0,0,0,0,0,1
542,1750000,3620,2,1,1,0,1,0,0,0,0,0,2
543,1750000,2910,3,1,1,0,0,0,0,0,0,0,0


### 5. Chuẩn hóa bộ dữ liệu:

In [354]:
# scaler = StandardScaler()
# data_house = scaler.fit_transform(data_house)

In [355]:
data_house.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,furnishingstatus
0,13300000,7420,4,2,3,2,1,0,0,0,1,1,0
1,12250000,8960,4,4,4,3,1,0,0,0,1,0,0
2,12250000,9960,3,2,2,2,1,0,1,0,0,1,1
3,12215000,7500,4,2,2,3,1,0,1,0,1,1,0
4,11410000,7420,4,1,2,2,1,1,1,0,1,0,0


### 6. Tách dữ liệu X, y:

In [356]:
# scaler trả về 1 matrix nên mình cần dùng slicing để xác định X,y
X, y = data_house.iloc[:, [1]], data_house.iloc[:, 0] 


### 7. Chia tập dữ liệu train, val:

In [357]:
x_train, x_test, y_train, y_test = train_test_split(
    X, y, 
    test_size = 0.3,
    random_state = 1,
    shuffle = True
)

### 8. Huấn luyện mô hình: Random Forest, AdaBoost, Gradient Boosting

In [358]:
test_size = 0.3
random_state = 1
is_shuffle = True

cv_split = KFold(n_splits=5, shuffle=True, random_state=42)

#### Random Forest

In [359]:
parameters = {
    "max_features": [3, 4, 5],
    "n_estimators": [100, 300]
}

# Random Forest model
rf_regressor = RandomForestRegressor()
grid_rf = GridSearchCV(estimator=rf_regressor,
                       cv=cv_split, param_grid=parameters)

grid_rf.fit(x_train, y_train)

#### AdaBoost

In [370]:
# AdaBoost
parameters_ada = {
    "n_estimators": [100, 300],
}
ada_bst_regressor = AdaBoostRegressor(
    random_state = random_state
)
grid_ada_bst = GridSearchCV(estimator=ada_bst_regressor,
                       cv=cv_split, param_grid=parameters)
grid_ada_bst.fit(x_train, y_train)

#### Gradient Boosting

In [374]:
parameters_gb = {
    "learning_rate": [0.01, 0.05],
    "n_estimators": [100, 300],
    "max_depth": [3, 5, 7],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

gradient_bst_regressor = GradientBoostingRegressor(
    random_state=random_state
)
grid_gradient_bst = GridSearchCV(estimator=gradient_bst_regressor,
                                 cv=cv_split, param_grid=parameters_gb)
grid_gradient_bst.fit(x_train, y_train)

### 9. Đánh giá mô hình: MAE, MSE

In [362]:
y_pred = grid_rf.predict(x_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print ('Evaluation results on validation set :')
print (f'Mean Absolute Error : {mae}')
print (f'Mean Squared Error : {mse}')

Evaluation results on validation set :
Mean Absolute Error : 1218767.3291907446
Mean Squared Error : 3020597174060.2217


In [373]:
y_pred = grid_ada_bst.predict(x_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print('Evaluation results on validation set :')
print(f'Mean Absolute Error : {mae}')
print(f'Mean Squared Error : {mse}')

Evaluation results on validation set :
Mean Absolute Error : 1143186.5691642133
Mean Squared Error : 2659136603728.2383


In [375]:
y_pred = grid_gradient_bst.predict(x_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print('Evaluation results on validation set :')
print(f'Mean Absolute Error : {mae}')
print(f'Mean Squared Error : {mse}')

Evaluation results on validation set :
Mean Absolute Error : 1128777.1885229684
Mean Squared Error : 2575292596119.415


10. A : Bagging - Bootstrap Aggregating