In [23]:
# Import the required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [3]:
# Download the dataset.
# gdown fetches a Google Drive file by its file ID; the recorded output of this
# cell shows the file being saved as /content/Housing.csv.
# NOTE(review): requires network access and the `gdown` CLI — presumably a
# Colab runtime, given the /content path; confirm before running elsewhere.

!gdown 1qeJqFtRdjjHqExbWJcgKy0yJbczTTAE3

Downloading...
From: https://drive.google.com/uc?id=1qeJqFtRdjjHqExbWJcgKy0yJbczTTAE3
To: /content/Housing.csv
  0% 0.00/30.0k [00:00<?, ?B/s]100% 30.0k/30.0k [00:00<00:00, 40.0MB/s]


In [15]:
# Load the housing data into a DataFrame and preview the first rows.
# The path is relative to the working directory; it is expected to be the
# Housing.csv file fetched by the gdown step.

dataset_path = './Housing.csv'
df = pd.read_csv(filepath_or_buffer=dataset_path)
df.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [9]:
# Handle the categorical data
## Find the columns stored with the `object` dtype (the text-valued columns,
## such as the yes/no flags and the furnishing status).

categorical_cols = list(df.select_dtypes(include='object').columns)
print(categorical_cols)

['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']


In [14]:
## Convert the categorical columns to numeric codes.
# OrdinalEncoder replaces each category of a column with a float code; the
# preview below shows e.g. no -> 0.0 / yes -> 1.0 and
# furnished -> 0.0 / semi-furnished -> 1.0.

ordinal_encoder = OrdinalEncoder()

encoded_categorical_cols = ordinal_encoder.fit_transform(
    df[categorical_cols]
)

# The encoder output is wrapped back into a DataFrame. Re-attach df's index
# explicitly: pd.concat(axis=1) aligns rows on index labels, so a fresh
# 0..n-1 index would mis-align (and introduce NaN rows) if df ever carries a
# non-default index, e.g. after rows were filtered out upstream.
encoded_categorical_df = pd.DataFrame(
    encoded_categorical_cols,
    columns=categorical_cols,
    index=df.index,
)

numerical_df = df.drop(columns=categorical_cols)

# Numeric columns first, encoded columns after: this keeps `price` as the
# first column, which the later X/y split reads as the target.
encoded_df = pd.concat(
    [numerical_df, encoded_categorical_df], axis=1
)
encoded_df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,furnishingstatus
0,13300000,7420,4,2,3,2,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,12250000,8960,4,4,4,3,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,12250000,9960,3,2,2,2,1.0,0.0,1.0,0.0,0.0,1.0,1.0
3,12215000,7500,4,2,2,3,1.0,0.0,1.0,0.0,1.0,1.0,0.0
4,11410000,7420,4,1,2,2,1.0,1.0,1.0,0.0,1.0,0.0,0.0


In [17]:
# Standardise the dataset: StandardScaler rescales every column to zero mean
# and unit variance (its default settings).
# NOTE(review): the scaler is fitted on all rows of encoded_df, including the
# `price` target and the rows that later form the validation split. Validation
# statistics therefore leak into preprocessing, and the reported errors are in
# standardised units rather than price units. Consider fitting on the training
# rows only and leaving the target unscaled.

normalizer = StandardScaler()
normalizer.fit(encoded_df)
dataset_arr = normalizer.transform(encoded_df)
dataset_arr

array([[ 4.56636513,  1.04672629,  1.40341936, ...,  1.4726183 ,
         1.80494113, -1.40628573],
       [ 4.00448405,  1.75700953,  1.40341936, ...,  1.4726183 ,
        -0.55403469, -1.40628573],
       [ 4.00448405,  2.21823241,  0.04727831, ..., -0.67906259,
         1.80494113, -0.09166185],
       ...,
       [-1.61432675, -0.70592066, -1.30886273, ..., -0.67906259,
        -0.55403469,  1.22296203],
       [-1.61432675, -1.03338891,  0.04727831, ..., -0.67906259,
        -0.55403469, -1.40628573],
       [-1.61432675, -0.5998394 ,  0.04727831, ..., -0.67906259,
        -0.55403469,  1.22296203]])

In [20]:
# Separate the features X from the target y.
# Column 0 of the standardised array is the target (`price` is the first
# column of encoded_df); every remaining column is a feature.

y = dataset_arr[:, 0]
X = dataset_arr[:, 1:]
(X[:5], y[:5])

(array([[ 1.04672629,  1.40341936,  1.42181174,  1.37821692,  1.51769249,
          0.40562287, -0.46531479, -0.73453933, -0.2192645 ,  1.4726183 ,
          1.80494113, -1.40628573],
        [ 1.75700953,  1.40341936,  5.40580863,  2.53202371,  2.67940935,
          0.40562287, -0.46531479, -0.73453933, -0.2192645 ,  1.4726183 ,
         -0.55403469, -1.40628573],
        [ 2.21823241,  0.04727831,  1.42181174,  0.22441013,  1.51769249,
          0.40562287, -0.46531479,  1.3613975 , -0.2192645 , -0.67906259,
          1.80494113, -0.09166185],
        [ 1.08362412,  1.40341936,  1.42181174,  0.22441013,  2.67940935,
          0.40562287, -0.46531479,  1.3613975 , -0.2192645 ,  1.4726183 ,
          1.80494113, -1.40628573],
        [ 1.04672629,  1.40341936, -0.57018671,  0.22441013,  1.51769249,
          0.40562287,  2.14908276,  1.3613975 , -0.2192645 ,  1.4726183 ,
         -0.55403469, -1.40628573]]),
 array([4.56636513, 4.00448405, 4.00448405, 3.98575468, 3.55497918]))

In [21]:
# Split the data into training and validation sets.

test_size = 0.3      # fraction of the rows held out for validation
random_state = 1     # seed; also passed to the regressors in the training cell
is_shuffle = True    # shuffle the rows before splitting

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=test_size, random_state=random_state, shuffle=is_shuffle
)

In [26]:
# Train the models.
# Three ensemble regressors, each left at its default hyper-parameters and
# seeded with the shared random_state. `fit` returns the fitted estimator, so
# construction and training are chained.

## RandomForest
rf_regressor = RandomForestRegressor(random_state=random_state).fit(X_train, y_train)

## AdaBoost
ab_regressor = AdaBoostRegressor(random_state=random_state).fit(X_train, y_train)

## GradientBoosting
gb_regressor = GradientBoostingRegressor(random_state=random_state).fit(X_train, y_train)

# Keep the fitted gradient-boosting estimator as the cell's displayed value.
gb_regressor

In [28]:
# Evaluate the models on the validation set.
# y_val comes from the standardised array, so MAE and MSE below are expressed
# in standardised units of the target, not in original price units.

# Predictions, in the fixed order RF, AB, GB.
rf_y_pred, ab_y_pred, gb_y_pred = (
    regressor.predict(X_val)
    for regressor in (rf_regressor, ab_regressor, gb_regressor)
)

# Mean absolute error per model.
rf_mae, ab_mae, gb_mae = (
    mean_absolute_error(y_val, y_pred)
    for y_pred in (rf_y_pred, ab_y_pred, gb_y_pred)
)

# Mean squared error per model.
rf_mse, ab_mse, gb_mse = (
    mean_squared_error(y_val, y_pred)
    for y_pred in (rf_y_pred, ab_y_pred, gb_y_pred)
)

sections = (
    ('RF: Evaluation results on validation set:', rf_mae, rf_mse),
    ('AB: Evaluation results on validation set:', ab_mae, ab_mse),
    ('GB: Evaluation results on validation set:', gb_mae, gb_mse),
)
last = len(sections) - 1
for idx, (title, mae, mse) in enumerate(sections):
    print(title)
    print(f'Mean Absolute Error: {mae}')
    # A blank line separates the sections; none follows the final one.
    print(f'Mean Squared Error: {mse}', end='\n\n' if idx < last else '\n')

RF: Evaluation results on validation set:
Mean Absolute Error: 0.46093873321571177
Mean Squared Error: 0.37944418523089524

AB: Evaluation results on validation set:
Mean Absolute Error: 0.567680019897059
Mean Squared Error: 0.5739244030038942

GB: Evaluation results on validation set:
Mean Absolute Error: 0.4516626127750995
Mean Squared Error: 0.39610445936979427
