In [3]:
import copy
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import pickle
import lightgbm as lgb
import category_encoders as ce

In [4]:
# Load data
df = pd.read_csv('../data_preprocessed.csv')

In [5]:
df.head()

Unnamed: 0,year,price,assemble_place,series,km,transmission,brand,model,engine_type_Dầu,engine_type_Hybrid,engine_type_Xăng,engine_type_Điện
0,2020,899000000.0,0,1311185000.0,60000,1,598582100.0,1009013000.0,True,False,False,False
1,2018,568000000.0,0,1311185000.0,80000,1,598582100.0,502539800.0,False,False,True,False
2,2022,929000000.0,0,1311185000.0,50000,1,615941100.0,951717500.0,False,False,True,False
3,2022,369000000.0,1,869486500.0,25000,0,460170600.0,377083300.0,False,False,True,False
4,2011,4900000000.0,1,869486500.0,80000,1,1580050000.0,4774253000.0,False,False,True,False


In [6]:
# price is the target variable
df.isna().sum()

year                  0
price                 0
assemble_place        0
series                0
km                    0
transmission          0
brand                 0
model                 0
engine_type_Dầu       0
engine_type_Hybrid    0
engine_type_Xăng      0
engine_type_Điện      0
dtype: int64

In [7]:
X = df.drop('price', axis=1)
y = df['price']

In [8]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Linear Regression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

In [10]:
regressor.score(x_test, y_test)

0.8829285000019588

In [11]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100)
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [12]:
from sklearn.model_selection import KFold, RandomizedSearchCV

"""
+ Search over space of hyperparameters to find acceptable ones
+ Two search methods implemented by scikit-learn: GridSearch and RandomSearch
+ RandomSearch is faster in case we search over large space (too many combinations of hyperparameters)
"""

rf_hyperparams = {
    'n_estimators': [200, 300, 400, 500],
    'max_depth': [None, 10, 50, 90, 110],
    'max_features': ['sqrt'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_randomized_search = RandomizedSearchCV(
    estimator = rf,
    param_distributions = rf_hyperparams,
    n_iter = 10,
    cv=KFold(n_splits=10, shuffle=True, random_state=42),
    verbose=2,
    random_state=42,
    n_jobs = -1
)



In [13]:
rf_randomized_search.fit(x_train, y_train)


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [14]:
rf_randomized_search.score(x_test, y_test)


0.9659126835047358

In [15]:
lgbm = lgb.LGBMRegressor()
lgbm.fit(x_train, y_train)

y_pred = lgbm.predict(x_test)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
#score
print("score: ",lgbm.score(x_test,y_test))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014544 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 594
[LightGBM] [Info] Number of data points in the train set: 9828, number of used features: 11
[LightGBM] [Info] Start training from score 998585372.275132
MAE: 78379770.15077367
MSE: 3.5339621356718e+16
RMSE: 187988354.31142536
score:  0.9619535563455903


In [16]:
#Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor()
gbr.fit(x_train, y_train)
y_pred = gbr.predict(x_test)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
#score
print("score: ",gbr.score(x_test,y_test))

MAE: 108729781.67000897
MSE: 5.775265048112219e+16
RMSE: 240317811.41047826
score:  0.9378238114029727


In [17]:
# XGBoost Regressor
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(x_train, y_train)
y_pred = xgb.predict(x_test)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
#score
print("score: ",xgb.score(x_test,y_test))


MAE: 66816129.71265771
MSE: 3.169156733310312e+16
RMSE: 178021255.2845955
score:  0.9658810314154759


In [18]:
# in ra bảng so sánh điểm,MAE,MSE,RMSE của các model
models = ['Linear Regression', 'Random Forest', 'LightGBM', 'Gradient Boosting', 'XGBoost']
scores = [regressor.score(x_test, y_test), rf_randomized_search.score(x_test, y_test), lgbm.score(x_test, y_test), gbr.score(x_test, y_test), xgb.score(x_test, y_test)]
mae = [metrics.mean_absolute_error(y_test, regressor.predict(x_test)), metrics.mean_absolute_error(y_test, rf_randomized_search.predict(x_test)), metrics.mean_absolute_error(y_test, lgbm.predict(x_test)), metrics.mean_absolute_error(y_test, gbr.predict(x_test)), metrics.mean_absolute_error(y_test, xgb.predict(x_test))]
mse = [metrics.mean_squared_error(y_test, regressor.predict(x_test)), metrics.mean_squared_error(y_test, rf_randomized_search.predict(x_test)), metrics.mean_squared_error(y_test, lgbm.predict(x_test)), metrics.mean_squared_error(y_test, gbr.predict(x_test)), metrics.mean_squared_error(y_test, xgb.predict(x_test))]
rmse = [np.sqrt(metrics.mean_squared_error(y_test, regressor.predict(x_test))), np.sqrt(metrics.mean_squared_error(y_test, rf_randomized_search.predict(x_test))), np.sqrt(metrics.mean_squared_error(y_test, lgbm.predict(x_test))), np.sqrt(metrics.mean_squared_error(y_test, gbr.predict(x_test))), np.sqrt(metrics.mean_squared_error(y_test, xgb.predict(x_test)))]
c_df = pd.DataFrame({'Model': models, 'Score': scores, 'MAE': mae, 'MSE': mse, 'RMSE': rmse})
print(c_df)

               Model     Score           MAE           MSE          RMSE
0  Linear Regression  0.882929  1.733950e+08  1.087424e+17  3.297612e+08
1      Random Forest  0.965913  6.541304e+07  3.166217e+16  1.779387e+08
2           LightGBM  0.961954  7.837977e+07  3.533962e+16  1.879884e+08
3  Gradient Boosting  0.937824  1.087298e+08  5.775265e+16  2.403178e+08
4            XGBoost  0.965881  6.681613e+07  3.169157e+16  1.780213e+08


In [19]:
# save model
filename = 'finalized_model.pkl'
pickle.dump(xgb, open(filename, 'wb'))