In [356]:
import copy
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import pickle
import lightgbm as lgb
from category_encoders import TargetEncoder, JamesSteinEncoder

In [357]:
# Load the raw car listings into the working frame.
df = pd.read_csv(filepath_or_buffer='../data/data.csv')

In [358]:
# Summary statistics of the raw price column (before any filtering).
df['price'].describe()

count    1.253600e+04
mean     1.162871e+09
std      1.650832e+09
min      2.600000e+07
25%      4.700000e+08
50%      6.660000e+08
75%      1.098250e+09
max      4.380000e+10
Name: price, dtype: float64

# Data Preprocessing for Old car data


# Remove outliers

In [359]:
# Outlier filter: keep only rows whose price lies within three standard
# deviations of the mean price.
price_mean = df['price'].mean()
price_std = df['price'].std()
lower_bound, upper_bound = price_mean - 3 * price_std, price_mean + 3 * price_std
# Series.between is inclusive on both ends by default, matching the >= / <= pair.
df = df[df['price'].between(lower_bound, upper_bound)]

In [360]:
# Summary statistics of price after the outlier filter.
df.price.describe()

count    1.229200e+04
mean     9.959396e+08
std      9.743532e+08
min      2.600000e+07
25%      4.680000e+08
50%      6.580000e+08
75%      1.040000e+09
max      6.088000e+09
Name: price, dtype: float64

In [361]:
# Keep only listings whose odometer reading is at least 100 km.
df = df.loc[df['km'].ge(100)]


In [362]:
# Keep only cars built within the last 15 years, counted from the year in
# which the notebook is executed (so the cutoff moves with the run date).
current_year = pd.to_datetime('today').year
oldest_year_kept = current_year - 15
df = df.loc[df['year'] >= oldest_year_kept]

# Preprocessing for categorical features


In [363]:
# Lower-casing of the brand/model strings, currently disabled.
# NOTE(review): the brand values displayed by the following cell are already
# lower-case — confirm this step is no longer needed and remove the cell.
# df['brand'] = df['brand'].str.lower()
# df['model'] = df['model'].str.lower()

In [364]:
# Number of listings per brand, most frequent first.
# NOTE(review): the output lists near-duplicate labels (e.g. 'mercedes' and
# 'mercedes-benz', 'landrover' and 'land') — consider merging them upstream.
df.brand.value_counts()

brand
toyota           2353
mercedes         1494
hyundai          1380
kia              1117
ford             1067
mazda            1032
mitsubishi        486
honda             482
lexus             455
vinfast           453
bmw               255
porsche           200
landrover         172
peugeot           149
chevrolet         135
audi              107
suzuki            102
nissan             93
mg                 86
mercedes-benz      38
volkswagen         36
other              35
mini               33
volvo              21
daewoo             20
jeep               16
cadillac           13
maserati           13
isuzu              11
infiniti           11
jaguar             10
bentley             6
land                5
rolls               3
Name: count, dtype: int64

In [365]:
# Use target encoding for brand (and series) and JamesSteinEncoder for model

In [366]:
# Per-column count of missing values before the categorical encoding step.
df.isnull().sum()


car_name          0
year              0
price             0
assemble_place    0
series            0
km                0
engine_type       0
transmission      0
url               0
brand             0
model             1
dtype: int64

In [367]:
# Encode the high-cardinality categorical columns as numbers derived from the
# target (`price`): target encoding for `brand` and `series`, James-Stein
# encoding for `model`.
#
# Each column gets its own encoder instance. Re-using one TargetEncoder for
# both `brand` and `series` re-fits it on the second call and discards the
# brand mapping, so new brand values could not be encoded later. `target_enc`
# keeps its original name and, as before, ends up fitted on `series`; the
# encoded values themselves are unchanged.
#
# NOTE(review): the encoders are fitted on every row of `df` using `price`.
# If a train/test split happens downstream, the encoded features carry target
# information from held-out rows — consider fitting on the training split only.
# NOTE(review): the surrounding null checks show the one missing `model` value
# is filled by the encoder (null count goes from 1 to 0) — confirm that the
# imputation is intended.
brand_enc = TargetEncoder()
target_enc = TargetEncoder()
js_enc = JamesSteinEncoder()

df['brand'] = brand_enc.fit_transform(df['brand'], df['price'])
df['model'] = js_enc.fit_transform(df['model'], df['price'])
df['series'] = target_enc.fit_transform(df['series'], df['price'])

In [368]:
# Per-column count of missing values after the categorical encoding step.
df.isnull().sum()


car_name          0
year              0
price             0
assemble_place    0
series            0
km                0
engine_type       0
transmission      0
url               0
brand             0
model             0
dtype: int64

In [369]:
# Preview the first five rows with the encoded brand/model/series columns.
df.head(5)

Unnamed: 0,car_name,year,price,assemble_place,series,km,engine_type,transmission,url,brand,model
0,Kia Sorento Signature 2.2 AT AWD,2020.0,899000000.0,Lắp ráp trong nước,1340081000.0,60000,Dầu,Số tự động,https://bonbanh.com/xe-kia-sorento-signature-2...,602770800.0,1009069000.0
1,Kia Sorento GATH,2018.0,568000000.0,Lắp ráp trong nước,1340081000.0,80000,Xăng,Số tự động,https://bonbanh.com/xe-kia-sorento-gath-2018-5...,602770800.0,502559400.0
2,Mazda CX8 Premium AWD,2022.0,929000000.0,Lắp ráp trong nước,1340081000.0,50000,Xăng,Số tự động,https://bonbanh.com/xe-mazda-cx8-premium-awd-2...,617414700.0,951902700.0
3,Nissan Almera 1.0 MT,2022.0,369000000.0,Nhập khẩu,890267500.0,25000,Xăng,Số tay,https://bonbanh.com/xe-nissan-almera-1.0-mt-20...,475666900.0,377085200.0
4,Bentley Mulsanne 6.75 V8,2011.0,4900000000.0,Nhập khẩu,890267500.0,80000,Xăng,Số tự động,https://bonbanh.com/xe-bentley-mulsanne-6.75-v...,1625612000.0,4775661000.0


In [370]:
# One-hot encode the remaining columns

In [371]:
# Instantiate a scikit-learn OneHotEncoder with default settings.
# NOTE(review): the same name is re-bound to a fresh encoder in a later cell
# before this instance is used.
one_hot = OneHotEncoder()

In [372]:
# Binary-encode the assembly place: imported -> 1, locally assembled -> 0.
# Values not present in the mapping become NaN under Series.map.
assemble_place_codes = {'Nhập khẩu': 1, 'Lắp ráp trong nước': 0}
df['assemble_place'] = df['assemble_place'].map(assemble_place_codes)

In [373]:
# Fit a scikit-learn encoder on the engine-type column (fit returns the
# estimator itself). The fitted encoder is not used to build the columns
# below; the indicator columns come from pd.get_dummies.
one_hot = OneHotEncoder().fit(df[['engine_type']])
# One indicator column per engine type, named 'engine_type_<value>'.
one_hot_engine_type = pd.get_dummies(df.engine_type, prefix='engine_type')
# Append the indicator columns next to the existing ones.
df = pd.concat(objs=[df, one_hot_engine_type], axis='columns')

In [374]:
# Per-column count of missing values after adding the engine-type indicators.
df.isnull().sum()

car_name              0
year                  0
price                 0
assemble_place        0
series                0
km                    0
engine_type           0
transmission          0
url                   0
brand                 0
model                 0
engine_type_Dầu       0
engine_type_Hybrid    0
engine_type_Xăng      0
engine_type_Điện      0
dtype: int64

In [375]:
# Binary-encode the transmission: automatic -> 1, manual -> 0.
# Values not present in the mapping become NaN under Series.map.
transmission_codes = {'Số tự động': 1, 'Số tay': 0}
df['transmission'] = df['transmission'].map(transmission_codes)

In [376]:
# Drop the columns that are no longer needed

In [377]:
# Remove the raw engine_type column (replaced by its indicator columns) and
# the free-text car_name / url columns.
df = df.drop(columns=['engine_type', 'car_name', 'url'])

In [378]:
# Preview the first five rows of the fully preprocessed frame.
df.head(5)

Unnamed: 0,year,price,assemble_place,series,km,transmission,brand,model,engine_type_Dầu,engine_type_Hybrid,engine_type_Xăng,engine_type_Điện
0,2020.0,899000000.0,0,1340081000.0,60000,1,602770800.0,1009069000.0,True,False,False,False
1,2018.0,568000000.0,0,1340081000.0,80000,1,602770800.0,502559400.0,False,False,True,False
2,2022.0,929000000.0,0,1340081000.0,50000,1,617414700.0,951902700.0,False,False,True,False
3,2022.0,369000000.0,1,890267500.0,25000,0,475666900.0,377085200.0,False,False,True,False
4,2011.0,4900000000.0,1,890267500.0,80000,1,1625612000.0,4775661000.0,False,False,True,False


In [379]:
# save to csv

In [380]:
# Persist the preprocessed frame; the row index is not written to the file.
df.to_csv(path_or_buf='../data/data_preprocessed.csv', index=False)