In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

df = pd.read_csv(r'C:\Users\Lenovo\Downloads\car.csv')
#df.shape

Data Preprocessing

In [2]:
df["model"]=df["model"].values.astype('str')
df["origin"]=df["origin"].values.astype('str')
df["brand"]=df["brand"].values.astype('str')
df["gearbox"]=df["gearbox"].values.astype('str')
df["type"]=df["type"].values.astype('str')
df["fuel"]=df["fuel"].values.astype('str')
df_new = df.drop_duplicates()
df_used = df_new[df_new['condition'] == 'used']
df_brand = df_used[df_used.brand.isin(['Toyota', 'Ford', 'Kia', 'Huyndai', 'Mitsubishi',
                                      'Mercedes Benz', 'Mazda', 'Honda', 'Chevrolet', 'Vinfast',
                                      'Suzuki', 'BMW', 'Nissan', 'Daewoo', 'Lexus', 'Peugeot'])]
df_nonnull = df_brand.dropna(how = 'any', axis = 0)
df_date = df_nonnull[df_nonnull['manufacture_date'] > 0]
df_odo = df_date[df_nonnull['mileage_v2'] > 20]
df_final = df_date[df_date['condition'] == 'used']
df_end = df_final[df_final['type'].str.contains("nan") == False]
df_end = df_end[df_end['origin'].str.contains("nan") == False]
df_end = df_end[df_end['gearbox'].str.contains("nan") == False]
df_end = df_end[df_end['model'].str.contains("nan") == False]
df_end = df_end[df_end['model'].str.contains("Dòng khác") == False]
inp = df_end.drop(['id', 'list_id', 'list_time', 'condition', 'color'], axis = 1)

value_counts = inp['model'].value_counts()
values_to_exclude = value_counts[value_counts == 1].index.tolist()
inp = inp[~inp['model'].isin(values_to_exclude)]

In [4]:
X = inp.drop(['price'], axis = 1)
y = inp['price']

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 99)

Label Encoding

In [6]:
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

le = {}

for col in categorical_cols:
    le[col] = LabelEncoder()
    X_train[col] = le[col].fit_transform(X_train[col])


Training

In [7]:
rfr = RandomForestRegressor(n_estimators= 10000, n_jobs = 7)

rfr.fit(X_train,y_train)

Testing

In [8]:
for col in categorical_cols:
    X_test[col] = le[col].transform(X_test[col])
y_pred = rfr.predict(X_test)

Evaluating via various metrics

In [9]:
from sklearn.metrics import mean_absolute_error
rate = mean_absolute_error(y_test, y_pred)
rate

36126463.97453568

In [10]:
#Calculating % of cars fitting in +-5% range of desired value
control = (np.divide(y_test,y_pred) - 1)*100
tem = ((-5 < control) & (control < 5)).sum()
percentage = tem/(len(control))*100
percentage

73.11542991755006

Saving Model and Encoder

In [None]:
import pickle
with open('model.pkl', 'wb') as model_file:
    pickle.dump(rfr, model_file)
    model_file.close()
with open('encoder.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)
    encoder_file.close()