In [439]:
#Imports
import pandas as pd
import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

import pickle
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from sklearn.preprocessing import Normalizer
from sklearn.compose import ColumnTransformer 
from sklearn.metrics import r2_score


In [440]:
# Load the raw used-car listings from the CSV file.
csv_path = "used_cars_data.csv"
df = pd.read_csv(csv_path)

In [441]:
# Display the column labels of the freshly loaded frame.
df.columns

Index(['Unnamed: 0', 'brand', 'model', 'price (eur)', 'engine', 'year',
       'mileage (kms)', 'fuel', 'gearbox', 'location'],
      dtype='object')

In [442]:
# Remove the leftover CSV index column together with the features that the
# webapp/API does not use (they are not mentioned in the brief).
unused_columns = ["Unnamed: 0", "engine", "location", "model"]
df = df.drop(columns=unused_columns)
df.columns

Index(['brand', 'price (eur)', 'year', 'mileage (kms)', 'fuel', 'gearbox'], dtype='object')

In [443]:
# Preview the first rows of the frame after the column drop.
df.head()

Unnamed: 0,brand,price (eur),year,mileage (kms),fuel,gearbox
0,SEAT,8990,2016,67000,Gasolina,Manual
1,Hyundai,9990,2014,104868,Diésel,Manual
2,BMW,13490,2011,137566,Diésel,Automatica
3,Volkswagen,24990,2018,44495,Gasolina,Manual
4,Opel,10460,2016,69800,Gasolina,Manual


In [444]:
# One-hot encode the categorical features and join them with the numeric ones.
cat_cols = ['brand', 'fuel', 'gearbox']
enc = OneHotEncoder(handle_unknown='ignore')

# Fit and transform in a single pass; the encoded matrix is densified exactly once.
encoded = enc.fit_transform(df[cat_cols]).toarray()

# Reuse df's index so the column-wise concat below lines up row-for-row even
# when df does not carry a default 0..n-1 index.
X_features = pd.DataFrame(encoded, index=df.index)

# Vehicle age in years, measured against the current calendar year.
# NOTE(review): this depends on the date the notebook is run, so 'age' shifts
# from one year to the next — consider pinning a reference year.
year = datetime.datetime.now().year
df['age'] = year - df['year']

# Numeric inputs, target, and the final feature matrix (numeric + one-hot columns).
X_num = df[['age', 'mileage (kms)']]
y = df['price (eur)']
X = pd.concat([X_num, X_features], axis=1)
X.head()

Unnamed: 0,age,mileage (kms),0,1,2,3,4,5,6,7,...,33,34,35,36,37,38,39,40,41,42
0,7,67000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,9,104868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,12,137566,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,5,44495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,7,69800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [445]:
# Split into train and test subgroups: 33% of the rows are held out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# The feature frame mixes string and integer column labels; cast them all to str.
for split_frame in (X_train, X_test):
    split_frame.columns = split_frame.columns.astype(str)




In [446]:
# Remove outliers from the numerical training features using a 3-sigma rule.
# The limits are derived from the training split, and only training rows are dropped.

# Per-column statistics and the resulting lower/upper limits.
means = []
stds = []
cutoffs = []
lower = {}
upper = {}

for num_col in X_num.columns:
    # Compute each statistic once per column. ddof=0 is the population standard
    # deviation, which is what np.std uses by default.
    col_mean = X_train[num_col].mean()
    col_std = X_train[num_col].std(ddof=0)
    cutoff = 3 * col_std

    means.append(col_mean)
    stds.append(col_std)
    cutoffs.append(cutoff)
    lower[num_col] = col_mean - cutoff
    upper[num_col] = col_mean + cutoff

# Flag every training row that falls outside the limits of any numeric column.
outliersInCol = {}
rowsToDrop = []
for num_col in X_num.columns:
    values = X_train[num_col]
    # Vectorised comparison instead of a Python-level row loop.
    flagged = values[(values < lower[num_col]) | (values > upper[num_col])]
    outliersInCol[num_col] = flagged.tolist()
    rowsToDrop.extend(flagged.index.tolist())

# A row can be an outlier in more than one column; keep each label once,
# in first-seen order.
rowsToDrop = list(dict.fromkeys(rowsToDrop))

# Drop the flagged rows from both the features and the target so they stay aligned.
X_train = X_train.drop(index=rowsToDrop)
y_train = y_train.drop(index=rowsToDrop)

print(rowsToDrop)
    

[779, 776, 775, 777, 709, 780]


In [447]:
# Candidate values for the random forest regressor hyperparameter search.

# number of trees: 10 evenly spaced values from 200 to 2000
n_estimators = np.linspace(start=200, stop=2000, num=10).astype(int).tolist()
# features considered per split: log2(n_features) or sqrt(n_features)
max_features = ['log2', 'sqrt']
# max depth of a tree: 11 evenly spaced values from 10 to 110, plus None
max_depth = np.linspace(10, 110, num=11).astype(int).tolist() + [None]
# min samples needed to split a node
min_samples_split = [2, 5, 10]
# min samples per leaf
min_samples_leaf = [1, 2, 4]
# whether to bootstrap samples when building trees
bootstrap = [True, False]

# bundle everything into the search grid
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [448]:
# Base random forest regressor for the search. It is seeded so that the tuning
# run (and therefore best_params_ and every metric derived from it) is
# repeatable; the search's own random_state only fixes which candidates are drawn.
rf = RandomForestRegressor(random_state=42)

# Randomized search over random_grid: 100 sampled candidates, 3-fold
# cross-validation, all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# fit on the outlier-filtered training split
rf_random.fit(X_train, y_train)

# best hyperparameter combination found by the search
print(rf_random.best_params_)


Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'n_estimators': 1600, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 90, 'bootstrap': True}


In [449]:
# Predict the target for the held-out test subgroup using the fitted search object.
y_pred = rf_random.predict(X_test)

In [450]:
# Root mean squared error (RMSE) on the test subgroup — metric used in the brief.
# The square root is taken explicitly instead of passing `squared=False`, a
# keyword that newer scikit-learn releases deprecated and later removed.
errors = float(np.sqrt(mean_squared_error(y_test, y_pred)))

errors

4312.482517317982

In [451]:
# Mean absolute error (MAE) on the test subgroup — metric used in the brief.
errors2 = mean_absolute_error(y_true=y_test, y_pred=y_pred)
errors2

3026.1895164980756

In [452]:
# R² score on the test subgroup (higher is better).
r2_score(y_true=y_test, y_pred=y_pred)


0.5374591880222461