# Best Model Selection

Rusty Bargain used car sales service is developing an app to attract new customers. In that app, you can quickly find out the market value of your car. You have access to historical data: technical specifications, trim versions, and prices. You need to build the model to determine the value. 

Rusty Bargain is interested in:

- the quality of the prediction;
- the speed of the prediction;
- the time required for training

In [None]:
import pandas as pd
import numpy as np
import warnings
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV

# Shared seed passed as `random_state` to the splits and searches below.
# NOTE(review): the name would shadow the stdlib `random` module if that module were ever imported here.
random = 42
# Suppress every warning for the rest of the notebook (this also hides deprecation notices).
warnings.filterwarnings("ignore")

## Data preparation

In [None]:
# Read the first 5000 rows of the listings file, parsing the three
# timestamp columns as datetimes.
df = pd.read_csv(
    '/datasets/car_data.csv',
    parse_dates=['DateCrawled', 'DateCreated', 'LastSeen'],
    nrows=5000,
)

# Positional rename of the 16 columns to snake_case names.
df.columns = [
    'date',
    'price',
    'vehicle_type',
    'registration_year',
    'gearbox',
    'power',
    'model',
    'mileage',
    'registration_month',
    'fuel_type',
    'brand',
    'not_repaired',
    'created',
    'pictures',
    'postal',
    'last_seen',
]

In [None]:
# Quick textual overview of the raw frame: sample rows, share of missing
# values per column, cardinality per column, and dtypes / memory usage.
print(df.head(3))
print("------------------------------------------------------------------------------")
print('Percent of missing values:')
# Count the nulls once and report only the columns that actually have some.
missing_counts = df.isnull().sum()
print((100 * missing_counts[missing_counts > 0] / len(df)).round(2))
print("------------------------------------------------------------------------------")
print(df.nunique())
print("------------------------------------------------------------------------------")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line.
df.info()

In [None]:
# Histogram of every numeric column (30 bins each) to eyeball distributions and outliers.
df.hist(bins=30, figsize=(15, 10))

### Study Features:
- 'date' , 'created' and 'last_seen' are irrelevant features for price prediction.
- 'pictures' is 0 in all rows, can be discarded.
- The other features look fine.

### Select relevant features and define the list of categorical features:

In [None]:
# Columns judged uninformative for price prediction (timestamps and the
# constant 'pictures' column).
irrelevant_features = ['date', 'created', 'last_seen', 'pictures']

# Columns holding categorical values; they are encoded before modelling.
cat_cols = [
    'vehicle_type',
    'gearbox',
    'model',
    'fuel_type',
    'brand',
    'not_repaired',
]

# Keep an untouched copy of the raw frame, then work on the reduced one.
original_df = df.copy()
df = df.drop(columns=irrelevant_features)

In [None]:
# Preview the frame after dropping the irrelevant columns.
df.head(3)

### Ordinal-encode all categorical data in the dataset:

In [None]:
# Work on a copy so the raw string categories in `df` stay available for later cells.
encode_df = df.copy()
# Single encoder instance shared by `encode`; it is refitted on every call.
encoder = OrdinalEncoder()

def encode(data):
    '''Ordinal-encode the non-null entries of a column, keeping nulls in place.

    The module-level ``encoder`` is refitted on the non-null values of
    ``data``; the resulting codes are written back at the same positions,
    while missing entries stay missing so they can be imputed later.

    Parameters
    ----------
    data : pandas.Series
        Column to encode. It is not modified.

    Returns
    -------
    pandas.Series
        A copy of ``data`` whose non-null values are replaced by their
        ordinal codes. An empty or all-null column is returned as an
        unchanged copy.
    '''
    # Operate on a copy so the caller's Series is never mutated in place.
    encoded = data.copy()
    present = encoded.notnull()

    # Nothing to encode: the encoder cannot be fitted on zero samples.
    if not present.any():
        return encoded

    # The encoder expects a 2-D (n_samples, 1) array.
    values = encoded[present].to_numpy().reshape(-1, 1)
    # ravel() keeps a 1-D result even for a single value (squeeze would give 0-d).
    encoded.loc[present] = encoder.fit_transform(values).ravel()
    return encoded


# Encode each categorical column independently; `encode` refits the shared
# encoder per column, so after the loop it only holds the last column's categories.
for columns in cat_cols:
    encode_df[columns]  = encode(encode_df[columns])

In [None]:
# Preview the frame after ordinal encoding (categorical columns now hold numeric codes).
encode_df.head(3)

### Split the source data into a training set and a validation set:


In [None]:
# Hold out 30% of the encoded rows for validation; the shared seed keeps the
# split reproducible.
impute_data = encode_df.copy()

df_train, df_valid = train_test_split(
    impute_data,
    test_size=0.3,
    random_state=random,
)

# 'price' is the target; every other column is a feature.
target_train = df_train['price']
features_train = df_train.drop(columns=['price'])

target_valid = df_valid['price']
features_valid = df_valid.drop(columns=['price'])

### Impute missing values with KNNImputer, then one-hot encode the categorical features:

In [None]:
# Fill missing values from the nearest neighbours. The imputer is fitted on the
# training features only and then reused for validation, so no validation
# information leaks into the fit.
# np.round snaps every imputed value to a whole number (applied to all columns,
# not only the categorical codes).
# NOTE(review): rebuilding the frames from the raw arrays gives them a fresh
# default index, so they no longer share an index with the target series.
imputer = KNNImputer()
features_train = pd.DataFrame(np.round(imputer.fit_transform(features_train)),columns = features_train.columns)
features_valid = pd.DataFrame(np.round(imputer.transform(features_valid)),columns = features_valid.columns)

In [None]:
# One-hot encode the (imputed) categorical codes. handle_unknown='ignore'
# lets transform() accept categories that were not seen during fit.
# The dense array is taken with .toarray() rather than the `sparse=False`
# constructor flag, and column names come from get_feature_names_out():
# the older `sparse` / get_feature_names spellings were removed from newer
# scikit-learn releases, while these forms work on both.
ohe = OneHotEncoder(handle_unknown='ignore')

# Training set: fit the encoder here and only here.
array_hot_encoded = ohe.fit_transform(features_train[cat_cols]).toarray()
data_hot_encoded = pd.DataFrame(array_hot_encoded,
                                columns = ohe.get_feature_names_out(cat_cols),
                                # reuse the source index so concat aligns row by row
                                index = features_train.index)
data_other_cols = features_train.drop(columns=cat_cols)
features_train_hot = pd.concat([data_hot_encoded, data_other_cols], axis=1)

# Validation set: reuse the fitted encoder.
array_hot_encoded = ohe.transform(features_valid[cat_cols]).toarray()
data_hot_encoded = pd.DataFrame(array_hot_encoded,
                                columns = ohe.get_feature_names_out(cat_cols),
                                index = features_valid.index)
data_other_cols = features_valid.drop(columns=cat_cols)
features_valid_hot = pd.concat([data_hot_encoded, data_other_cols], axis=1)


## Model training
### DecisionTreeRegressor optimization:


In [None]:
%%time
# Exhaustive grid search over DecisionTreeRegressor hyperparameters, scored by
# cross-validated RMSE (10 folds x 3 repeats).
# The estimator is seeded so the search is reproducible; an unseeded tree
# draws different random splits / feature subsets on every run.
model = DecisionTreeRegressor(random_state = random)
cv = RepeatedKFold(n_splits = 10, n_repeats = 3, random_state = random)

# search space
dt_grid = dict()
dt_grid['criterion'] = ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']
dt_grid['splitter'] = ['best', 'random']
# 'auto' is left out: for regression trees it meant "all features", i.e. the
# same as None, and newer scikit-learn releases reject it.
dt_grid['max_features'] = ['sqrt', None, 'log2']
dt_grid['min_samples_split'] = [2, 5, 10, 15 ]
dt_grid['min_samples_leaf'] = [1, 5, 12, 16 ,20, 30]

search = GridSearchCV(model, dt_grid, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv)
t0= time.time()

# NOTE(review): t1 - t0 covers the whole search plus the final refit, so this
# "training_time" is not comparable with a single fit of an untuned model.
result = search.fit(features_train_hot, target_train)
t1= time.time()

# predict() delegates to the best estimator refitted on the full training set.
yhat = result.predict(features_valid_hot)
t2= time.time()

dtree_params = result.best_params_
dtree_model = result.best_estimator_

dtree_op_RMSE = mean_squared_error(target_valid, yhat)**0.5
dtree_op_score = {'rmse': [dtree_op_RMSE], 'training_time': [t1 - t0 ],'prediction_time': [t2 - t1 ]}
dtree_op_score = pd.DataFrame(dtree_op_score, index = [ 'DecisionTreeRegressorOptim' ])

# best_score_ is the negated RMSE (higher is better for scikit-learn scorers).
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

### RandomForestRegressor optimization:

In [None]:
%%time
from sklearn.model_selection import RandomizedSearchCV

# Randomized search (100 sampled candidates, 3-fold CV) over
# RandomForestRegressor hyperparameters, scored by cross-validated RMSE.
# The forest is seeded so repeated runs give the same result.
model = RandomForestRegressor(random_state = random)

rforest_grid = dict()
rforest_grid['n_estimators'] = list(range(50, 500, 50))
rforest_grid['criterion'] = ['squared_error', 'absolute_error', 'poisson']
rforest_grid['max_depth'] = list(range(10, 110, 10))
rforest_grid['min_samples_split'] = [2, 5, 10, 20, 30, 40]
rforest_grid['min_samples_leaf'] = [1, 2, 7, 12, 14, 16 ,20, 30]
# 'auto' is left out: for regression forests it meant "all features", i.e. the
# same as None, and newer scikit-learn releases reject it.
rforest_grid['max_features'] = ['sqrt', None, 'log2']
rforest_grid['bootstrap'] = [True, False]


rf_random = RandomizedSearchCV(estimator = model,
                               scoring='neg_root_mean_squared_error',
                               param_distributions = rforest_grid, 
                               n_iter = 100, 
                               cv = 3, 
                               verbose=2, 
                               random_state = random, 
                               n_jobs = -1)

# NOTE(review): t1 - t0 covers the whole search plus the final refit, so this
# "training_time" is not comparable with a single fit of an untuned model.
t0= time.time()
result = rf_random.fit(features_train_hot,target_train)

t1= time.time()
# predict() delegates to the best estimator refitted on the full training set.
yhat = result.predict(features_valid_hot)

t2= time.time()
rforest_params = result.best_params_
rforest_model = result.best_estimator_

rforest_op_RMSE = mean_squared_error(target_valid, yhat)**0.5
rforest_op_score = {'rmse': [rforest_op_RMSE], 'training_time': [t1 - t0 ],'prediction_time': [t2 - t1 ]}
rforest_op_score = pd.DataFrame(rforest_op_score, index = [ 'RandomForestRegressorOptim' ])

# best_score_ is the negated RMSE (higher is better for scikit-learn scorers).
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

### CatBoost:

In [None]:
# CatBoost consumes the raw string categories directly. Missing categories
# are replaced by an explicit 'missing' label instead of dropping the rows:
# dropping them would train and score CatBoost on a different (complete-case)
# subset than the other models, so its RMSE would not be comparable.
cat_df = df.copy()
cat_df[cat_cols] = cat_df[cat_cols].fillna('missing')

# Same test_size and seed as the other splits in this notebook.
cat_train, cat_valid = train_test_split(cat_df, test_size=0.3, random_state = random)

cat_features_train = cat_train.drop(['price'], axis=1)
cat_target_train = cat_train['price']

cat_features_valid = cat_valid.drop(['price'], axis=1)
cat_target_valid = cat_valid['price']

In [None]:
from catboost import CatBoostRegressor

# 50 boosting iterations optimising RMSE; fit and predict are timed separately.
model = CatBoostRegressor(iterations=50, loss_function="RMSE")
t0 = time.time()

model.fit(
    cat_features_train,
    cat_target_train,
    cat_features=cat_cols,
    verbose=10,
)
t1 = time.time()

pred_cat = model.predict(cat_features_valid)
t2 = time.time()

# Validation RMSE plus wall-clock timings, stored as a one-row frame.
cat_RMSE = mean_squared_error(cat_target_valid, pred_cat) ** 0.5
cat_score = pd.DataFrame(
    {
        'rmse': [cat_RMSE],
        'training_time': [t1 - t0],
        'prediction_time': [t2 - t1],
    },
    index=['CatBoostRegressor'],
)

### LightGBM:

In [None]:
# LightGBM works on the raw frame with the categorical columns cast to the
# pandas 'category' dtype.
gbm_df = df.copy()

for feature in cat_cols:
    gbm_df[feature] = gbm_df[feature].astype("category")

gbm_train, gbm_valid = train_test_split(
    gbm_df,
    test_size=0.3,
    random_state=random,
)

# 'price' is the target; every other column is a feature.
gbm_target_train = gbm_train['price']
gbm_features_train = gbm_train.drop(columns=['price'])

gbm_target_valid = gbm_valid['price']
gbm_features_valid = gbm_valid.drop(columns=['price'])

In [None]:
import lightgbm as lgb
t0= time.time()

# create dataset for lightgbm
# Categorical columns are declared on the Dataset (the supported place for
# this option); the validation Dataset inherits them through `reference`.
lgb_train = lgb.Dataset(gbm_features_train, gbm_target_train, categorical_feature = cat_cols)
lgb_eval = lgb.Dataset(gbm_features_valid, gbm_target_valid, reference=lgb_train)

# specify configuration
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    # A list rather than a set: a set of strings has no stable iteration
    # order, so the order of the metrics could vary between runs.
    'metric': ['l2', 'l1'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# NOTE(review): the validation set drives early stopping and is also the set
# the RMSE below is computed on, so that RMSE is slightly optimistic.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets = [lgb_eval],
                callbacks = [lgb.early_stopping(stopping_rounds=5)])

t1= time.time()
# Predict with the iteration selected by early stopping.
y_pred = gbm.predict(gbm_features_valid, num_iteration = gbm.best_iteration)
t2= time.time()

# Validation RMSE plus wall-clock timings (training time includes Dataset construction).
lgbm_RMSE = mean_squared_error(gbm_target_valid, y_pred)**0.5
lgbm_score = {'rmse': [lgbm_RMSE], 'training_time': [t1 - t0 ],'prediction_time': [t2 - t1 ]}
lgbm_score = pd.DataFrame(lgbm_score, index = [ 'LightGBM' ])

### XGBoost:

In [None]:
import xgboost as xgb

# XGBoost with default settings on the one-hot encoded features; fit and
# predict are timed separately.
model = xgb.XGBRegressor()
t0 = time.time()

model.fit(features_train_hot, target_train)
t1 = time.time()

yhat = model.predict(features_valid_hot)
t2 = time.time()

# Validation RMSE plus wall-clock timings, stored as a one-row frame.
xgboost_RMSE = mean_squared_error(target_valid, yhat) ** 0.5
xgboost_score = pd.DataFrame(
    {
        'rmse': [xgboost_RMSE],
        'training_time': [t1 - t0],
        'prediction_time': [t2 - t1],
    },
    index=['XGBRegressor'],
)

## Model analysis
### Create model evaluation function:

In [None]:
def model_evaluation(model, name=""):
    '''Fit ``model`` on the one-hot training data and score it on validation.

    Uses the module-level ``features_train_hot`` / ``target_train`` for
    fitting and ``features_valid_hot`` / ``target_valid`` for scoring.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    name : str, optional
        Suffix appended to the model's class name in the result index.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``<ClassName><name>`` with the columns ``rmse``,
        ``training_time`` and ``prediction_time`` (times in seconds).
    '''
    t0 = time.time()

    model.fit(features_train_hot, target_train)
    t1 = time.time()

    predictions = model.predict(features_valid_hot)
    t2 = time.time()

    RMSE = mean_squared_error(target_valid, predictions)**0.5

    score = {'rmse': [RMSE], 'training_time': [t1 - t0], 'prediction_time': [t2 - t1]}

    # Take the label from the class itself rather than parsing str(model):
    # that only works when the repr has the "ClassName(...)" form.
    label = type(model).__name__ + name
    return pd.DataFrame(score, index=[label])

### Analyze the speed and quality of the models: 

In [None]:
# Collect one score row per model into a single comparison table.
# pd.concat replaces the chained DataFrame.append calls, which no longer
# exist in pandas 2.x; the row order is unchanged.
# The baseline tree and forest are seeded so their scores are reproducible.
model_score_ev = pd.concat([
    model_evaluation(LinearRegression()),
    model_evaluation(DecisionTreeRegressor(random_state=random)),
    model_evaluation(RandomForestRegressor(random_state=random)),
    rforest_op_score,
    dtree_op_score,
    cat_score,
    lgbm_score,
    xgboost_score,
])

* The difference in accuracy between the models is small, but the difference in time is large (sometimes by a factor of 10^3), so to mitigate the difference in scale I chose weights of 90% for RMSE and 10% for total time in the KPI.

In [None]:
# Total wall-clock cost of each model.
model_score_ev['total_time'] = model_score_ev['training_time'] + model_score_ev['prediction_time']

# KPI: 90% relative RMSE + 10% relative total time, both expressed relative
# to the LinearRegression row (lower is better).
baseline = model_score_ev.loc[['LinearRegression']]
weighted_rmse = 0.9 * model_score_ev['rmse'] / baseline.rmse.values
weighted_time = 0.1 * model_score_ev['total_time'] / baseline.total_time.values
model_score_ev['kpi'] = weighted_rmse + weighted_time

model_score_ev.sort_values(by='rmse').round(2)

## Conclusions:
* CatBoostRegressor is the most precise model for this dataset, but it is also very slow.
* DecisionTreeRegressor is the fastest model for this dataset.
* Taking both speed and precision into account, CatBoostRegressor is our best choice.