In [2]:
import pandas as pd
import numpy as np

def import_data():
    data = pd.read_csv('norway_new_car_sales_by_make.csv')
    data['Period'] = data['Year'].astype(str) + '-' + data['Month'].astype(str).str.zfill(2)
    df=pd.pivot_table(data=data, values='Quantity', index='Make', columns='Period', aggfunc='sum', fill_value=0)
    return df

In [3]:
def datasets(df, x_len=12, y_len=1, test_loops=12):
    
    D = df.values
    rows, periods = D.shape
    
    # Training set creation
    loops = periods + 1 - x_len - y_len
    train = []
    
    for col in range(loops):
        train.append(D[:,col:col+x_len+y_len])
    train = np.vstack(train)
    X_train, Y_train = np.split(train,[-y_len], axis=1)
    
    # Test set creation
    
    if test_loops>0:
        X_train, X_test = np.split(X_train, [-rows*test_loops], axis=0)
        Y_train, Y_test = np.split(Y_train, [-rows*test_loops], axis=0)
    else:  # No test set: X_test is ised to generate the future forecast
        X_test = D[:,-x_len:]
        Y_test = np.full((X_test.shape[0],y_len),np.nan)
        
    # Formatting required for scikit-learn
    
    if y_len == 1:
        Y_train = Y_train.ravel()
        Y_test = Y_test.ravel()
        
    return X_train, Y_train, X_test, Y_test
         

In [4]:
df = import_data()
X_train, Y_train, X_test, Y_test = datasets(df, x_len=12, y_len=1, test_loops=12)
df.to_excel('demand.xlsx', index=False)


In [5]:
def kpi_ML(Y_train, Y_train_pred, Y_test, Y_test_pred, name=''):
    df = pd.DataFrame(columns=['MAE', 'RMSE', 'Bias'], index = ['Train', 'Test'])
    df.index.name=name
    df.loc['Train', 'MAE'] = 100*np.mean(abs(Y_train-Y_train_pred))/np.mean(Y_train)
    df.loc['Train', 'RMSE'] = 100*np.sqrt(np.mean((Y_train-Y_train_pred)**2))/np.mean(Y_train)
    df.loc['Train', 'Bias'] = 100*np.mean((Y_train-Y_train_pred))/np.mean(Y_train)
    df.loc['Test', 'MAE'] = 100*np.mean(abs(Y_test-Y_test_pred))/np.mean(Y_test)
    df.loc['Test', 'RMSE'] = 100*np.sqrt(np.mean((Y_test-Y_test_pred)**2))/np.mean(Y_test)
    df.loc['Test', 'Bias'] = 100*np.mean((Y_test-Y_test_pred))/np.mean(Y_test)
    print(df)

In [9]:
from sklearn.ensemble import ExtraTreesRegressor
ETR = ExtraTreesRegressor(n_jobs=-1, n_estimators=200, min_samples_split=15, min_samples_leaf=4, max_samples=0.95, max_features=4, max_depth=8, bootstrap=True)
ETR.fit(X_train, Y_train)

Y_train_pred= ETR.predict(X_train)
Y_test_pred = ETR.predict(X_test)

kpi_ML(Y_train, Y_train_pred, Y_test, Y_test_predict, name = 'ETR')

             MAE       RMSE      Bias
ETR                                  
Train  17.818651  43.421932 -0.001547
Test   18.856165  46.733273  3.150073


### Optimization

#### Using random search chapter 14

In [12]:
from sklearn.model_selection import RandomizedSearchCV

max_depth = list(range(6,13)) + [None]
min_samples_split = range(7,16)
min_samples_leaf = range(2,13)
max_features = range(5,13)
bootstrap = [True]
max_samples = [0.7, 0.8, 0.9, 0.95, 1]

param_dist = {'max_depth': max_depth, 'min_samples_split': min_samples_split, 'min_samples_leaf': min_samples_leaf, 'max_features': max_features, 'bootstrap': bootstrap, 'max_samples': max_samples}

ETR = ExtraTreesRegressor(n_jobs=1, n_estimators=30)
ETR_cv = RandomizedSearchCV(ETR, param_dist, cv=5, verbose=2, n_jobs=-1, n_iter=400, scoring='neg_mean_absolute_error')
ETR_cv.fit(X_train, Y_train)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


RandomizedSearchCV(cv=5,
                   estimator=ExtraTreesRegressor(n_estimators=30, n_jobs=1),
                   n_iter=400, n_jobs=-1,
                   param_distributions={'bootstrap': [True],
                                        'max_depth': [6, 7, 8, 9, 10, 11, 12,
                                                      None],
                                        'max_features': range(5, 13),
                                        'max_samples': [0.7, 0.8, 0.9, 0.95, 1],
                                        'min_samples_leaf': range(2, 13),
                                        'min_samples_split': range(7, 16)},
                   scoring='neg_mean_absolute_error', verbose=2)

In [13]:
print('Tuned Forest Parameters:', ETR_cv.best_params_)
print()
Y_train_pred= ETR_cv.predict(X_train)
Y_test_pred = ETR_cv.predict(X_test)
kpi_ML(Y_train, Y_train_pred, Y_test, Y_test_predict, name = 'ETR optimized')

Tuned Forest Parameters: {'min_samples_split': 14, 'min_samples_leaf': 2, 'max_samples': 0.8, 'max_features': 12, 'max_depth': 8, 'bootstrap': True}

                     MAE       RMSE      Bias
ETR optimized                                
Train          15.624624  38.836382  0.161255
Test           18.856165  46.733273  3.150073


#### Using ETR with 200 trees

In [14]:
ETR = ExtraTreesRegressor(n_estimators=200, n_jobs=-1, **ETR_cv.best_params_).fit(X_train, Y_train)

Y_train_pred= ETR.predict(X_train)
Y_test_pred = ETR.predict(X_test)

kpi_ML(Y_train, Y_train_pred, Y_test, Y_test_predict, name = 'ETRx200')

## **ETR_cv is using best params from dictionary line 13

               MAE       RMSE      Bias
ETRx200                                
Train    15.521089  38.443071  0.060833
Test     18.856165  46.733273  3.150073
