In [4]:
import pandas as pd
import numpy as np

def import_data(path='norway_new_car_sales_by_make.csv'):
    """Load monthly car sales and pivot them into a Make x Period demand table.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. It must provide the columns 'Year', 'Month',
        'Make' and 'Quantity'. Defaults to the file name that was
        previously hard-coded, so existing ``import_data()`` calls behave
        as before.

    Returns
    -------
    pandas.DataFrame
        One row per 'Make' and one column per 'Period' (formatted as
        'YYYY-MM'); each cell holds the summed 'Quantity', with 0 where a
        make has no record for a period.
    """
    data = pd.read_csv(path)
    # Zero-pad the month so that the period labels sort chronologically.
    data['Period'] = data['Year'].astype(str) + '-' + data['Month'].astype(str).str.zfill(2)
    df = pd.pivot_table(
        data=data,
        values='Quantity',
        index='Make',
        columns='Period',
        aggfunc='sum',
        fill_value=0,
    )
    return df

In [5]:
def datasets(df, x_len=12, y_len=1, test_loops=12):
    """Cut a (series x periods) table into sliding-window training and test arrays.

    Every window spans ``x_len + y_len`` consecutive columns of ``df``: the
    first ``x_len`` columns become a feature row and the last ``y_len``
    columns the matching target. Windows are stacked window by window, each
    window contributing one row per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose ``.values`` is a 2-D array (rows = series, columns = periods).
    x_len : int, optional
        Number of periods used as features (must be >= 1).
    y_len : int, optional
        Number of periods used as target (must be >= 1).
    test_loops : int, optional
        Number of most recent windows held out as the test set. When it is
        not positive, no test set is held out: ``X_test`` is the last
        ``x_len`` periods of every series (the input for a future forecast)
        and ``Y_test`` is filled with NaN.

    Returns
    -------
    X_train, Y_train, X_test, Y_test : numpy.ndarray
        Feature and target arrays. When ``y_len == 1`` the targets are
        flattened to 1-D, as required by scikit-learn.

    Raises
    ------
    ValueError
        If ``x_len`` or ``y_len`` is smaller than 1, if ``df`` has fewer than
        ``x_len + y_len`` periods, or if ``test_loops`` would leave no
        window for training.
    """
    D = df.values
    rows, periods = D.shape

    if x_len < 1 or y_len < 1:
        # y_len == 0 would otherwise split at index -0 == 0 and put every
        # column into the target.
        raise ValueError(f"x_len and y_len must be >= 1 (got x_len={x_len}, y_len={y_len})")

    # Training set creation
    loops = periods + 1 - x_len - y_len
    if loops <= 0:
        raise ValueError(
            f"not enough periods ({periods}) to build a window of "
            f"x_len + y_len = {x_len + y_len} periods"
        )
    if test_loops >= loops:
        raise ValueError(
            f"test_loops={test_loops} leaves no training window "
            f"(only {loops} windows are available)"
        )

    window = x_len + y_len
    train = np.vstack([D[:, col:col + window] for col in range(loops)])
    X_train, Y_train = train[:, :x_len], train[:, x_len:]

    # Test set creation
    if test_loops > 0:
        # The last `test_loops` windows (rows rows each) are the most recent ones.
        cut = rows * test_loops
        X_train, X_test = X_train[:-cut], X_train[-cut:]
        Y_train, Y_test = Y_train[:-cut], Y_train[-cut:]
    else:  # No test set: X_test is used to generate the future forecast
        X_test = D[:, -x_len:]
        Y_test = np.full((X_test.shape[0], y_len), np.nan)

    # Formatting required for scikit-learn
    if y_len == 1:
        Y_train = Y_train.ravel()
        Y_test = Y_test.ravel()

    return X_train, Y_train, X_test, Y_test
         

In [6]:
# Build the Make x Period demand table, then cut it into training/test arrays:
# 12 periods of history as features, 1 period as target, and the 12 most
# recent windows held out as the test set.
df = import_data()
X_train, Y_train, X_test, Y_test = datasets(df, x_len=12, y_len=1, test_loops=12)
# NOTE(review): df is indexed by 'Make', and index=False leaves that index out
# of the exported sheet, so the rows of demand.xlsx carry no make label.
# Confirm this is intended.
df.to_excel('demand.xlsx', index=False)
print(df)

Period        2007-01  2007-02  2007-03  2007-04  2007-05  2007-06  2007-07  \
Make                                                                          
Alfa Romeo         16        9       21       20       17       21       14   
Aston Martin        0        0        1        0        4        3        3   
Audi              599      498      682      556      630      498      562   
BMW               352      335      365      360      431      477      403   
Bentley             0        0        0        0        0        1        0   
...               ...      ...      ...      ...      ...      ...      ...   
Think               2        0        0        1        0        0        0   
Toyota           2884     1885     1833     1300     1866     1620     1901   
Volkswagen       2521     1517     1428     1257     1934     1531     1777   
Volvo             693      570      656      587      805      662     1064   
Westfield           0        0        0        0    

In [7]:
from sklearn.tree import DecisionTreeRegressor

# Hyperparameters of the baseline regression tree
tree_settings = {'max_depth': 5, 'min_samples_split': 15, 'min_samples_leaf': 5}

# Build the regressor from those settings, then train it on the training set
tree = DecisionTreeRegressor(**tree_settings)
tree.fit(X_train, Y_train)

DecisionTreeRegressor(max_depth=5, min_samples_leaf=5, min_samples_split=15)

In [8]:
# Candidate hyperparameter values handed to the parameter search
max_depth = [*range(5, 11), None]   # 5..10, plus None
min_samples_split = range(5, 20)
min_samples_leaf = range(2, 20)
params_dist = dict(
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [11]:
def kpi_ML(Y_train, Y_train_pred, Y_test, Y_test_pred, name=''):
    """Print a Train/Test accuracy table with MAE, RMSE and Bias.

    Each metric is expressed as a percentage of the mean of the actual
    values of its own split. The error is taken as actual minus predicted.

    Parameters
    ----------
    Y_train, Y_train_pred : array-like
        Actual and predicted values of the training set.
    Y_test, Y_test_pred : array-like
        Actual and predicted values of the test set.
    name : str, optional
        Label stored as the name of the table's index.

    Returns
    -------
    None
        The table is only printed.
    """
    report = pd.DataFrame(columns=['MAE', 'RMSE', 'Bias'], index=['Train', 'Test'])
    report.index.name = name

    splits = (
        ('Train', Y_train, Y_train_pred),
        ('Test', Y_test, Y_test_pred),
    )
    for label, actual, predicted in splits:
        error = actual - predicted
        mean_actual = np.mean(actual)
        report.loc[label, 'MAE'] = 100 * np.mean(abs(error)) / mean_actual
        report.loc[label, 'RMSE'] = 100 * np.sqrt(np.mean(error ** 2)) / mean_actual
        report.loc[label, 'Bias'] = 100 * np.mean(error) / mean_actual

    print(report)

# Test different parameter combinations and return the best one

In [21]:
from sklearn.model_selection import RandomizedSearchCV
# Fix the seed so that the sampled parameter candidates, and therefore the
# reported best parameters, are the same on every run.
RANDOM_STATE = 42

tree = DecisionTreeRegressor(random_state=RANDOM_STATE)
# cv is the number of cross-validation folds; n_iter is the number of
# parameter settings sampled from params_dist.
tree_cv = RandomizedSearchCV(
    tree,
    params_dist,
    n_jobs=-1,
    cv=10,
    verbose=1,
    n_iter=100,
    scoring='neg_mean_absolute_error',
    random_state=RANDOM_STATE,
)

tree_cv.fit(X_train, Y_train)
print('Tuned Regression Tree Parameter', tree_cv.best_params_)

Fitting 10 folds for each of 1000 candidates, totalling 10000 fits
Tuned Regression Tree Parameter {'min_samples_split': 15, 'min_samples_leaf': 18, 'max_depth': 7}


In [22]:
# Predict both splits with the fitted search object, then print the accuracy table
Y_train_pred, Y_test_pred = (
    tree_cv.predict(features) for features in (X_train, X_test)
)
kpi_ML(Y_train, Y_train_pred, Y_test, Y_test_pred, name='Tree')

             MAE       RMSE      Bias
Tree                                 
Train  16.799632  41.369892       0.0
Test   18.128781  45.516976  3.086413


In [19]:
# Tabulate the search results held in tree_cv.cv_results_.
# NOTE: this rebinds `df`, which until now held the Make x Period demand table
# returned by import_data(); that table is no longer reachable under this name.
df = pd.DataFrame(tree_cv.cv_results_)


In [20]:
# Rebuild the results table from tree_cv on every run instead of concatenating
# onto the current `df`: the previous form appended another copy of the
# parameter columns each time the cell was re-executed.
cv_results = pd.DataFrame(tree_cv.cv_results_)
# Expand the per-candidate parameter dicts into one column per parameter.
df_params = pd.DataFrame(cv_results['params'].values.tolist())
# Parameter columns first, followed by the full cross-validation results.
df = pd.concat([df_params, cv_results], axis=1)
df.to_excel('Result.xlsx')