In [1]:
#Adaptive Boosting

In [2]:
import numpy as np 
import pandas as pd
import math
import scipy.stats as stats
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
def import_data():
 data = pd.read_csv('norway_new_car_sales_by_make.csv')
 
 data['Period'] = data['Year'].astype(str) + '-' + data['Month'].astype(str).str.zfill(2)
 
 df = pd.pivot_table(data=data,values='Quantity',index='Make',columns='Period',aggfunc='sum',fill_value=0)
 return df

In [4]:
def datasets(df, x_len=12, y_len=1, test_loops=12):
 D = df.values
 rows, periods = D.shape
 # Training set creation
 loops = periods + 1- x_len- y_len
 train = []
 for col in range(loops):
    train.append(D[:,col:col+x_len+y_len])
 
 train = np.vstack(train)
 print(train.shape)
 X_train, Y_train = np.split(train,[-y_len],axis=1)
 # Test set creation
 if test_loops > 0:
    X_train, X_test = np.split(X_train,[-rows*test_loops],axis=0)

    Y_train, Y_test = np.split(Y_train,[-rows*test_loops],axis=0)
 else: # No test set: X_test is used to generate the future forecast
    X_test = D[:,-x_len:]
    Y_test = np.full((X_test.shape[0],y_len),np.nan) #Dummy value
 # Formatting required for scikit-learn
 if y_len == 1:
    Y_train = Y_train.ravel()
    Y_test = Y_test.ravel()
 return X_train, Y_train, X_test, Y_test

In [5]:
def kpi_ML(Y_train, Y_train_pred, Y_test, Y_test_pred, name=''):
 df = pd.DataFrame(columns = ['MAE','RMSE','Bias'],index=['Train','Test'])
 df.index.name = name
 
 df.loc['Train','MAE'] = 100*np.mean(abs(Y_train-Y_train_pred))/np.mean(Y_train)

 df.loc['Train','RMSE'] = 100*np.sqrt(np.mean((Y_train-Y_train_pred)**2))/np.mean(Y_train)
 
 df.loc['Train','Bias'] = 100*np.mean((Y_train- Y_train_pred))/np.mean(Y_train)

 df.loc['Test','MAE'] = 100*np.mean(abs(Y_test- Y_test_pred))/np.mean(Y_test)
 
 df.loc['Test','RMSE'] = 100*np.sqrt(np.mean((Y_test-Y_test_pred)**2))/np.mean(Y_test)
 
 df.loc['Test','Bias'] = 100*np.mean((Y_test- Y_test_pred))/np.mean(Y_test)
 df = df.astype(float).round(1) #Round number for display
 print(df)

In [6]:
df = import_data()
X_train, Y_train, X_test, Y_test = datasets(df, x_len=12, y_len=1,test_loops=12)

(7085, 13)


In [7]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=8), n_estimators=100, learning_rate=0.25, loss='square')
ada = ada.fit(X_train,Y_train)

In [8]:
Y_train_pred = ada.predict(X_train)
Y_test_pred = ada.predict(X_test)
kpi_ML(Y_train, Y_train_pred, Y_test, Y_test_pred, name='AdaBoost')

           MAE  RMSE  Bias
AdaBoost                  
Train     10.0  21.1  -0.5
Test      18.4  48.5   2.1


In [10]:
#parameter optimization
n_estimators = [100]
learning_rate = [0.005,0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35]
loss = ['square','exponential','linear']
param_dist = {'n_estimators': n_estimators,'learning_rate': learning_rate,'loss':loss}

In [11]:
def model_mae(model, X, Y):
 Y_pred = model.predict(X)
 mae = np.mean(np.abs(Y- Y_pred))/np.mean(Y)
 return mae

In [12]:
from sklearn.model_selection import RandomizedSearchCV
results = []
for max_depth in range(2,18,2):
 ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=max_depth))
 
 ada_cv = RandomizedSearchCV(ada, param_dist, n_jobs=-1, cv=6, n_iter=20, scoring='neg_mean_absolute_error')
 ada_cv.fit(X_train,Y_train)
 print('Tuned AdaBoost Parameters:',ada_cv.best_params_)
 print('Result:',ada_cv.best_score_)

 results.append([ada_cv.best_score_,ada_cv.best_params_,max_depth])
results = pd.DataFrame(data=results, columns=['Score','Best Params','Max Depth'])
optimal = results['Score'].idxmax()
print(results.iloc[optimal])

Tuned AdaBoost Parameters: {'n_estimators': 100, 'loss': 'square', 'learning_rate': 0.01}
Result: -47.88967219205247
Tuned AdaBoost Parameters: {'n_estimators': 100, 'loss': 'linear', 'learning_rate': 0.01}
Result: -33.90827547191039
Tuned AdaBoost Parameters: {'n_estimators': 100, 'loss': 'linear', 'learning_rate': 0.005}
Result: -31.65968371937797
Tuned AdaBoost Parameters: {'n_estimators': 100, 'loss': 'square', 'learning_rate': 0.005}
Result: -31.475350608064605
Tuned AdaBoost Parameters: {'n_estimators': 100, 'loss': 'linear', 'learning_rate': 0.005}
Result: -31.67021897136584
Tuned AdaBoost Parameters: {'n_estimators': 100, 'loss': 'exponential', 'learning_rate': 0.01}
Result: -31.811118698059953
Tuned AdaBoost Parameters: {'n_estimators': 100, 'loss': 'exponential', 'learning_rate': 0.005}
Result: -31.67195153609393
Tuned AdaBoost Parameters: {'n_estimators': 100, 'loss': 'exponential', 'learning_rate': 0.005}
Result: -31.680411893036467
Score                                    

In [13]:
#Model with optimal parameters
ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=8),n_estimators=100,learning_rate=0.005,loss='linear')
ada = ada.fit(X_train,Y_train)
Y_train_pred = ada.predict(X_train)
Y_test_pred = ada.predict(X_test)
kpi_ML(Y_train, Y_train_pred, Y_test, Y_test_pred, name='AdaBoost optimized')

                     MAE  RMSE  Bias
AdaBoost optimized                  
Train               10.8  24.8   0.5
Test                17.7  47.2   3.6
