# Imports and dataset loading

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.multioutput import MultiOutputRegressor

import pandas as pd

# Legend labels for the two compared strategies, in the order their curves
# are drawn by the plotting functions.
strategyArray = ["Random Forest", "BRET"]
# Maximum tree depths evaluated for every strategy (2 through 15).
depthArray = np.arange(2, 16)
gainArray = []
lossArray = []

# Mean cross-validation errors, one entry per evaluated depth; these lists
# are appended to by trainModel.
gainRF = []
lossRF = []
gainBRET = []
lossBRET = []

# Raw data: comma-separated file without a header row.
originalDataSet = pd.read_csv('dataset', sep=',',header = None)
# Shared scaler: fitted in loadDataset and reused by invertNorm.
scaler = MinMaxScaler()

def loadDataset():
    """Normalise the raw data set and split it into inputs and targets.

    The shared ``scaler`` is fitted on every value of ``originalDataSet``
    flattened into a single column, so one global minimum/maximum is applied
    to all columns (``invertNorm`` depends on this single-feature fit).

    Returns:
        A ``(features, result)`` tuple holding the first four normalised
        columns and the remaining normalised columns, respectively.
    """
    rawValues = originalDataSet.values
    flatValues = rawValues.reshape(-1, 1)

    # Row order cannot influence a min/max fit, so no shuffling is done here.
    scaler.fit(flatValues)
    # Transform in the same single-column layout the scaler was fitted on and
    # restore the table shape afterwards; recent scikit-learn releases reject
    # input whose feature count differs from the fitted one.
    dataSet = scaler.transform(flatValues).reshape(rawValues.shape)

    features, result = np.array(dataSet[:, :4]), np.array(dataSet[:, 4:])
    return features, result

def invertNorm(value):
    """Map a single normalised value back to the original data scale.

    The value is placed at the top of a six-row column (the other five rows
    are zero padding), the shared ``scaler`` inverts that column, and only
    the first inverted entry is returned.
    """
    padded = np.array([value] + [0] * 5)
    column = padded.reshape(-1, 1)
    restored = scaler.inverse_transform(column)
    return restored[0][0]
    
def getError(value1, value2):
    """Return the absolute difference of two normalised values, measured
    after both have been converted back with ``invertNorm``."""
    first = invertNorm(value1)
    second = invertNorm(value2)
    return abs(first - second)

FileNotFoundError: [Errno 2] File dataset does not exist: 'dataset'

# Cross validation function

In [None]:
def crossValidate(regressor, features, result, folds = 5, errorFn = None):
    """Run an unshuffled k-fold cross validation and report mean errors.

    The samples are split, in their given order, into ``folds`` contiguous
    test folds whose sizes differ by at most one. For every fold the
    regressor is refitted on the remaining samples and the per-sample error
    of the first two predicted outputs is averaged over the real number of
    samples in that fold.

    Args:
        regressor: object exposing ``fit(X, y)`` and ``predict(X)``; it is
            refitted once per fold.
        features: 2-D array of inputs, one row per sample.
        result: 2-D array of targets; column 0 is treated as the gain and
            column 1 as the loss.
        folds: number of folds; must be between 1 and the number of samples.
        errorFn: callable ``(predicted, expected) -> error`` applied to each
            output value. Defaults to ``getError``.

    Returns:
        Two arrays of length ``folds``: the mean gain error and the mean
        loss error of each fold.

    Raises:
        ValueError: if ``folds`` is smaller than 1 or larger than the number
            of samples (which would leave at least one fold empty).
    """
    if errorFn is None:
        errorFn = getError

    sampleCount = features.shape[0]
    if folds < 1 or folds > sampleCount:
        raise ValueError(
            "folds must be between 1 and the number of samples (%d), got %d"
            % (sampleCount, folds)
        )

    gainErrors = []
    lossErrors = []

    # array_split keeps the folds contiguous and balanced, so no fold is ever
    # empty; when the sample count is a multiple of `folds` the boundaries
    # are i * foldSize, exactly as with fixed-size slicing.
    for testIndices in np.array_split(np.arange(sampleCount), folds):
        X_train = np.delete(features, testIndices, 0)
        y_train = np.delete(result, testIndices, 0)

        regressor.fit(X_train, y_train)

        X_test = features[testIndices]
        y_test = result[testIndices]

        prediction = regressor.predict(X_test)

        gainError = 0
        lossError = 0

        for predicted, expected in zip(prediction, y_test):
            gainError += errorFn(predicted[0], expected[0])
            lossError += errorFn(predicted[1], expected[1])

        # Average over the samples actually tested in this fold, not over a
        # nominal fold size, so shorter folds are not under-reported.
        testCount = len(testIndices)
        gainErrors.append(gainError / testCount)
        lossErrors.append(lossError / testCount)

    return np.array(gainErrors), np.array(lossErrors)

In [None]:
def trainModel(strategy):
    """Cross-validate ``strategy`` on the data set and record its mean errors.

    Prints the per-fold gain and loss errors together with their mean and
    twice their standard deviation, then appends both means to the random
    forest lists when ``strategy`` is a ``RandomForestRegressor`` and to the
    BRET lists otherwise.
    """
    features, result = loadDataset()
    gainErrors, lossErrors = crossValidate(strategy, features, result)

    gainMean = np.mean(gainErrors)
    lossMean = np.mean(lossErrors)
    summary = "=> %0.2f (+/- %0.2f)"

    print(gainErrors, summary % (gainMean, gainErrors.std() * 2))
    print(lossErrors, summary % (lossMean, lossErrors.std() * 2))

    # Pick the pair of result lists that belongs to this kind of strategy.
    if isinstance(strategy, RandomForestRegressor):
        gainTarget, lossTarget = gainRF, lossRF
    else:
        gainTarget, lossTarget = gainBRET, lossBRET

    gainTarget.append(gainMean)
    lossTarget.append(lossMean)
        

# Plotting functions

In [None]:
def plotGainError():
    """Plot the recorded gain errors of both strategies against tree depth.

    Reads the module-level ``depthArray``, ``gainRF`` and ``gainBRET`` and
    displays the figure with ``plt.show()``.
    """
    _, axis = plt.subplots(figsize = (10, 5))

    axis.plot(depthArray, gainRF, 'sb-', depthArray, gainBRET, 'or--')
    axis.set_title("Gain error")
    axis.set_ylabel("Absolute error")
    # Depth is plotted on the horizontal axis; labelling it through
    # set_ylabel would overwrite the "Absolute error" label set just above.
    axis.set_xlabel("Max depth")
    axis.legend(strategyArray)

    plt.show()

In [None]:
def plotLossError():
    """Plot the recorded loss errors of both strategies against tree depth.

    Reads the module-level ``depthArray``, ``lossRF`` and ``lossBRET`` and
    displays the figure with ``plt.show()``.
    """
    _, axis = plt.subplots(figsize = (10, 5))

    axis.plot(depthArray, lossRF, 'sb-', depthArray, lossBRET, 'or--')
    axis.set_title("Loss error")
    axis.set_ylabel("Absolute error")
    # Depth is plotted on the horizontal axis; labelling it through
    # set_ylabel would overwrite the "Absolute error" label set just above.
    axis.set_xlabel("Max depth")
    axis.legend(strategyArray)

    plt.show()

# Putting it all together

In [None]:
# Evaluate a 100-tree random forest once for every depth in depthArray.
for depth in depthArray:
    randomForest = RandomForestRegressor(
        n_estimators = 100,
        max_depth = depth,
        random_state = 0,
    )
    trainModel(randomForest)

print("Done!")

In [None]:
# Evaluate BRET (100 bagged extra-trees, wrapped so that one ensemble is
# fitted per output column) once for every depth in depthArray.
for depth in depthArray:
    # NOTE(review): scikit-learn 1.0 renamed this criterion to
    # "absolute_error" and later releases no longer accept "mae" -- confirm
    # which version this notebook targets before changing the literal.
    extra_tree = ExtraTreeRegressor(criterion = "mae", max_depth = depth, random_state=0)
    # The base estimator is passed positionally because its keyword name
    # changed from `base_estimator` to `estimator` between scikit-learn
    # releases; the first positional slot is the same in both.
    bret = MultiOutputRegressor(BaggingRegressor(extra_tree, n_estimators=100, random_state=0))
    trainModel(bret)

print("Done!")

# Plotting the error curves

In [None]:
# Draw the gain chart first, then the loss chart.
for drawChart in (plotGainError, plotLossError):
    drawChart()