In [None]:
from functions import *
import matplotlib.pyplot as plt
import numpy as np
import pickle

In [None]:
# Functions
def LoadBatch(file):
    """Load one pickled batch file from the ``data/`` directory.

    Args:
        file: File name, relative to ``data/``.

    Returns:
        A tuple ``(pixelDat, labelsOneHot, labels)`` where ``pixelDat`` is the
        object stored under ``b'data'``, ``labels`` is the object stored under
        ``b'labels'`` and ``labelsOneHot`` is a ``(len(labels), 10)`` float
        array with a single 1 per row at the label's index.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
    with open("data/" + file, 'rb') as fo:
        batch = pickle.load(fo, encoding='bytes')  # avoid shadowing the builtin `dict`

    pixelDat = batch[b'data']
    labels = batch[b'labels']

    # One-hot encode all labels at once instead of looping row by row.
    labelsOneHot = np.zeros((len(labels), 10))
    labelsOneHot[np.arange(len(labels)), np.asarray(labels, dtype=int)] = 1

    return pixelDat, labelsOneHot, labels

def CalcS(X, W, b):
    """Return the affine scores ``W @ X + b`` for one layer."""
    weighted = W @ X
    scores = weighted + b
    return scores

def getXk(X, W, b): # For the intermediary steps
    """Hidden-layer activation: ReLU applied to the affine scores of (X, W, b)."""
    pre_activation = CalcS(X, W, b)
    return np.maximum(0, pre_activation)

def getP(X, W, b): # For the final step
    """Output-layer result: ``softmax`` applied to the affine scores of (X, W, b)."""
    final_scores = CalcS(X, W, b)
    return softmax(final_scores)

def batchNormalize(S, mean, variance): # 11
    """Standardize every row of ``S`` with the supplied per-row statistics.

    Computes ``(S - mean) / sqrt(variance + eps)`` row-wise, where ``eps`` is
    the float machine epsilon.

    Args:
        S: 2-D array; one row per unit.
        mean: 1-D array with one mean per row of ``S``.
        variance: 1-D array with one variance per row of ``S``.

    Returns:
        Array with the same shape as ``S``.
    """
    centered = S - mean[:, np.newaxis]
    # A per-row scale replaces building and inverting a dense diagonal matrix,
    # which cost O(d^3) for a result that is just 1/sqrt(variance + eps).
    inv_std = 1.0 / np.sqrt(variance + np.finfo(float).eps)
    return centered * inv_std[:, np.newaxis]

def forwardPass(X_batch, WList, bList, gammaList=None, betaList=None, mean=None, var=None, doBatchNormalization=False):
    """Run the network forward and keep the intermediates needed for backprop.

    Every layer except the last is affine -> (optional batch norm with
    scale/shift) -> ReLU; the last layer is affine -> ``softmax``.

    Args:
        X_batch: Input array with one sample per column.
        WList, bList: Per-layer weights and biases; the final entries belong
            to the output layer. Neither list is modified.
        gammaList, betaList: Per-hidden-layer scale and shift, read only when
            ``doBatchNormalization`` is true.
        mean, var: Optional per-hidden-layer statistics (e.g. moving averages).
            When both are given and non-empty they are used for normalization
            instead of the statistics of the current batch. ``None`` or empty
            lists select batch statistics.
        doBatchNormalization: Enable batch normalization on hidden layers.

    Returns:
        ``(P, XList, S_hatList, meanList, varList, S_List)``. ``XList`` holds the
        input followed by each hidden activation. ``S_List`` always ends with
        the output-layer scores and, in batch-norm mode only, is preceded by the
        hidden-layer scores. ``S_hatList``/``meanList``/``varList`` are filled
        only in batch-norm mode.

    Raises:
        ValueError: If non-empty ``mean``/``var`` do not have one entry per
            hidden layer.
    """
    Xk = X_batch
    XList = [Xk] # Get history for gradient calculations
    S_List, S_hatList, meanList, varList = [], [], [], []

    # Slice rather than pop() so the caller's lists are never mutated.
    hiddenW, WLast = WList[:-1], WList[-1]
    hiddenb, bLast = bList[:-1], bList[-1]

    # Previously `mean`/`var` were overwritten and silently ignored, so the
    # moving averages passed at evaluation time had no effect.
    usePrecomputed = (mean is not None and var is not None
                      and len(mean) > 0 and len(var) > 0)
    if doBatchNormalization and usePrecomputed:
        if len(mean) != len(hiddenW) or len(var) != len(hiddenW):
            raise ValueError("mean/var must contain one entry per hidden layer")

    for index, (W, b) in enumerate(zip(hiddenW, hiddenb)): # Do all intermediary calculations
        if doBatchNormalization:
            S = CalcS(Xk, W, b)
            S_List.append(S)
            if usePrecomputed:
                layerMean, layerVar = mean[index], var[index]
            else:
                layerMean, layerVar = np.mean(S, axis=1), np.var(S, axis=1)
            S_hat = batchNormalize(S, layerMean, layerVar)
            S_tilda = np.multiply(S_hat, gammaList[index]) + betaList[index] # Apply scale and shift
            Xk = np.maximum(0, S_tilda)
            S_hatList.append(S_hat)
            meanList.append(layerMean)
            varList.append(layerVar)
        else:
            Xk = getXk(Xk, W, b)
        XList.append(Xk)

    S = CalcS(Xk, WLast, bLast)
    S_List.append(S)
    P = softmax(S) # Get the final probability
    return P, XList, S_hatList, meanList, varList, S_List

def batchNormBackPass(G_vec, S_Batch, mean, var):
    """Propagate the gradient ``G_vec`` back through the batch-norm standardization.

    Args:
        G_vec: Gradient with respect to the normalized scores, same shape as ``S_Batch``.
        S_Batch: The un-normalized scores, one sample per column.
        mean, var: Per-row statistics that were used to normalize ``S_Batch``.

    Returns:
        Gradient with respect to ``S_Batch``.
    """
    stabilised = var + np.finfo(float).eps
    inv_std = np.power(stabilised, -0.5).T[:, np.newaxis]        # (var + eps)^(-1/2), as a column
    inv_std_cubed = np.power(stabilised, -1.5).T[:, np.newaxis]  # (var + eps)^(-3/2), as a column
    n_cols = len(S_Batch[0])

    scaled = np.multiply(G_vec, inv_std)
    scaled_cubed = np.multiply(G_vec, inv_std_cubed)
    centered = S_Batch - mean[:, np.newaxis]
    row_coeff = np.sum(np.multiply(scaled_cubed, centered), axis=1)[:, np.newaxis]

    row_total = np.sum(scaled, axis=1)[:, np.newaxis]
    correction = np.multiply(centered, row_coeff)
    return scaled - (row_total / n_cols) - (correction / n_cols)

def backwardPass(P_batch, X_batch, XList, Y_batch, WList, lamb, n_batch, S_List, S_hatList=None, gammaList=None, meanList=None, varList=None, doBatchNormalization=False):
    """Back-propagate through the network and return the parameter gradients.

    All list arguments are expected in forward (input -> output) order, exactly
    as returned by the forward pass, and none of them (nor the arrays they
    hold) is modified.

    Args:
        P_batch: Network output, same shape as ``Y_batch``.
        X_batch: Network input, one sample per column.
        XList: Layer inputs; ``XList[k]`` for ``k >= 1`` is the activation fed
            into layer ``k``. ``X_batch`` is used as the input of layer 0.
        Y_batch: Targets, same shape as ``P_batch``.
        WList: Per-layer weight matrices.
        lamb: L2 regularization strength (adds ``2 * lamb * W`` to each weight gradient).
        n_batch: Divisor applied to the summed gradients.
        S_List: Per-layer scores; read only in batch-norm mode.
        S_hatList, gammaList, meanList, varList: Per-hidden-layer batch-norm
            intermediates and scales; read only in batch-norm mode.
        doBatchNormalization: Whether hidden layers used batch normalization.

    Returns:
        ``(dJdW_list, dJdb_list, dJdgamma_list, dJdbeta_list)`` in forward layer
        order. The last two are ``None`` when batch normalization is off.
    """
    num_layers = len(WList)
    last = num_layers - 1
    G_vec = -(Y_batch - P_batch) # Initialize G_vec

    dJdW_list = [None] * num_layers
    dJdb_list = [None] * num_layers
    dJdgamma_list = [None] * max(last, 0)
    dJdbeta_list = [None] * max(last, 0)

    # Walk from the output layer back to the first layer using indices, so the
    # caller's lists are never reversed or popped in place.
    for layer in range(last, -1, -1):
        if doBatchNormalization and layer < last:
            # Hidden layers: scale/shift gradients first, then undo the normalization.
            dJdgamma_list[layer] = np.sum(np.multiply(G_vec, S_hatList[layer]), axis=1)[:, np.newaxis] / n_batch # 25a
            dJdbeta_list[layer] = np.sum(G_vec, axis=1)[:, np.newaxis] / n_batch # 25b
            G_vec = G_vec * gammaList[layer] # 26
            G_vec = batchNormBackPass(G_vec, S_List[layer], meanList[layer], varList[layer]) # 27

        layer_input = X_batch if layer == 0 else XList[layer]
        dJdW_list[layer] = (G_vec @ layer_input.T) / n_batch + 2 * lamb * WList[layer]
        dJdb_list[layer] = np.sum(G_vec, axis=1)[:, np.newaxis] / n_batch

        if layer > 0:
            G_vec = WList[layer].T @ G_vec # Propagate to the previous layer
            # ReLU derivative as a fresh boolean mask; the cached activation is
            # left untouched instead of being overwritten with 0/1.
            G_vec = np.multiply(G_vec, XList[layer] > 0)

    if doBatchNormalization:
        return dJdW_list, dJdb_list, dJdgamma_list, dJdbeta_list
    return dJdW_list, dJdb_list, None, None

def ComputeCost(X, Y, W_list, b_list, lamb, gammaList, betaList, movingMeanList=[], movingVarList=[], doBatchNormalization=False): 
    """Compute the regularized cost and the plain cross-entropy loss.

    Args:
        X: Inputs with one sample per column.
        Y: One-hot targets with one sample per row.
        W_list, b_list: Network weights and biases.
        lamb: L2 regularization strength.
        gammaList, betaList: Batch-norm scale and shift parameters.
        movingMeanList, movingVarList: Statistics forwarded to ``forwardPass``.
            The list defaults are only ever copied, never mutated.
        doBatchNormalization: Forwarded to ``forwardPass``.

    Returns:
        ``(J, loss)`` where ``loss`` is the mean cross-entropy over the samples
        and ``J = loss + lamb * sum(W ** 2)`` over all weight matrices.
    """
    # Copies are passed because forwardPass may consume the lists it is given.
    P,_,_,_,_,_ = forwardPass(X, W_list.copy(), b_list.copy(), gammaList.copy(), betaList.copy(), movingMeanList.copy(), movingVarList.copy(), doBatchNormalization)

    # Clip away exact zeros so the log is finite; rows of P_t are samples.
    P_t = np.clip(P.T, 1e-15, None)
    n_samples = len(X[0]) # Number of columns of X, i.e. the number of samples

    # Vectorized cross-entropy. No `where=` mask: without an `out` array it
    # leaves unselected entries uninitialized, which would hide NaNs.
    loss = -np.sum(np.multiply(Y[:len(P_t)], np.log(P_t))) / n_samples
    J = loss + lamb * (np.sum([np.sum(np.power(W, 2)) for W in W_list])) # Regularization

    return J, loss

def ComputeAccuracy(X, y, W_list, b_list, gammaList, betaList, movingMeanList=[], movingVarList=[], doBatchNormalization=True): 
    """Return the fraction of columns of ``X`` whose arg-max prediction equals ``y``.

    ``X`` holds one sample per column and ``y[i]`` is the label of column ``i``.
    The remaining arguments are copied and forwarded to ``forwardPass``.
    """
    P, _, _, _, _, _ = forwardPass(
        X,
        W_list.copy(),
        b_list.copy(),
        gammaList.copy(),
        betaList.copy(),
        movingMeanList.copy(),
        movingVarList.copy(),
        doBatchNormalization,
    )
    n_samples = X.T.shape[0]
    per_sample = P.T  # one row of class scores per sample
    hits = 0
    for col in range(n_samples):
        if np.argmax(per_sample[col]) == y[col]:
            hits += 1
    return hits / n_samples

def updateWeights(Var_list, dJdVar_list, lr):
    """Take one gradient-descent step: return ``v - lr * g`` for each paired (v, g)."""
    stepped = []
    for current, gradient in zip(Var_list, dJdVar_list):
        stepped.append(current - lr * gradient)
    return stepped

def init_variables2(): # More training data
    """Load all six batch files and split them into train / validation / test.

    Each file is standardized per feature column and transposed so samples are
    stored in columns. ``data_batch_1/3/4/5`` go to training in full; the first
    5000 samples of ``data_batch_2`` form the validation set and the rest join
    training; ``test_batch`` is the test set.

    NOTE(review): every file is standardized with its own mean/std rather than
    with training-set statistics -- confirm this is intended.

    Returns:
        ``(X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, Y_test, y_test)``.
    """
    X_train = Y_train = y_train = None
    X_val = Y_val = X_test = Y_test = None
    train_only = ("data_batch_1", "data_batch_3", "data_batch_4", "data_batch_5")
    every_file = ("data_batch_1", "data_batch_2", "data_batch_3", "data_batch_4", "data_batch_5", "test_batch")

    for file in every_file:
        X, Y, y = LoadBatch(file)
        # Standardize with this file's own statistics, then store samples in columns.
        X = ((X - np.mean(X, axis=0)) / np.std(X, axis=0)).T

        if file in train_only:
            if X_train is None:
                X_train, Y_train, y_train = X, Y, y
            else:
                X_train = np.concatenate((X_train, X), axis=1)
                Y_train = np.concatenate((Y_train, Y), axis=0)
                y_train += y
        elif file == "data_batch_2":
            # First 5000 samples -> validation, remainder -> training.
            X_val, Y_val, y_val = X[:, :5000], Y[:5000], y[:5000]
            X_train = np.concatenate((X_train, X[:, 5000:]), axis=1)
            Y_train = np.concatenate((Y_train, Y[5000:]), axis=0)
            y_train += y[5000:]
        else:
            X_test, Y_test, y_test = X, Y, y

    return X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, Y_test, y_test

def init_variables(intermediary_layer_sizes=[50, 50], all=False): # Exercise 1, [50, 30, 20, 20, 10, 10, 10, 10]
    """Load the data splits and create freshly initialized network parameters.

    Args:
        intermediary_layer_sizes: Sizes of the hidden layers; an output layer
            of 10 units is appended.
        all: When true the splits come from ``init_variables2``; otherwise
            ``data_batch_1`` / ``data_batch_2`` / ``test_batch`` are used as
            train / validation / test.

    Returns:
        ``(X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, Y_test,
        y_test, W_list, b_list, eta_min, eta_max, n_s, gammaList, betaList)``.

    Side effects: seeds NumPy's global RNG with 111 and prints one line per layer.

    NOTE(review): weights are drawn with std ``2 / sqrt(fan_in)``; confirm this
    (rather than ``sqrt(2 / fan_in)``) is the intended initialization.
    """
    X_train = Y_train = X_val = Y_val = X_test = Y_test = None
    if all:
        (X_train, Y_train, y_train,
         X_val, Y_val, y_val,
         X_test, Y_test, y_test) = init_variables2()
    else:
        for file in ("data_batch_1", "data_batch_2", "test_batch"):
            X, Y, y = LoadBatch(file)
            # Per-file standardization, then samples stored in columns.
            X = ((X - np.mean(X, axis=0)) / np.std(X, axis=0)).T
            if file == "data_batch_1":
                X_train, Y_train, y_train = X, Y, y
            elif file == "data_batch_2":
                X_val, Y_val, y_val = X, Y, y
            else:
                X_test, Y_test, y_test = X, Y, y

    initExperiment = False  # switch to the fixed-sigma initialization below
    sigma = 0.0001 # 0.1, 0.001, 0.0001

    np.random.seed(111)
    eta_min, eta_max = 0.00001, 0.1
    K = 10 # Number of labels
    d = len(X_train.T[0]) # dimensionality

    layer_sizes = intermediary_layer_sizes + [K]
    final_index = len(layer_sizes) - 1
    W_list, b_list, gammaList, betaList = [], [], [], []
    prevSize = d
    for n, size in enumerate(layer_sizes): # Initialize weights and biases for all layers
        is_output = (n == final_index)
        rows = K if is_output else size
        if initExperiment:
            W_list.append(np.random.normal(0, sigma, (rows, prevSize)))
            print('experiment')
        else:
            W_list.append(np.random.normal(0, 2/np.sqrt(prevSize), (rows, prevSize)))
            print('standard')
        b_list.append(np.zeros((rows, 1)))
        if not is_output:
            # Only hidden layers carry batch-norm scale/shift parameters.
            gammaList.append(np.ones((size, 1)))
            betaList.append(np.zeros((size, 1)))
            prevSize = size

    n_s = 800
    return X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, Y_test, y_test, W_list, b_list, eta_min, eta_max, n_s, gammaList, betaList


In [None]:
# Load the data splits and initial parameters with the defaults (all=False).
# The returned n_s is discarded here (bound to `_`).
X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, Y_test, y_test, W_list, b_list, eta_min, eta_max,_, gammaList, betaList = init_variables()

In [None]:
# lamb = 0
# P, X_List, S_hatList, meanList, varList, S_List = forwardPass(X_train, W_list.copy(), gammaList.copy(), betaList.copy(), b_list.copy(), doBatchNormalization=True)

In [None]:
# dJdW_list, dJdb_list, dJdgamma_list, dJdbeta_list = backwardPass(P, X_batch, X_List.copy(), Y_batch,W_list.copy(), 0.001, 100, S_List.copy(), S_hatList.copy(),gammaList.copy(), meanList.copy(), varList.copy(), doBatchNormalization)

In [None]:
# import grads_check
# J,_,_ = ComputeCost(X_train, Y_train, W_list, b_list, lamb, gammaList, betaList, None, None, True)
# acc = ComputeAccuracy(X_train, y_train, W1, W2, b1, b2)
# grad_W1, grad_W2, grad_b1, grad_b2 = grads_check.ComputeGradients(X_train, Y_train, P, H, W1, W2, lamb, b_start=0, b_size=10)

# Train the network

In [None]:
# Module-level log of the learning rate used at each update step; MiniBatchGD appends to it.
lrhist = []
def getLr(t, eta_min, eta_max, n_s):
    """Triangular cyclical learning rate at step ``t``.

    The rate climbs linearly from ``eta_min`` to ``eta_max`` over ``n_s`` steps
    and then descends back over the next ``n_s`` steps, repeating.
    """
    span = eta_max - eta_min
    fraction = (t % n_s) / n_s
    ascending = (int(t / n_s)) % 2 == 0
    if ascending:
        return eta_min + span * fraction
    return eta_max - span * fraction

def suffleDatapoints(X, Y,y):
    """Shuffle columns of ``X``, rows of ``Y`` and entries of ``y`` with one shared random permutation.

    Returns the shuffled ``X``, the shuffled ``Y`` and ``y`` as a shuffled NumPy array.
    """
    order = np.random.permutation(len(Y))
    shuffled_X = np.transpose(np.transpose(X)[order])
    label_array = np.asarray(y)
    return shuffled_X, Y[order], label_array[order]
  
def MiniBatchGD(X, Y, y, W_list, b_list, lamb, n_epochs, n_batch, eta_min, eta_max, X_val, Y_val, y_val, n_s, gammaList, betaList):         
    """Train the network with mini-batch gradient descent and a cyclical learning rate.

    Batch normalization is always enabled. Metrics are recorded once before
    training and once after every epoch, so each history has ``n_epochs + 1``
    entries. Every learning rate used is also appended to the module-level
    ``lrhist`` list.

    Args:
        X, Y, y: Training inputs (samples in columns), one-hot targets (samples
            in rows) and integer labels.
        W_list, b_list: Initial weights and biases.
        lamb: L2 regularization strength.
        n_epochs: Number of passes over the training data.
        n_batch: Mini-batch size; trailing samples that do not fill a batch are skipped.
        eta_min, eta_max, n_s: Cyclical learning-rate bounds and half-cycle length.
        X_val, Y_val, y_val: Validation data in the same layout as the training data.
        gammaList, betaList: Initial batch-norm scale and shift parameters.

    Returns:
        ``(W_list, b_list, cost_hist, acc_hist, loss_hist, cost_hist_val,
        acc_hist_val, loss_hist_val, movingMeanList, movingVarList, gammaList,
        betaList)``.
    """
    acc_hist, cost_hist, loss_hist = [], [], []
    acc_hist_val, cost_hist_val, loss_hist_val = [], [], []
    doBatchNormalization = True
    movingMeanList = []
    movingVarList = []

    # Initial metrics. The batch-norm flag is passed explicitly so cost and
    # accuracy are measured on the same network (their defaults differ).
    acc = ComputeAccuracy(X, y, W_list, b_list, gammaList, betaList, movingMeanList, movingVarList, doBatchNormalization)
    cost, loss = ComputeCost(X, Y, W_list, b_list, lamb, gammaList, betaList, movingMeanList, movingVarList, doBatchNormalization)
    acc_hist.append(acc)
    cost_hist.append(cost)
    loss_hist.append(loss)

    acc = ComputeAccuracy(X_val, y_val, W_list, b_list, gammaList, betaList, movingMeanList, movingVarList, doBatchNormalization)
    cost, loss = ComputeCost(X_val, Y_val, W_list, b_list, lamb, gammaList, betaList, movingMeanList, movingVarList, doBatchNormalization)
    acc_hist_val.append(acc)
    cost_hist_val.append(cost)
    loss_hist_val.append(loss)

    t = 0
    lr = eta_min
    total_batches = int(len(Y)/n_batch)
    for epoch in range(n_epochs): # Main loop
        X, Y, y = suffleDatapoints(X, Y, y)
        for batch in range(total_batches):
            t += 1
            lrhist.append(lr)
            start = batch*n_batch
            stop = start + n_batch
            X_batch = X.T[start:stop].T
            Y_batch = Y[start:stop].T

            # Copies are passed because the passes may consume the lists they are given.
            P_Batch, X_List, S_hatList, meanList, varList, S_List = forwardPass(X_batch, W_list.copy(), b_list.copy(), gammaList.copy(), betaList.copy(), None, None, doBatchNormalization)

            dJdW_list, dJdb_list, dJdgamma_list, dJdbeta_list = backwardPass(P_Batch, X_batch, X_List.copy(), Y_batch, W_list.copy(), lamb, n_batch, S_List.copy(), S_hatList.copy(), gammaList.copy(), meanList.copy(), varList.copy(), doBatchNormalization)

            W_list = updateWeights(W_list, dJdW_list, lr)
            b_list = updateWeights(b_list, dJdb_list, lr)
            if doBatchNormalization:
                gammaList = updateWeights(gammaList, dJdgamma_list, lr)
                betaList = updateWeights(betaList, dJdbeta_list, lr)

            # Exponential moving average of the batch statistics. Seed it only
            # once; seeding on `batch == 0` discarded the history every epoch.
            if not movingMeanList:
                movingMeanList = list(meanList)
                movingVarList = list(varList)
            else:
                movingMeanList = [0.9*mm + 0.1*m for mm, m in zip(movingMeanList, meanList)]
                movingVarList = [0.9*mv + 0.1*v for mv, v in zip(movingVarList, varList)]
            lr = getLr(t, eta_min, eta_max, n_s)

        # Train
        acc = ComputeAccuracy(X, y, W_list, b_list, gammaList, betaList, movingMeanList, movingVarList, doBatchNormalization)
        cost, loss = ComputeCost(X, Y, W_list, b_list, lamb, gammaList, betaList, movingMeanList, movingVarList, doBatchNormalization)
        acc_hist.append(acc)
        cost_hist.append(cost)
        loss_hist.append(loss)
        # Validation
        acc = ComputeAccuracy(X_val, y_val, W_list, b_list, gammaList, betaList, movingMeanList, movingVarList, doBatchNormalization)
        cost, loss = ComputeCost(X_val, Y_val, W_list, b_list, lamb, gammaList, betaList, movingMeanList, movingVarList, doBatchNormalization)
        acc_hist_val.append(acc)
        cost_hist_val.append(cost)
        loss_hist_val.append(loss)
        print("Epoch:", epoch, ": Train accuracy:", acc_hist[-1], ", Val accuracy:", acc_hist_val[-1])

    return W_list, b_list, cost_hist, acc_hist, loss_hist, cost_hist_val, acc_hist_val, loss_hist_val, movingMeanList, movingVarList, gammaList, betaList



# Per-run result containers; each training run appends one entry to every list.
Whist, bhist, cost_hist_list, acc_hist_list, loss_hist_list, cost_hist_list_val, acc_hist_list_val, loss_hist_list_val = [], [], [], [], [], [], [], []
gammaListHist, betaListHist, movingMeanListHist, movingVarListHist = [], [], [], []
# Reload with all=True; this rebinds the data splits and parameters loaded by the earlier init_variables() cell.
X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, Y_test, y_test, W_list, b_list, eta_min, eta_max, n_s, gammaList, betaList = init_variables(all=True) 

n_epochs = 30 # 3 cycles
n_batch = 100
# Half-cycle length: five epochs' worth of update steps. This overrides the n_s
# returned above and is a float because of the true division.
n_s = 5 * int(len(X_train[0])) / n_batch
lamb = 0.005
# Train once, then store the results as the single entry of each history list.
W_list, b_list, cost_hist, acc_hist, loss_hist, cost_hist_val, acc_hist_val, loss_hist_val, movingMeanList, movingVarList, gammaList, betaList = MiniBatchGD(X=X_train, Y=Y_train, y=y_train, W_list=W_list, b_list=b_list, lamb=lamb, n_epochs=n_epochs, n_batch=n_batch, eta_min=eta_min, eta_max=eta_max, X_val=X_val, Y_val=Y_val, y_val=y_val, n_s=n_s, gammaList=gammaList, betaList=betaList)
Whist.append(W_list), bhist.append(b_list), cost_hist_list.append(cost_hist), acc_hist_list.append(acc_hist), loss_hist_list.append(loss_hist), cost_hist_list_val.append(cost_hist_val), acc_hist_list_val.append(acc_hist_val), loss_hist_list_val.append(loss_hist_val)
gammaListHist.append(gammaList), betaListHist.append(betaList), movingMeanListHist.append(movingMeanList), movingVarListHist.append(movingVarList)

# Search for a good lambda

In [None]:
def lambdaSearch():
    """Train one network per candidate lambda, plot accuracies and report test accuracy.

    Every candidate starts from the same freshly initialized parameters.

    Returns:
        ``(X_test, y_test, Whist, bhist, movingMeanList, movingVarList,
        gammaList, betaList)`` where ``Whist``/``bhist`` hold the trained
        weights and biases of every run and the remaining four values come from
        the last run.
    """
    Whist, bhist, cost_hist_list, acc_hist_list, loss_hist_list, cost_hist_list_val, acc_hist_list_val, loss_hist_list_val = [], [], [], [], [], [], [], []
    X_train, Y_train, y_train, X_val, Y_val, y_val, X_test, Y_test, y_test, W_list, b_list, eta_min, eta_max, n_s, gammaList, betaList = init_variables(all=True)
    gammaListHist, betaListHist, movingMeanListHist, movingVarListHist = [], [], [], []

    n_epochs = 30 # 1.5
    n_batch = 100
    n_s = 5 * int(len(X_train[0])) / n_batch
    lambdas = [0.00475] # , 0.01, 0.005, 0.001, 0.0005, 0.0001. ####### 0.01 - 0.001: 0.0025, 0.0075, 0.00375, 0.00675, 0.00425, 0.00575
    for lamb in lambdas:
        # Results are bound to separate names so the initial parameters stay
        # intact; reusing W_list/b_list/gammaList/betaList here made every later
        # lambda start from the previous run's trained network.
        (trainedW, trainedb, cost_hist, acc_hist, loss_hist,
         cost_hist_val, acc_hist_val, loss_hist_val,
         movingMeanList, movingVarList, trainedGamma, trainedBeta) = MiniBatchGD(
            X=X_train.copy(), Y=Y_train.copy(), y=y_train.copy(),
            W_list=W_list.copy(), b_list=b_list.copy(), lamb=lamb,
            n_epochs=n_epochs, n_batch=n_batch, eta_min=eta_min, eta_max=eta_max,
            X_val=X_val.copy(), Y_val=Y_val.copy(), y_val=y_val.copy(), n_s=n_s,
            gammaList=gammaList.copy(), betaList=betaList.copy())
        Whist.append(trainedW)
        bhist.append(trainedb)
        cost_hist_list.append(cost_hist)
        acc_hist_list.append(acc_hist)
        loss_hist_list.append(loss_hist)
        cost_hist_list_val.append(cost_hist_val)
        acc_hist_list_val.append(acc_hist_val)
        loss_hist_list_val.append(loss_hist_val)
        gammaListHist.append(trainedGamma)
        betaListHist.append(trainedBeta)
        movingMeanListHist.append(movingMeanList)
        movingVarListHist.append(movingVarList)

    x = list(range(n_epochs+1)) # One point before training plus one per epoch
    for index, [lamb, accval, acctrain] in enumerate(zip(lambdas, acc_hist_list_val, acc_hist_list)):
        plt.clf()
        plt.title("Accuracy graph, lambda = " + str(lamb))
        plt.plot(x, acctrain, label = "Training")
        plt.plot(x, accval, label = "Valuation")
        plt.legend()
        plt.show()
        print("Final test accuracy (lamda = "+str(lamb)+"):", ComputeAccuracy(X_test, y_test, Whist[index], bhist[index], gammaListHist[index], betaListHist[index], movingMeanListHist[index], movingVarListHist[index], True))
    return X_test, y_test, Whist, bhist, movingMeanList, movingVarList, trainedGamma, trainedBeta
        
# X_test, y_test, Whist, bhist, movingMeanList, movingVarList, gammaList, betaList = lambdaSearch()

# Plot

In [None]:
# Plot the learning rate recorded in lrhist against the update-step index.
x = [i for i in range(len(lrhist))]
plt.clf()
plt.title("Learning rate")
plt.plot(x, lrhist, label = "Lr")
plt.legend()
plt.show()

In [None]:
# Plotting: cost, loss and accuracy curves plus the test accuracy for every stored run.
# The x axis has n_epochs + 1 points to match the history lists (one entry before
# training plus one per epoch).
x = [i for i in range(n_epochs+1)]

for i in range(len(Whist)):
    # Unpack the i-th run from the history containers.
    Whist_t, bhist_t, cost_hist_t, acc_hist_t, loss_hist_t, cost_hist_val_t, acc_hist_val_t, loss_hist_val_t = Whist[i], bhist[i], cost_hist_list[i], acc_hist_list[i], loss_hist_list[i], cost_hist_list_val[i], acc_hist_list_val[i], loss_hist_list_val[i]
    gammaListHist_t, betaListHist_t, movingMeanListHist_t, movingVarListHist_t = gammaListHist[i], betaListHist[i], movingMeanListHist[i], movingVarListHist[i]
    # NOTE(review): `lamb` is the notebook-level value, not a per-run one, so the
    # same lambda is printed for every i.
    print("\nLambda:", lamb, "\n----------------------------------")
    plt.clf()
    plt.title("Cost graph")
    plt.plot(x, cost_hist_t, label = "Training")
    plt.plot(x, cost_hist_val_t, label = "Valuation")
    plt.legend()
    plt.show()
    plt.clf()
    plt.title("Loss graph")
    plt.plot(x, loss_hist_t, label = "Training")
    plt.plot(x, loss_hist_val_t, label = "Valuation")
    plt.legend()
    plt.show()
    plt.clf()
    plt.title("Accuracy graph")
    plt.plot(x, acc_hist_t, label = "Training")
    plt.plot(x, acc_hist_val_t, label = "Valuation")
    plt.legend()
    plt.show()
    # Test accuracy of this run, evaluated with batch normalization enabled.
    print("Final test accuracy:", ComputeAccuracy(X_test, y_test, Whist_t, bhist_t, gammaListHist_t, betaListHist_t, movingMeanListHist_t, movingVarListHist_t, True))