#COMP 551 A2 - Team 51
Mike Gao, Isaac Meadowcroft-Grijalva, Linhui Huang


In [None]:
import numpy as np
import timeit
np.random.seed(42)

In [None]:
# load datasets
from sklearn.datasets import load_digits
from sklearn.datasets import fetch_openml

1. Digits Dataset

In [None]:
digits_x, digits_y = load_digits(return_X_y=True)
digits_y = digits_y.astype(int)
print(digits_x.shape, digits_y.shape, np.unique(digits_y))

(1797, 64) (1797,) [0 1 2 3 4 5 6 7 8 9]


2. OpenML Iris Dataset

In [None]:
iris_x, iris_y = fetch_openml(name='iris', version=1, return_X_y=True)
iris_label_map = {k: v for v, k in enumerate(sorted(list(set(iris_y))))}
vectorize = np.vectorize(lambda map_, x: map_[x])
iris_y = vectorize(iris_label_map, iris_y) 
print(iris_x.shape, iris_y.shape, np.unique(iris_y))

(150, 4) (150,) [0 1 2]


3. OpenML Letters Dataset

In [None]:
letter_x, letter_y = fetch_openml(name='letter', version=1, return_X_y=True)
letter_label_map = {k: v for v, k in enumerate(sorted(list(set(letter_y))))}
vectorize = np.vectorize(lambda map_, x: map_[x])
letter_y = vectorize(letter_label_map, letter_y) 
print(letter_x.shape, letter_y.shape, np.unique(letter_y))

(20000, 16) (20000,) [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]


In [None]:
def softmax(x):
      return np.exp(x) / np.sum(np.exp(x), axis=0)

def one_hot_encoding(y, k):
    one_hot = np.zeros((len(y), k))
    one_hot[np.arange(len(y)), y] = 1
    return one_hot
    
class SoftmaxRegression:
    def __init__(self, total_cls):
        self.total_cls = total_cls
        self.w = None

    def fit(self, x_train, y_train, x_val, y_val, optimizer):
        N,D = x_train.shape
        def gradient(x, y, w):
            yh =  softmax(np.matmul(x, w))
            grad = 1/N * np.matmul(x.T, (yh - one_hot_encoding(y, self.total_cls)))
            return grad
        w0 = np.zeros((D, self.total_cls))
        self.w, history = optimizer.run(gradient, x_train, y_train, x_val, y_val, w0, self.total_cls)
        return history
    
    def predict(self, x):
        return softmax(np.matmul(x, self.w))

In [None]:
from collections import defaultdict
from sklearn.metrics import mean_squared_error

In [None]:
class MiniBatchGradientDescent:

  def __init__(self, batch_size = 256, learning_rate = 0.01, momentum = 0.9, l2_regularization = 0.1, termination_step = 20):
      self.batch_size = batch_size
      self.learning_rate = learning_rate
      self.momentum = momentum
      self.l2_regularization = l2_regularization
      self.termination_step = termination_step

  def create_mini_batches(self, x):
      lst = list(range(len(x)))
      np.random.shuffle(lst)
      return np.array_split(lst, self.batch_size)

  def cost(self, x, y, w, k):
      z = np.matmul(x, w)
      z -= np.max(z)
      return np.mean(-1 * np.sum(one_hot_encoding(y, k) * z - np.log(np.sum(np.exp(z)))))

  def accuracy(self, x, y, w):
      a = np.argmax(softmax(np.matmul(x, w)), axis=1)
      sum = np.count_nonzero(a==y)
      return sum/len(y)
      # return mean_squared_error(np.argmax(softmax(np.matmul(x, w)), axis=1), y)

  def run(self, gradient_function, x_train, y_train, x_validation, y_validation, w, k):
      grad = np.inf
      validation_best = np.inf
      delta_w = 0
      best_w = w.copy()
      step = 0
      history = defaultdict(list)
      termination = False
      while not termination:
          minibatch_ttl = self.create_mini_batches(x_train)
          for minibatch_idx in minibatch_ttl:
              x_batch = x_train[minibatch_idx]
              y_batch = y_train[minibatch_idx]
              gradient = gradient_function(x_batch, y_batch, w) 
              history["gradient"].append(gradient)
              gradient += (np.linalg.norm(w) ** 2) * self.l2_regularization * 0.5
              delta_w = self.momentum * delta_w + (1 - self.momentum) * gradient
              w -= self.learning_rate * delta_w
              history["cost"].append(self.cost(x_batch, y_batch, w, k))
              history["train_accuracy"].append(self.accuracy(x_batch, y_batch, w))
              history["val_accuracy"].append(self.accuracy(x_validation, y_validation, w))
              validation_error = mean_squared_error(np.argmax(softmax(np.matmul(x_validation, w)), axis=1), y_validation)

              if validation_error < validation_best:
                  validation_best = validation_error
                  best_w = w.copy()
                  step = 0
              else:
                  step += 1
                  if step >= self.termination_step:
                      termination = True
                      history["training_stop"] = [step]
                      break
      return best_w, history

In [None]:
def k_fold_cross_val(x, k = 5):
    lst = list(range(len(x)))
    np.random.shuffle(lst)
    folds = np.array_split(lst, k)
    for i in range(k):
        test = np.concatenate(*[folds[:i] + folds[i+1:]])
        val = folds[i]
        yield test.flatten(), val

In [None]:
def run_k_fold_cross_val(x, y, model, optimizer):
    arr = []
    cost = []
    train_acc = []
    val_acc = []
    converge = []
    for i, (train_id, val_id) in enumerate(k_fold_cross_val(x)):
        x_train, y_train = x[train_id], y[train_id]
        x_val, y_val = x[val_id], y[val_id]
        history = model.fit(x_train, y_train, x_val, y_val, optimizer)
        arr.append(history)
        cost.append(history["cost"])
        train_acc.append(history["train_accuracy"])
        val_acc.append(history["val_accuracy"])
        conv = True
        # check for convergence: L2 distance between each pair of last 20 gradients
        # if 1 run among the 5-fold cross-validation diverges, the set of parameters diverges
        for i in np.arange(len(history["gradient"])-20, len(history["gradient"])-15):
            dist = np.linalg.norm(history["gradient"][i] - history["gradient"][i+1])
            # print(dist)
            if dist > 1:
                conv = False
                break
        converge.append(conv)
        # print(conv)
    # if any(converge):
    return arr, cost, train_acc, val_acc, sum(converge)/5
    # else:
    #   return arr, cost, train_acc, val_acc, False

Softmax Regression Digits Dataset (with sample hyperparameters)

In [None]:
np.seterr(divide='ignore', invalid='ignore')
model1 = SoftmaxRegression(10)
optimizer1 = MiniBatchGradientDescent(batch_size = 250, learning_rate = 0.01, momentum = 0.9)
digits_out, digits_out_cost, digits_out_train_acc, digits_out_val_acc, digits_out_converge = run_k_fold_cross_val(digits_x, digits_y, model1, optimizer1)
print("convergent: ", digits_out_converge)

convergent:  1.0


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Visualize Softmax Regression
def plot(data, title, x_label, y_label):
    fig = go.Figure()
    for i in np.arange(len(data)):
        fig.add_trace(go.Scatter(x=np.arange(len(data[i])), y=data[i], mode='lines', name='run '+str(i+1)))
    fig.update_layout(title=title, xaxis_title=x_label, yaxis_title=y_label, autosize=False, width=700, height=300, margin=dict(l=50,r=50,b=30,t=30,pad=2))
    fig.show()

print('Softmax Regression of Digits Dataset (5-fold cross-validation)')
print('default parameters: batch_size = 250, learning_rate = 0.01, momentum = 0.9')
plot(digits_out_cost, 'Cost', 'Iterations', 'Cost')
plot(digits_out_train_acc, 'Training Accuracy', 'Iterations', 'Training Accuracy')
plot(digits_out_val_acc, 'Validation Accuracy', 'Iterations', 'Validation Accuracy')

Softmax Regression of Digits Dataset (5-fold cross-validation)
default parameters: batch_size = 250, learning_rate = 0.01, momentum = 0.9


Softmax Regression OpenML Iris Dataset (with sample hyperparameters)

In [None]:
model2 = SoftmaxRegression(3)
optimizer2 = MiniBatchGradientDescent(batch_size = 10, learning_rate = 0.01, momentum = 0.9)
iris_out, iris_out_cost, iris_out_train_acc, iris_out_val_acc, iris_out_converge = run_k_fold_cross_val(iris_x, iris_y, model2, optimizer2)
print("convergent: ", iris_out_converge)

convergent:  1.0


In [None]:
# Visualize Softmax Regression
print('Softmax Regression of Iris Dataset (5-fold cross-validation)')
print('default parameters: batch_size = 10, learning_rate = 0.01, momentum = 0.9')
plot(iris_out_cost, 'Cost', 'Iterations', 'Cost')
plot(iris_out_train_acc, 'Training Accuracy', 'Iterations', 'Training Accuracy')
plot(iris_out_val_acc, 'Validation Accuracy', 'Iterations', 'Validation Accuracy')

Softmax Regression of Iris Dataset (5-fold cross-validation)
default parameters: batch_size = 10, learning_rate = 0.01, momentum = 0.9


Softmax Regression OpenML Letters Dataset (with sample hyperparameters)

In [None]:
model3 = SoftmaxRegression(26)
optimizer3 = MiniBatchGradientDescent(batch_size = 30, learning_rate = 0.5, momentum = 0.9)
letter_out, letter_out_cost, letter_out_train_acc, letter_out_val_acc, letter_out_converge = run_k_fold_cross_val(letter_x, letter_y, model3, optimizer3)
print("convergent: ", letter_out_converge)

convergent:  0.8


In [None]:
# Visualize Softmax Regression
print('Softmax Regression of Letters Dataset (5-fold cross-validation)')
print('default parameters: batch_size = 30, learning_rate = 0.01, momentum = 0.9')
plot(letter_out_cost, 'Cost', 'Iterations', 'Cost')
plot(letter_out_train_acc, 'Training Accuracy', 'Iterations', 'Training Accuracy')
plot(letter_out_val_acc, 'Validation Accuracy', 'Iterations', 'Validation Accuracy')

Softmax Regression of Letters Dataset (5-fold cross-validation)
default parameters: batch_size = 30, learning_rate = 0.01, momentum = 0.9


Optimizing hyperparameters

In [None]:
def create_hyperparams_grid(x, y, model, parameters):
    graph_b = []
    graph_l = []
    graph_m = []
    graph_time = []
    graph_train_accu = []
    graph_val_accu = []
    graph_train_accu_avg = []
    graph_val_accu_avg = []
    graph_conv = []
    for batch_size in parameters['batch_size']:
        graph_b_row = []
        graph_l_row = []
        graph_m_row = []
        graph_time_row = []
        graph_train_accu_row = []
        graph_val_accu_row = []
        graph_train_accu_avg_row = []
        graph_val_accu_avg_row = []
        graph_conv_row = []
        for learning_rate in parameters['learning_rate']:
            graph_b_row_row = []
            graph_l_row_row = []
            graph_m_row_row = []
            graph_time_row_row = []
            graph_train_accu_row_row = []
            graph_val_accu_row_row = []
            graph_train_accu_avg_row_row = []
            graph_val_accu_avg_row_row = []
            graph_conv_row_row = []
            for momentum in parameters['momentum']:
                optimizer = MiniBatchGradientDescent(batch_size=batch_size, learning_rate=learning_rate, momentum=momentum)
                start = timeit.default_timer()
                out, out_cost, out_train_accu, out_val_accu, out_conv = run_k_fold_cross_val(x, y, model, optimizer)
                stop = timeit.default_timer()
                graph_time_row_row.append(stop - start)
                graph_train_accu_row_row.append(out_train_accu)
                graph_val_accu_row_row.append(out_val_accu)
                graph_conv_row_row.append(out_conv)
                # average of max training accuracy (among the 5-fold) as the training accuracy of this set of parameters
                train = []
                for row in out_train_accu:
                    train.append(np.amax(row))
                train_acc = np.average(train)
                # average of max validation accuracy (among the 5-fold) as the validation accuracy of this set of parameters
                val = []
                for row in out_val_accu:
                    val.append(np.amax(row))
                val_acc = np.average(val)
                graph_b_row_row.append(batch_size)
                graph_l_row_row.append(learning_rate)
                graph_m_row_row.append(momentum)
                graph_train_accu_avg_row_row.append(train_acc)
                graph_val_accu_avg_row_row.append(val_acc)
            graph_b_row.append(graph_b_row_row)
            graph_l_row.append(graph_l_row_row)
            graph_m_row.append(graph_m_row_row)
            graph_time_row.append(graph_time_row_row)
            graph_train_accu_row.append(graph_train_accu_row_row)
            graph_val_accu_row.append(graph_val_accu_row_row)
            graph_train_accu_avg_row.append(graph_train_accu_avg_row_row)
            graph_val_accu_avg_row.append(graph_val_accu_avg_row_row)
            graph_conv_row.append(graph_conv_row_row)
        graph_b.append(graph_b_row)
        graph_l.append(graph_l_row)
        graph_m.append(graph_m_row)
        graph_time.append(graph_time_row)
        graph_train_accu.append(graph_train_accu_row)
        graph_val_accu.append(graph_val_accu_row)
        graph_train_accu_avg.append(graph_train_accu_avg_row)
        graph_val_accu_avg.append(graph_val_accu_avg_row)
        graph_conv.append(graph_conv_row)
    graph_b=np.array(graph_b)
    graph_l=np.array(graph_l)
    graph_m=np.array(graph_m)
    graph_time=np.array(graph_time)
    graph_train_accu=np.array(graph_train_accu)
    graph_val_accu=np.array(graph_val_accu)
    graph_train_accu_avg=np.array(graph_train_accu_avg)
    graph_val_accu_avg=np.array(graph_val_accu_avg)
    graph_conv=np.array(graph_conv)
    return graph_b, graph_l, graph_m, graph_time, graph_train_accu, graph_val_accu, graph_train_accu_avg, graph_val_accu_avg, graph_conv

def visualize_change_param(graph_b, graph_l, graph_m, graph_time, graph_train_accu, graph_val_accu, graph_train_accu_avg, graph_val_accu_avg, graph_conv):
    z_value = np.max(graph_val_accu_avg)
    pos_z = np.argwhere(graph_val_accu_avg == z_value)[0]
    print('Hyperparameters for maximum validation accuracy:')
    print('validation accuracy: %.4f' %(z_value))
    print('batch size: %.2f' %(graph_b[pos_z[0],pos_z[1],pos_z[2]]))
    print('learning rate: %.2f' %(graph_l[pos_z[0],pos_z[1],pos_z[2]]))
    print('momentum: %.2f' %(graph_m[pos_z[0],pos_z[1],pos_z[2]]))
    print('run time: %f seconds' %(graph_time[pos_z[0],pos_z[1],pos_z[2]]))
    plot(graph_train_accu[pos_z[0],pos_z[1],pos_z[2]], 'Training Accuracy Curve', 'Iterations', 'Training Accuracy')
    plot(graph_val_accu[pos_z[0],pos_z[1],pos_z[2]], 'Validation Accuracy Curve', 'Iterations', 'Validation Accuracy')

    print('Changing batch size:')
    print('learning rate: %.2f' %(graph_l[pos_z[0],pos_z[1],pos_z[2]]))
    print('momentum: %.2f' %(graph_m[pos_z[0],pos_z[1],pos_z[2]]))
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(go.Scatter(x=graph_b[:,pos_z[1],pos_z[2]], y=graph_train_accu_avg[:,pos_z[1],pos_z[2]], name="average training accuracy"),secondary_y=False)
    fig.add_trace(go.Scatter(x=graph_b[:,pos_z[1],pos_z[2]], y=graph_val_accu_avg[:,pos_z[1],pos_z[2]], name="average validation accuracy"),secondary_y=False)
    fig.add_trace(go.Scatter(x=graph_b[:,pos_z[1],pos_z[2]], y=graph_time[:,pos_z[1],pos_z[2]], name="run time"),secondary_y=True)
    fig.update_layout(title='Changing batch size', xaxis_title="batch size", 
                      autosize=False, width=600, height=300, margin=dict(l=50,r=50,b=30,t=30,pad=2),showlegend=False)
    fig.update_yaxes(title_text="average training/validation accuracy", secondary_y=False)
    fig.update_yaxes(title_text="run time", secondary_y=True)
    fig.update_xaxes(tick0=0, dtick=10)
    fig.show()

    print('Changing learning rate:')
    print('batch size: %.2f' %(graph_b[pos_z[0],pos_z[1],pos_z[2]]))
    print('momentum: %.2f' %(graph_m[pos_z[0],pos_z[1],pos_z[2]]))
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(go.Scatter(x=graph_l[pos_z[0],:,pos_z[2]], y=graph_train_accu_avg[pos_z[0],:,pos_z[2]], name="average training accuracy"))
    fig.add_trace(go.Scatter(x=graph_l[pos_z[0],:,pos_z[2]], y=graph_val_accu_avg[pos_z[0],:,pos_z[2]], name="average validation accuracy"))
    fig.add_trace(go.Scatter(x=graph_l[pos_z[0],:,pos_z[2]], y=graph_time[pos_z[0],:,pos_z[2]], name="run time", yaxis="y2"))
    fig.add_trace(go.Scatter(x=graph_l[pos_z[0],:,pos_z[2]], y=graph_conv[pos_z[0],:,pos_z[2]], name="percentage convergence", yaxis="y3"))
    fig.update_layout(title='Changing learning rate', xaxis_title="learning rate", 
                      autosize=False, width=650, height=300, margin=dict(l=50,r=50,b=30,t=30,pad=2),showlegend=False,
                      xaxis=dict(domain=[0, 0.8]),
                      yaxis=dict(title="average training/validation accuracy"),
                      yaxis2=dict(title="run time",anchor="x",overlaying="y",side="right"),
                      yaxis3=dict(title="percentage convergence",anchor="free",overlaying="y",side="right",position=0.93))
    fig.update_xaxes(tick0=0, dtick=0.1)
    fig.show()

    print('Changing momentum:')
    print('batch size: %.2f' %(graph_b[pos_z[0],pos_z[1],pos_z[2]]))
    print('learning rate: %.2f' %(graph_l[pos_z[0],pos_z[1],pos_z[2]]))
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(go.Scatter(x=graph_m[pos_z[0],pos_z[1],:], y=graph_train_accu_avg[pos_z[0],pos_z[1],:], name="average training accuracy"),secondary_y=False)
    fig.add_trace(go.Scatter(x=graph_m[pos_z[0],pos_z[1],:], y=graph_val_accu_avg[pos_z[0],pos_z[1],:], name="average validation accuracy"),secondary_y=False)
    fig.add_trace(go.Scatter(x=graph_m[pos_z[0],pos_z[1],:], y=graph_time[pos_z[0],pos_z[1],:], name="run time"),secondary_y=True)
    fig.update_layout(title='Changing momentum', xaxis_title="momentum", 
                      autosize=False, width=600, height=300, margin=dict(l=50,r=50,b=30,t=30,pad=2),showlegend=False)
    fig.update_yaxes(title_text="average training/validation accuracy", secondary_y=False)
    fig.update_yaxes(title_text="run time", secondary_y=True)
    fig.update_xaxes(tick0=0, dtick=0.1)
    fig.show()

Grid Search - Digits Dataset

In [None]:
parameters = defaultdict(list)
parameters['batch_size'] = [10, 25, 50, 75, 100, 125, 150, 175, 200, 250]
parameters['learning_rate'] = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.5, 0.6, 0.7]
parameters['momentum'] = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]

model1 = SoftmaxRegression(10)
graph_b, graph_l, graph_m, graph_time, graph_train_accu, graph_val_accu, graph_train_accu_avg, graph_val_accu_avg, graph_conv = create_hyperparams_grid(digits_x, digits_y, model1, parameters)

In [None]:
visualize_change_param(graph_b, graph_l, graph_m, graph_time, graph_train_accu, graph_val_accu, graph_train_accu_avg, graph_val_accu_avg, graph_conv)

Hyperparameters for maximum validation accuracy:
validation accuracy: 0.9110
batch size: 75.00
learning rate: 0.10
momentum: 0.90
run time: 0.359253 seconds


Changing batch size:
learning rate: 0.10
momentum: 0.90


Changing learning rate:
batch size: 75.00
momentum: 0.90


Changing momentum:
batch size: 75.00
learning rate: 0.10


Grid Search - OpenML Iris Dataset

In [None]:
parameters = defaultdict(list)
parameters['batch_size'] = [5, 10, 20, 30, 40, 50, 75, 100, 125, 150]
parameters['learning_rate'] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
parameters['momentum'] = [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.9]

model2 = SoftmaxRegression(3)
graph_b2, graph_l2, graph_m2, graph_time2, graph_train_accu2, graph_val_accu2, graph_train_accu_avg2, graph_val_accu_avg2, graph_conv2 = create_hyperparams_grid(iris_x, iris_y, model2, parameters)

In [None]:
visualize_change_param(graph_b2, graph_l2, graph_m2, graph_time2, graph_train_accu2, graph_val_accu2, graph_train_accu_avg2, graph_val_accu_avg2, graph_conv2)

Hyperparameters for maximum validation accuracy:
validation accuracy: 0.9800
batch size: 10.00
learning rate: 0.80
momentum: 0.30
run time: 0.063585 seconds


Changing batch size:
learning rate: 0.80
momentum: 0.30


Changing learning rate:
batch size: 10.00
momentum: 0.30


Changing momentum:
batch size: 10.00
learning rate: 0.80


Grid Search - OpenML Letters Dataset

In [None]:
parameters = defaultdict(list)
parameters['batch_size'] = [5, 10, 25, 50, 75, 100, 150, 200]
parameters['learning_rate'] = [0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5]
parameters['momentum'] = [0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.98]

model3 = SoftmaxRegression(26)
graph_b3, graph_l3, graph_m3, graph_time3, graph_train_accu3, graph_val_accu3, graph_train_accu_avg3, graph_val_accu_avg3, graph_conv3 = create_hyperparams_grid(letter_x, letter_y, model3, parameters)

In [None]:
visualize_change_param(graph_b3, graph_l3, graph_m3, graph_time3, graph_train_accu3, graph_val_accu3, graph_train_accu_avg3, graph_val_accu_avg3, graph_conv3)

Hyperparameters for maximum validation accuracy:
validation accuracy: 0.5112
batch size: 5.00
learning rate: 0.40
momentum: 0.80
run time: 5.487301 seconds


Changing batch size:
learning rate: 0.40
momentum: 0.80


Changing learning rate:
batch size: 5.00
momentum: 0.80


Changing momentum:
batch size: 5.00
learning rate: 0.40


Comparison against another classifier (Decition Trees)

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# here all 'err' should actually be 'accuracy'
def DT_5_fold_cross_val(x, y, max):
    avg_val_err = []
    val_err = []
    for a in range(1, max):
        val_err_row = []
        tree = DecisionTreeRegressor(random_state=0, max_depth=a)
        for i, (train_id, val_id) in enumerate(k_fold_cross_val(x)):
            x_train, y_train = x[train_id], y[train_id]
            x_val, y_val = x[val_id], y[val_id]
            probs_test = tree.fit(x_train, y_train).predict(x_val)
            sum = np.count_nonzero(tree.predict(x_val)==y_val)
            acc = sum/len(y_val)
            # err = mean_squared_error(tree.predict(x_val), y_val)
            val_err_row.append(acc)
        avg_err = np.average(val_err_row)
        avg_val_err.append(avg_err)
        val_err.append(val_err_row)
    val_err = np.array(val_err)
    avg_val_err = np.array(avg_val_err)
    return val_err, avg_val_err

def visualize_DT(data, title):
    fig = go.Figure()
    for i in np.arange(len(data)):
        fig.add_trace(go.Scatter(x=np.arange(1,len(data[i])+1), y=data[i], mode='lines', name='k = '+str(i+1)))
    fig.update_layout(title=title, xaxis_title="run", yaxis_title="validation accuracy", autosize=False, width=600, height=300, margin=dict(l=50,r=50,b=30,t=30,pad=2))
    fig.show()

def visualize_DT_average(data, title):
    fig = go.Figure(data=go.Scatter(x=np.arange(1,len(data)+1), y=data, mode='lines'))
    fig.update_layout(title=title, xaxis_title="k value", yaxis_title="average validation accuracy", autosize=False, width=500, height=250, margin=dict(l=50,r=50,b=30,t=30,pad=2))
    # fig.update_xaxes(tick0=1, dtick=2)
    fig.show()

In [None]:
val_err1, avg_val_err1 = DT_5_fold_cross_val(digits_x, digits_y, 25)

print("Digits Dataset: Decision Trees")
# k = Decision Treet max depth
print("Maximum Decision Trees validation accuracy: ",np.max(avg_val_err1),"( k =",(np.argwhere(avg_val_err1==np.max(avg_val_err1))[0,0]+1),")")
visualize_DT(val_err1, "Decision Trees Validation Accuracy (5-fold Cross-Validation)")
visualize_DT_average(avg_val_err1, "Digits Dataset: Decision Trees")

Digits Dataset: Decision Trees
Maximum Decision Trees validation accuracy:  0.8247106159083876 ( k = 17 )


In [None]:
val_err2, avg_val_err2 = DT_5_fold_cross_val(iris_x, iris_y, 14)

print("OpenML Iris Dataset: Decision Trees")
# k = Decision Treet max depth
print("Maximum Decision Trees validation accuracy: ",np.max(avg_val_err2),"( k =",(np.argwhere(avg_val_err2==np.max(avg_val_err2))[0,0]+1),")")
visualize_DT(val_err2, "Decision Trees Validation Accuracy (5-fold Cross-Validation)")
visualize_DT_average(avg_val_err2, "OpenML Iris Dataset: Decision Trees")

OpenML Iris Dataset: Decision Trees
Maximum Decision Trees validation accuracy:  0.9666666666666668 ( k = 10 )


In [None]:
val_err3, avg_val_err3 = DT_5_fold_cross_val(letter_x, letter_y, 40)

print("OpenML Letters Dataset: Decision Trees")
# k = Decision Treet max depth
print("Maximum Decision Trees validation accuracy: ",np.max(avg_val_err3),"( k =",(np.argwhere(avg_val_err3==np.max(avg_val_err3))[0,0]+1),")")
visualize_DT(val_err3, "Decision Trees Validation Accuracy (5-fold Cross-Validation)")
visualize_DT_average(avg_val_err3, "OpenML Letters Dataset: Decision Trees")

OpenML Letters Dataset: Decision Trees
Maximum Decision Trees validation accuracy:  0.8558 ( k = 36 )
