In [1]:
# Ensemble experiment: a mixture of experts whose inputs are routed by k-means clustering.
# References:
#   Taking the average of multiple neural networks:
#   https://datascience.stackexchange.com/questions/27169/taking-average-of-multiple-neural-networks
#   Mixture of experts:
#   https://en.wikipedia.org/wiki/Mixture_of_experts
#   Combining multiple neural networks to improve generalization (University of Tartu lecture):
#   https://courses.cs.ut.ee/MTAT.03.277/2014_fall/uploads/Main/deep-learning-lecture-9-combining-multiple-neural-networks-to-improve-generalization-andres-viikmaa.pdf
import tensorflow as tf
import numpy as np
import keras
from keras.utils import to_categorical
import math
import time

# Fashion-MNIST as shipped with Keras, already divided into a train and a test split.
(trX, trY), (teX, teY) = tf.keras.datasets.fashion_mnist.load_data()

# Flatten every image into a single 784-element row.
trX = np.reshape(trX, (60000, 784))
teX = np.reshape(teX, (10000, 784))

# Integer class ids -> one-hot rows (one column per class).
trY, teY = to_categorical(trY), to_categorical(teY)


print("x_train shape:", trX.shape, "y_train shape:", trY.shape)
print("x_test shape:", teX.shape, "y_test shape:", teY.shape)

# Pool both splits (train rows first, then test rows) so that the k-fold
# loop further down can carve out its own train/test folds.
total_data_set = np.concatenate((trX, teX), axis=0)
total_label_set = np.concatenate((trY, teY), axis=0)

Using TensorFlow backend.


x_train shape: (60000, 784) y_train shape: (60000, 10)
x_test shape: (10000, 784) y_test shape: (10000, 10)


In [2]:
def init_weights(shape):
    """Return a tf.Variable of the given shape.

    The initial values are sampled with tf.random_normal using a standard
    deviation of 0.01.
    """
    initial_values = tf.random_normal(shape, stddev=0.01)
    return tf.Variable(initial_values)


def model(X, w_h1, w_h2, w_o):
    """Build a multilayer perceptron with two sigmoid hidden layers.

    X is multiplied by w_h1, then w_h2, each followed by a sigmoid, and the
    result is multiplied by w_o. The returned tensor holds raw logits: no
    softmax is applied here because the cost function applies it.
    """
    first_hidden = tf.nn.sigmoid(tf.matmul(X, w_h1))
    second_hidden = tf.nn.sigmoid(tf.matmul(first_hidden, w_h2))
    # To save/restore the model by tensor name, give the output op a name:
    # tf.matmul(second_hidden, w_o, name="insertname_here")
    logits = tf.matmul(second_hidden, w_o)
    return logits

#taken from https://stackoverflow.com/questions/47518609/for-loop-range-and-interval-how-to-include-last-step
def myRange(start,end,step):
    """Yield start, start+step, ... while the value is below end, then yield end.

    Unlike the built-in range, the end value is always emitted as the final
    item, even when start is not below end.
    """
    current = start
    while True:
        if not current < end:
            break
        yield current
        current += step
    # the closing boundary is always produced last
    yield end
    
def confidence_interval(data, label="accuracy", z=1.96):
    """Print and return a normal-approximation confidence interval for the mean.

    The standard error is computed from the population standard deviation
    (divisor n, not n - 1), matching the original hand-written loops.

    Parameters
    ----------
    data : sequence of numbers
        One measurement per fold (accuracies, timings, ...).
    label : str, optional
        Name of the measured quantity used in the printed summary. The
        default keeps the historic "mean accuracy across k folds" wording.
    z : float, optional
        Critical value multiplying the standard error; 1.96 corresponds to
        a 95 percent interval under a normal approximation.

    Returns
    -------
    list of float
        [lower bound, mean, upper bound] as plain Python floats.

    Raises
    ------
    ValueError
        If data is empty (the mean would be undefined).
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        raise ValueError("confidence_interval() needs at least one data point")

    mean = float(values.mean())
    print("mean " + label + " across k folds: " + str(mean))

    # np.std defaults to the population form (ddof=0)
    standard_error = float(values.std() / np.sqrt(values.size))

    lower = mean - z*standard_error
    upper = mean + z*standard_error

    print("confidence interval: " + str(lower) + ", " + str(upper))
    return [lower, mean, upper]

In [6]:
# K-fold evaluation of a k-means-routed mixture of experts.
#
# For every expert count n in [models_to_train_begin, models_to_train_end]
# and every fold, the samples are routed to their nearest pre-computed
# cluster centre, one MLP is trained per cluster, and the number of correctly
# classified test samples is summed over all experts to get the fold accuracy.

prediction = 0

model_accuracy = 0

models_to_train = 2

models_to_train_begin = 2
models_to_train_end = 10

confidence_interval_across_experiments = []
confidence_interval_across_experiments_time = []
kfold_times = []

epochs_per_model = 3

batch_size = 128

number_of_folds = 10

# Hidden layer widths. Plain Python ints (not tf.constant tensors) so that the
# weight shapes are statically known.
size_h1 = 625
size_h2 = 300

multiple_experts_accuracies = []
multiple_experts_time = []


def _nearest_center_indices(samples, centers, chunk_size=1024):
    """Return, for each row of samples, the index of the closest row of centers.

    Distances are Euclidean. Ties resolve to the lowest centre index, the same
    as a strict "smaller than" scan over the centres. The samples are handled
    in chunks so the (chunk, centres, features) difference array stays small.
    """
    centers = np.asarray(centers, dtype=np.float64)
    nearest = np.empty(len(samples), dtype=np.int64)
    for begin in range(0, len(samples), chunk_size):
        # cast to float so integer pixel data cannot wrap around on subtraction
        chunk = np.asarray(samples[begin:begin + chunk_size], dtype=np.float64)
        distances = np.linalg.norm(chunk[:, np.newaxis, :] - centers[np.newaxis, :, :], axis=2)
        nearest[begin:begin + chunk_size] = np.argmin(distances, axis=1)
    return nearest


def _train_and_score_expert(train_x, train_y, test_x, test_y,
                            hidden1, hidden2, epochs, batch, learning_rate=0.05):
    """Train one expert MLP and return (correct test predictions, training seconds).

    The expert is built in its own tf.Graph and run in its own tf.Session, so
    nothing accumulates in the default graph from one expert to the next and
    the initializer only touches this expert's variables. Only the training
    batches are timed (time.process_time); evaluation is excluded.
    """
    n_features = train_x.shape[1]
    n_classes = train_y.shape[1]

    graph = tf.Graph()
    with graph.as_default():
        X = tf.placeholder("float", [None, n_features])
        Y = tf.placeholder("float", [None, n_classes])

        w_h1 = init_weights([n_features, hidden1]) # create symbolic variables
        w_h2 = init_weights([hidden1, hidden2])
        w_o = init_weights([hidden2, n_classes])

        py_x = model(X, w_h1, w_h2, w_o)

        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=py_x, labels=Y)) # compute costs
        train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost) # construct an optimizer
        predict_op = tf.argmax(py_x, 1)
        init_op = tf.global_variables_initializer()

    true_classes = np.argmax(test_y, axis=1)
    train_seconds = 0

    with tf.Session(graph=graph) as sess:
        sess.run(init_op)

        def count_correct():
            # an expert whose cluster received no test samples has nothing to score
            if len(test_x) == 0:
                return 0
            predicted = sess.run(predict_op, feed_dict={X: test_x})
            return np.sum(true_classes == predicted)

        for epoch in range(epochs):
            time_start = time.process_time()
            # range() never produces a zero-row batch; the final slice is
            # simply shorter when the partition size is not a multiple of batch
            for start in range(0, len(train_x), batch):
                end = start + batch
                sess.run(train_op, feed_dict={X: train_x[start:end], Y: train_y[start:end]})
            time_stop = time.process_time()
            train_seconds = train_seconds + (time_stop - time_start)

            epoch_accuracy = count_correct() / len(test_x) if len(test_x) else float("nan")
            print(epoch, epoch_accuracy)

        correct = int(count_correct())

    return correct, train_seconds


for n in range(models_to_train_begin, models_to_train_end + 1):
    models_to_train = n
    multiple_experts_accuracies = []
    multiple_experts_time = []
    for fold in range(0, number_of_folds):
        model_accuracy = 0

        cluster_centers = np.load('kmeansclusters/' + str(models_to_train) + 'fold' + str(fold) + '.npy')
        if len(cluster_centers) != models_to_train:
            raise ValueError("expected " + str(models_to_train) + " cluster centers, found " + str(len(cluster_centers)))

        print("fold number: " + str(fold))
        size_of_fold = int(len(total_data_set)/number_of_folds)
        fold_begin = fold*size_of_fold
        fold_end = (fold + 1)*size_of_fold

        # the current fold is held out for testing; everything else trains
        fold_trX = np.vstack((total_data_set[0:fold_begin], total_data_set[fold_end:len(total_data_set)]))
        fold_teX = total_data_set[fold_begin:fold_end]

        fold_trY = np.vstack((total_label_set[0:fold_begin], total_label_set[fold_end:len(total_label_set)]))
        fold_teY = total_label_set[fold_begin:fold_end]

        print("x_train shape:", fold_trX.shape, "y_train shape:", fold_trY.shape)
        print("x_test shape:", fold_teX.shape, "y_test shape:", fold_teY.shape)

        # route every sample to the expert owning its nearest cluster centre
        train_assignment = _nearest_center_indices(fold_trX, cluster_centers)
        test_assignment = _nearest_center_indices(fold_teX, cluster_centers)

        # boolean masks yield a (0, n_features) array for an empty cluster
        # instead of failing the way np.vstack([]) does
        partitioned_train_data = [fold_trX[train_assignment == i] for i in range(0,models_to_train)]
        partitioned_test_data = [fold_teX[test_assignment == i] for i in range(0,models_to_train)]
        partitioned_train_labels = [fold_trY[train_assignment == i] for i in range(0,models_to_train)]
        partitioned_test_labels = [fold_teY[test_assignment == i] for i in range(0,models_to_train)]

        for i in range(0,models_to_train):
            print(partitioned_train_data[i].shape)
            print(partitioned_test_data[i].shape)
            print(partitioned_train_labels[i].shape)
            print(partitioned_test_labels[i].shape)

        aggregated_model_times = 0
        for z in range(0,models_to_train):
            correct, train_seconds = _train_and_score_expert(
                partitioned_train_data[z], partitioned_train_labels[z],
                partitioned_test_data[z], partitioned_test_labels[z],
                size_h1, size_h2, epochs_per_model, batch_size)
            model_accuracy = model_accuracy + correct
            aggregated_model_times = aggregated_model_times + train_seconds

        # correct predictions summed over all experts / size of the test fold
        fold_accuracy = model_accuracy/len(fold_teX)
        print("accuracy: " + str(fold_accuracy))
        multiple_experts_accuracies.append(fold_accuracy)
        multiple_experts_time.append(aggregated_model_times)

    answer = confidence_interval(multiple_experts_accuracies)
    print(answer)
    confidence_interval_across_experiments.append(answer)

    kfold_times.append(multiple_experts_time)
    answer = confidence_interval(multiple_experts_time)
    print(answer)
    confidence_interval_across_experiments_time.append(answer)

print(confidence_interval_across_experiments)
print(confidence_interval_across_experiments_time)
confidence_interval_across_experiments = np.asarray(confidence_interval_across_experiments)
confidence_interval_across_experiments_time = np.asarray(confidence_interval_across_experiments_time)
kfold_times = np.asarray(kfold_times)
np.save("mixtureOfExpertsResultsTimeAllTimes.npy", kfold_times)
np.save("mixtureOfExpertsResultsAccuracy.npy", confidence_interval_across_experiments)
np.save("mixtureOfExpertsResultsTime.npy", confidence_interval_across_experiments_time)

fold number: 0
x_train shape: (63000, 784) y_train shape: (63000, 10)
x_test shape: (7000, 784) y_test shape: (7000, 10)
(30049, 784)
(3303, 784)
(30049, 10)
(3303, 10)
(32951, 784)
(3697, 784)
(32951, 10)
(3697, 10)
0 0.6400242204056918
0 0.5739789018122802
accuracy: 0.6051428571428571
fold number: 1
x_train shape: (63000, 784) y_train shape: (63000, 10)
x_test shape: (7000, 784) y_test shape: (7000, 10)
(29871, 784)
(3281, 784)
(29871, 10)
(3281, 10)
(33129, 784)
(3719, 784)
(33129, 10)
(3719, 10)
0 0.6531545260591283
0 0.5856413014251143
accuracy: 0.6172857142857143
mean accuracy across k folds: 0.6112142857142857
confidence interval: 0.6027997150181658, 0.6196288564104057
[0.6027997150181658, 0.6112142857142857, 0.6196288564104057]
mean accuracy across k folds: 21.4765625
confidence interval: 18.498980038597274, 24.454144961402726
[18.498980038597274, 21.4765625, 24.454144961402726]
fold number: 0
x_train shape: (63000, 784) y_train shape: (63000, 10)
x_test shape: (7000, 784) y_te

(9509, 784)
(1042, 784)
(9509, 10)
(1042, 10)
(8405, 784)
(942, 784)
(8405, 10)
(942, 10)
(10005, 784)
(1147, 784)
(10005, 10)
(1147, 10)
(6082, 784)
(715, 784)
(6082, 10)
(715, 10)
0 0.6690058479532164
0 0.38144329896907214
0 0.4180790960451977
0 0.9346846846846847
0 0.9222648752399232
0 0.4830148619957537
0 0.37401918047079336
0 0.862937062937063
accuracy: 0.5984285714285714
mean accuracy across k folds: 0.6046428571428571
confidence interval: 0.596030296548005, 0.6132554177377093
[0.596030296548005, 0.6046428571428571, 0.6132554177377093]
mean accuracy across k folds: 27.609375
confidence interval: 26.699858902698804, 28.518891097301196
[26.699858902698804, 27.609375, 28.518891097301196]
fold number: 0
x_train shape: (63000, 784) y_train shape: (63000, 10)
x_test shape: (7000, 784) y_test shape: (7000, 10)
(8273, 784)
(904, 784)
(8273, 10)
(904, 10)
(9420, 784)
(1132, 784)
(9420, 10)
(1132, 10)
(10178, 784)
(1097, 784)
(10178, 10)
(1097, 10)
(6139, 784)
(683, 784)
(6139, 10)
(683, 1