In [1]:
#trying to implement ensemble method
#https://datascience.stackexchange.com/questions/27169/taking-average-of-multiple-neural-networks
#mixture of experts 'with kmeans'
#https://en.wikipedia.org/wiki/Mixture_of_experts
#combining models together university of Tartu
#https://courses.cs.ut.ee/MTAT.03.277/2014_fall/uploads/Main/deep-learning-lecture-9-combining-multiple-neural-networks-to-improve-generalization-andres-viikmaa.pdf
import tensorflow as tf
import numpy as np
import keras
from keras.utils import to_categorical
import math
import time

# Fetch Fashion-MNIST and flatten every image into a single 784-element row.
(trX, trY), (teX, teY) = tf.keras.datasets.fashion_mnist.load_data()
trX, teX = trX.reshape(60000, 784), teX.reshape(10000, 784)

# One-hot encode the integer class labels (10 columns, one per class).
trY, teY = to_categorical(trY), to_categorical(teY)

print("x_train shape:", trX.shape, "y_train shape:", trY.shape)
print("x_test shape:", teX.shape, "y_test shape:", teY.shape)

# Pool the official train and test splits into one data set; the k-fold loop
# later in the notebook carves its own train/test folds out of these arrays.
total_data_set = np.concatenate((trX, teX), axis=0)
total_label_set = np.concatenate((trY, teY), axis=0)

Using TensorFlow backend.


x_train shape: (60000, 784) y_train shape: (60000, 10)
x_test shape: (10000, 784) y_test shape: (10000, 10)


In [2]:
def init_weights(shape):
    """Create a trainable weight variable of the given shape.

    The initial values are drawn from a normal distribution with a standard
    deviation of 0.01 (mean left at the ``tf.random_normal`` default).
    """
    initial_values = tf.random_normal(shape, stddev=0.01)
    return tf.Variable(initial_values)


def model(X, w_h1, w_h2, w_o):
    """Build a two-hidden-layer MLP and return its output logits.

    Each hidden layer is a bias-free matrix product followed by a sigmoid
    (roughly two stacked logistic regressions). The output layer is left
    linear: no softmax is applied here because the cost function that
    consumes these logits applies it itself.
    """
    first_hidden = tf.nn.sigmoid(tf.matmul(X, w_h1))
    second_hidden = tf.nn.sigmoid(tf.matmul(first_hidden, w_h2))
    # To save/restore the model by tensor name, pass name="..." to this matmul.
    logits = tf.matmul(second_hidden, w_o)
    return logits

# Adapted from https://stackoverflow.com/questions/47518609/for-loop-range-and-interval-how-to-include-last-step
def myRange(start,end,step):
    """Yield start, start+step, ... while the value is below ``end``, then ``end`` itself.

    Unlike ``range`` the final value ``end`` is always produced exactly once,
    even when ``start`` is already at or past it.
    """
    cursor = start
    while True:
        if not cursor < end:
            break
        yield cursor
        cursor += step
    # The closing boundary is always emitted as the last element.
    yield end
    
def confidence_interval(data, z=1.96, ddof=0):
    """Print and return a normal-approximation confidence interval for the mean.

    Args:
        data: Non-empty sequence (list or 1-D array) of numbers, for example
            the per-fold accuracies or per-fold training times.
        z: Multiplier applied to the standard error. The default 1.96 gives
            the 95 percent interval this notebook has always reported.
        ddof: Delta degrees of freedom used for the standard deviation
            (divisor is ``len(data) - ddof``). The default 0 keeps the
            population formula used so far; pass 1 for the sample standard
            deviation, which is the usual choice for a handful of folds.

    Returns:
        ``[lower, mean, upper]`` as plain Python floats.

    Raises:
        ValueError: If ``data`` is empty or has no more than ``ddof`` items,
            in which case the interval is undefined.
    """
    values = np.asarray(data, dtype=float)
    data_points = len(values)
    if data_points == 0:
        raise ValueError("confidence_interval() needs at least one data point")
    if data_points - ddof <= 0:
        raise ValueError("confidence_interval() needs more than ddof=%d data points" % ddof)

    average_accuracy = float(values.mean())
    # NOTE: the label says "accuracy" although the notebook also passes timings;
    # the message is kept unchanged so existing logs stay comparable.
    print("mean accuracy across k folds: " + str(average_accuracy))

    standard_deviation = float(values.std(ddof=ddof))
    standard_error = standard_deviation / float(np.sqrt(data_points))

    lower = average_accuracy - z * standard_error
    upper = average_accuracy + z * standard_error

    print("confidence interval: " + str(lower) + ", " + str(upper))
    return [lower, average_accuracy, upper]

In [3]:
# Checkpointing is disabled. To save an expert, create a tf.train.Saver() inside
# its graph and call saver.save(sess, "mlp/session.ckpt") before its session closes.
#saver = tf.train.Saver()

# Not read by this cell; kept in case other cells still expect the name.
prediction = 0

model_accuracy = 0

models_to_train = 2

# Inclusive range of "number of experts" values the experiment sweeps over.
models_to_train_begin = 2
models_to_train_end = 10

confidence_interval_across_experiments = []
confidence_interval_across_experiments_time = []
kfold_times = []

epochs_per_model = 1

batch_size = 128

number_of_folds = 10

multiple_experts_accuracies = []
multiple_experts_time = []


def _partition_by_nearest_center(samples, labels, centers):
    """Split samples/labels into one group per cluster centre (nearest centre wins).

    Returns two lists of arrays, each of length ``len(centers)``. Ties go to the
    lowest centre index. A centre that attracts no sample yields an empty
    ``(0, n_features)`` array instead of raising.
    """
    nearest = np.empty(len(samples), dtype=np.int64)
    for row, sample in enumerate(samples):
        # Euclidean distance from this sample to every centre at once;
        # argmin returns the first minimum, i.e. the lowest index on ties.
        nearest[row] = np.argmin(np.linalg.norm(centers - sample, axis=1))
    data_parts = [samples[nearest == c] for c in range(len(centers))]
    label_parts = [labels[nearest == c] for c in range(len(centers))]
    return data_parts, label_parts


def _train_and_score_expert(train_x, train_y, test_x, test_y, epochs, batch):
    """Train one MLP expert and return ``(correct_test_predictions, train_cpu_seconds)``.

    The expert is built in its own graph and session so that operations and
    variables of previously trained experts neither pile up in one graph nor
    get re-initialised, both of which would distort the measured training time.
    """
    size_h1 = 625
    size_h2 = 300
    train_seconds = 0

    graph = tf.Graph()
    with graph.as_default(), tf.Session(graph=graph) as sess:
        X = tf.placeholder("float", [None, 784])
        Y = tf.placeholder("float", [None, 10])

        w_h1 = init_weights([784, size_h1]) # create symbolic variables
        w_h2 = init_weights([size_h1, size_h2])
        w_o = init_weights([size_h2, 10])

        py_x = model(X, w_h1, w_h2, w_o)

        # Labels come from a placeholder, so the _v2 op (which may backprop into
        # labels) behaves the same as the deprecated op it replaces here.
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=py_x, labels=Y)) # compute costs
        train_op = tf.train.GradientDescentOptimizer(0.05).minimize(cost) # construct an optimizer
        predict_op = tf.argmax(py_x, 1)
        sess.run(tf.global_variables_initializer())

        true_classes = np.argmax(test_y, axis=1)
        for epoch in range(epochs):
            # Restart the timer every epoch so only training is measured and
            # no epoch is counted twice.
            time_start = time.process_time()
            # Plain stride over the data: the last batch may be short, but a
            # zero-length batch is never fed.
            for start in range(0, len(train_x), batch):
                end = start + batch
                sess.run(train_op, feed_dict={X: train_x[start:end], Y: train_y[start:end]})
            train_seconds = train_seconds + (time.process_time() - time_start)
            print(epoch, np.mean(true_classes ==
                                 sess.run(predict_op, feed_dict={X: test_x})))
        correct = np.sum(true_classes == sess.run(predict_op, feed_dict={X: test_x}))

    return correct, train_seconds


for n in range(models_to_train_begin, models_to_train_end + 1):
    models_to_train = n
    multiple_experts_accuracies = []
    multiple_experts_time = []
    for fold in range(0,number_of_folds):
        model_accuracy = 0

        # Pre-computed k-means centres for this (number of experts, fold) pair.
        cluster_centers = np.load('kmeansclusters/' + str(models_to_train) + 'fold' + str(fold) + '.npy')
        if len(cluster_centers) != models_to_train:
            raise ValueError("expected " + str(models_to_train) + " cluster centers, found " + str(len(cluster_centers)))

        print("fold number: " + str(fold))
        size_of_fold = int(len(total_data_set)/number_of_folds)
        fold_begin = fold*size_of_fold
        fold_end = (fold + 1)*size_of_fold

        # Fold-local names: the global trX/teX/trY/teY are no longer overwritten.
        fold_train_x = np.vstack((total_data_set[0:fold_begin], total_data_set[fold_end:len(total_data_set)]))
        fold_test_x = total_data_set[fold_begin:fold_end]

        fold_train_y = np.vstack((total_label_set[0:fold_begin], total_label_set[fold_end:len(total_label_set)]))
        fold_test_y = total_label_set[fold_begin:fold_end]

        print("x_train shape:", fold_train_x.shape, "y_train shape:", fold_train_y.shape)
        print("x_test shape:", fold_test_x.shape, "y_test shape:", fold_test_y.shape)

        # Route every sample to the expert whose cluster centre is closest.
        partitioned_test_data, partitioned_test_labels = _partition_by_nearest_center(
            fold_test_x, fold_test_y, cluster_centers)
        partitioned_train_data, partitioned_train_labels = _partition_by_nearest_center(
            fold_train_x, fold_train_y, cluster_centers)

        for i in range(0,models_to_train):
            print(partitioned_train_data[i].shape)
            print(partitioned_test_data[i].shape)
            print(partitioned_train_labels[i].shape)
            print(partitioned_test_labels[i].shape)

        aggregated_model_times = 0
        for z in range(0,models_to_train):
            correct, train_seconds = _train_and_score_expert(
                partitioned_train_data[z], partitioned_train_labels[z],
                partitioned_test_data[z], partitioned_test_labels[z],
                epochs_per_model, batch_size)
            # Correct predictions are summed over experts, since each test
            # sample is scored by exactly one expert.
            model_accuracy = correct + model_accuracy
            aggregated_model_times = aggregated_model_times + train_seconds

        # Divide by the actual number of test samples in this fold.
        fold_accuracy = model_accuracy/len(fold_test_x)
        print("accuracy: " + str(fold_accuracy))
        multiple_experts_accuracies.append(fold_accuracy)
        multiple_experts_time.append(aggregated_model_times)

    answer = confidence_interval(multiple_experts_accuracies)
    print(answer)
    confidence_interval_across_experiments.append(answer)

    kfold_times.append(multiple_experts_time)
    answer = confidence_interval(multiple_experts_time)
    print(answer)
    confidence_interval_across_experiments_time.append(answer)

print(confidence_interval_across_experiments)
print(confidence_interval_across_experiments_time)
confidence_interval_across_experiments = np.asarray(confidence_interval_across_experiments)
confidence_interval_across_experiments_time = np.asarray(confidence_interval_across_experiments_time)
kfold_times = np.asarray(kfold_times)
np.save("mixtureOfExpertsResultsTimeAllTimes.npy", kfold_times)
np.save("mixtureOfExpertsResultsAccuracy.npy", confidence_interval_across_experiments)
np.save("mixtureOfExpertsResultsTime.npy", confidence_interval_across_experiments_time)

fold number: 0
x_train shape: (63000, 784) y_train shape: (63000, 10)
x_test shape: (7000, 784) y_test shape: (7000, 10)
(30049, 784)
(3303, 784)
(30049, 10)
(3303, 10)
(32951, 784)
(3697, 784)
(32951, 10)
(3697, 10)
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

0 0.6418407508325764
0 0.6294292669732215
accuracy: 0.6352857142857142
fold number: 1
x_train shape: (63000, 784) y_train shape: (63000, 10)
x_test shape: (7000, 784) y_test shape: (7000, 10)
(29871, 784)
(3281, 784)
(29871, 10)
(3281, 10)
(33129, 784)
(3719, 784)
(33129, 10)
(3719, 10)
0 0.6470588235294118
0 0.6060769023931164
accuracy: 0.6252857142857143
fold number: 2
x_train shape: (63000, 784) y_train shape: (63000, 10)
x_test shape: (7000, 784) y_test shape: (7000, 10)
(29837, 784)
(3340, 784)
(29837, 10)
(3340, 10)
(33163, 784)
(3660, 784)
(33163, 10)
(3660, 10)
0 0.594311377245

0 0.6446104589114194
0 0.601593625498008
0 0.8666134185303515
accuracy: 0.6247142857142857
fold number: 2
x_train shape: (63000, 784) y_train shape: (63000, 10)
x_test shape: (7000, 784) y_test shape: (7000, 10)
(18402, 784)
(2124, 784)
(18402, 10)
(2124, 10)
(16739, 784)
(1817, 784)
(16739, 10)
(1817, 10)
(16851, 784)
(1893, 784)
(16851, 10)
(1893, 10)
(11008, 784)
(1166, 784)
(11008, 10)
(1166, 10)
0 0.6332391713747646
0 0.4215740231150248
0 0.7554146856840993
0 0.8619210977701544
accuracy: 0.6494285714285715
fold number: 3
x_train shape: (63000, 784) y_train shape: (63000, 10)
x_test shape: (7000, 784) y_test shape: (7000, 10)
(18439, 784)
(2073, 784)
(18439, 10)
(2073, 10)
(16869, 784)
(1887, 784)
(16869, 10)
(1887, 10)
(10944, 784)
(1223, 784)
(10944, 10)
(1223, 10)
(16748, 784)
(1817, 784)
(16748, 10)
(1817, 10)
0 0.4399421128798842
0 0.7556968733439322
0 0.8577269010629599
0 0.43863511282333517
accuracy: 0.5977142857142858
fold number: 4
x_train shape: (63000, 784) y_train shape

(9337, 784)
(1031, 784)
(9337, 10)
(1031, 10)
(18361, 784)
(2094, 784)
(18361, 10)
(2094, 10)
(11538, 784)
(1224, 784)
(11538, 10)
(1224, 10)
(13762, 784)
(1538, 784)
(13762, 10)
(1538, 10)
(10002, 784)
(1113, 784)
(10002, 10)
(1113, 10)
0 0.45295829291949563
0 0.625119388729704
0 0.6209150326797386
0 0.32639791937581275
0 0.8382749326145552
accuracy: 0.5672857142857143
mean accuracy across k folds: 0.6044428571428572
confidence interval: 0.5807902824500251, 0.6280954318356893
[0.5807902824500251, 0.6044428571428572, 0.6280954318356893]
mean accuracy across k folds: 11.134375
confidence interval: 10.65693527703751, 11.61181472296249
[10.65693527703751, 11.134375, 11.61181472296249]
fold number: 0
x_train shape: (63000, 784) y_train shape: (63000, 10)
x_test shape: (7000, 784) y_test shape: (7000, 10)
(11607, 784)
(1308, 784)
(11607, 10)
(1308, 10)
(11884, 784)
(1287, 784)
(11884, 10)
(1287, 10)
(9222, 784)
(1101, 784)
(9222, 10)
(1101, 10)
(12174, 784)
(1296, 784)
(12174, 10)
(1296, 10

0 0.8582995951417004
0 0.2185672514619883
0 0.417296389588581
0 0.8279266572637518
0 0.9074446680080482
0 0.46313603322949115
0 0.6222741433021807
accuracy: 0.561
fold number: 3
x_train shape: (63000, 784) y_train shape: (63000, 10)
x_test shape: (7000, 784) y_test shape: (7000, 10)
(11023, 784)
(1195, 784)
(11023, 10)
(1195, 10)
(11609, 784)
(1280, 784)
(11609, 10)
(1280, 10)
(4650, 784)
(510, 784)
(4650, 10)
(510, 10)
(8573, 784)
(993, 784)
(8573, 10)
(993, 10)
(6657, 784)
(721, 784)
(6657, 10)
(721, 10)
(9269, 784)
(1013, 784)
(9269, 10)
(1013, 10)
(11219, 784)
(1288, 784)
(11219, 10)
(1288, 10)
0 0.3707112970711297
0 0.2171875
0 0.9372549019607843
0 0.47633434038267874
0 0.8141470180305131
0 0.9170779861796644
0 0.8027950310559007
accuracy: 0.6031428571428571
fold number: 4
x_train shape: (63000, 784) y_train shape: (63000, 10)
x_test shape: (7000, 784) y_test shape: (7000, 10)
(4619, 784)
(528, 784)
(4619, 10)
(528, 10)
(11268, 784)
(1256, 784)
(11268, 10)
(1256, 10)
(9252, 784)
(

(8065, 784)
(923, 784)
(8065, 10)
(923, 10)
(9518, 784)
(1022, 784)
(9518, 10)
(1022, 10)
(4005, 784)
(451, 784)
(4005, 10)
(451, 10)
(8390, 784)
(962, 784)
(8390, 10)
(962, 10)
(10057, 784)
(1099, 784)
(10057, 10)
(1099, 10)
(6132, 784)
(684, 784)
(6132, 10)
(684, 10)
(7907, 784)
(838, 784)
(7907, 10)
(838, 10)
(8926, 784)
(1021, 784)
(8926, 10)
(1021, 10)
0 0.05200433369447454
0 0.9285714285714286
0 0.9223946784922394
0 0.4802494802494803
0 0.3821656050955414
0 0.868421052631579
0 0.48329355608591884
0 0.23898139079333985
accuracy: 0.5054285714285714
fold number: 6
x_train shape: (63000, 784) y_train shape: (63000, 10)
x_test shape: (7000, 784) y_test shape: (7000, 10)
(8407, 784)
(937, 784)
(8407, 10)
(937, 10)
(10050, 784)
(1107, 784)
(10050, 10)
(1107, 10)
(9422, 784)
(1113, 784)
(9422, 10)
(1113, 10)
(8966, 784)
(984, 784)
(8966, 10)
(984, 10)
(6129, 784)
(683, 784)
(6129, 10)
(683, 10)
(7910, 784)
(836, 784)
(7910, 10)
(836, 10)
(3990, 784)
(447, 784)
(3990, 10)
(447, 10)
(8126,

(5269, 784)
(537, 784)
(5269, 10)
(537, 10)
(2470, 784)
(292, 784)
(2470, 10)
(292, 10)
0 0.9489795918367347
0 0.5630182421227198
0 0.4206060606060606
0 0.922509225092251
0 0.47880870561282934
0 0.8406593406593407
0 0.3324720068906115
0 0.4301675977653631
0 0.9178082191780822
accuracy: 0.6028571428571429
fold number: 7
x_train shape: (63000, 784) y_train shape: (63000, 10)
x_test shape: (7000, 784) y_test shape: (7000, 10)
(7830, 784)
(891, 784)
(7830, 10)
(891, 10)
(7062, 784)
(771, 784)
(7062, 10)
(771, 10)
(8149, 784)
(898, 784)
(8149, 10)
(898, 10)
(5659, 784)
(592, 784)
(5659, 10)
(592, 10)
(10373, 784)
(1221, 784)
(10373, 10)
(1221, 10)
(9470, 784)
(1047, 784)
(9470, 10)
(1047, 10)
(3568, 784)
(402, 784)
(3568, 10)
(402, 10)
(8097, 784)
(871, 784)
(8097, 10)
(871, 10)
(2792, 784)
(307, 784)
(2792, 10)
(307, 10)
0 0.468013468013468
0 0.7211413748378729
0 0.2583518930957684
0 0.7972972972972973
0 0.35544635544635544
0 0.9283667621776505
0 0.9079601990049752
0 0.5074626865671642
0 0

0 0.47214854111405835
0 0.5270425776754891
0 0.9369287020109689
0 0.6818181818181818
0 0.43897216274089934
0 0.2713004484304933
0 0.8651026392961877
0 0.9551724137931035
0 0.4779951100244499
0 0.9492753623188406
accuracy: 0.6272857142857143
fold number: 7
x_train shape: (63000, 784) y_train shape: (63000, 10)
x_test shape: (7000, 784) y_test shape: (7000, 10)
(2700, 784)
(299, 784)
(2700, 10)
(299, 10)
(8033, 784)
(864, 784)
(8033, 10)
(864, 10)
(9506, 784)
(1048, 784)
(9506, 10)
(1048, 10)
(2491, 784)
(234, 784)
(2491, 10)
(234, 10)
(7918, 784)
(884, 784)
(7918, 10)
(884, 10)
(4534, 784)
(514, 784)
(4534, 10)
(514, 10)
(7808, 784)
(889, 784)
(7808, 10)
(889, 10)
(3024, 784)
(329, 784)
(3024, 10)
(329, 10)
(6873, 784)
(756, 784)
(6873, 10)
(756, 10)
(10113, 784)
(1183, 784)
(10113, 10)
(1183, 10)
0 0.9565217391304348
0 0.4965277777777778
0 0.9293893129770993
0 0.905982905982906
0 0.2658371040723982
0 0.6867704280155642
0 0.46906636670416196
0 0.9240121580547113
0 0.7195767195767195
0 0