## Notebook settings

In [33]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import KFold  # import sklearn Kfold to implement cross-validation


# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

reset_graph()

## Prepare data

In [34]:
# load data: digits 5 to 9, but still label with 0 to 4, 
# because TensorFlow expects label's integers from 0 to n_classes-1.
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")

X_train2_full = mnist.train.images[mnist.train.labels >= 5]
y_train2_full = mnist.train.labels[mnist.train.labels >= 5] - 5

X_valid2_full = mnist.validation.images[mnist.validation.labels >= 5]
y_valid2_full = mnist.validation.labels[mnist.validation.labels >= 5] - 5

X_test2 = mnist.test.images[mnist.test.labels >= 5]
y_test2 = mnist.test.labels[mnist.test.labels >= 5] - 5

Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [35]:
# we want to keep only 100 instances per class in the training set 
# and let's keep only 30 instances per class in the validation set
# tesing set is already loaded above
def sample_n_instances_per_class(X, y, n=100):
    Xs, ys = [], []
    for label in np.unique(y):
        idx = (y == label)
        Xc = X[idx][:n]
        yc = y[idx][:n]
        Xs.append(Xc)
        ys.append(yc)
    return np.concatenate(Xs), np.concatenate(ys)

X_train2, y_train2 = sample_n_instances_per_class(X_train2_full, y_train2_full, n=100)
X_valid2, y_valid2 = sample_n_instances_per_class(X_valid2_full, y_valid2_full, n=30)

In [36]:
y_train_one_hot = np.zeros((len(y_train2), 5))
y_train_one_hot[np.arange(len(y_train2)), y_train2] = 1

y_valid_one_hot = np.zeros((len(y_valid2), 5))
y_valid_one_hot[np.arange(len(y_valid2)), y_valid2] = 1

y_test_one_hot = np.zeros((len(y_test2), 5))
y_test_one_hot[np.arange(len(y_test2)), y_test2] = 1

print ("Train Label: ", y_train_one_hot.shape)
print ("Validation Label: ", y_valid_one_hot.shape)
print ("Test Label: ", y_test_one_hot.shape)

Train Label:  (500, 5)
Validation Label:  (150, 5)
Test Label:  (4861, 5)


## Define layer

In [37]:
pretrained_weights_path = './pretrained_hw2_weights/Team59_HW2.ckpt.meta'

tf.reset_default_graph()
saver = tf.train.import_meta_graph(pretrained_weights_path)
graph = tf.get_default_graph()

def layer(output_dim, input_dim, inputs, layer, dropout_rate, activation=None, use_pretrained=False):
    """
    layer construction function
    """
    W_name = 'W' + layer
    b_name = 'b' + layer
    
    if use_pretrained:
        W = graph.get_tensor_by_name(W_name + ":0")
        b = graph.get_tensor_by_name(b_name + ":0")
    else:
        he_init = tf.contrib.layers.variance_scaling_initializer()
        W = tf.get_variable(W_name, shape=[input_dim, output_dim], initializer=he_init)
        b = tf.get_variable(b_name, shape=[1, output_dim], initializer=he_init)

    XWb = tf.matmul(inputs, W) + b
    XWb = tf.nn.dropout(XWb, (1-dropout_rate))
    
    if activation is None:
        outputs = XWb
    else:
        outputs = activation(XWb)
        
    return outputs


## Hyper parameters

In [38]:
lr = 5e-3
batch_size = 16
epochs = 1000
saturate_limit = 20  # for applying early stopping
fold_num = 5


## HW 3.1 - Softmax-only transfer-learning

### Model

In [39]:
class ModelHw31():
    def __init__(self):
        self.define_layers()
        
    def define_layers(self):
        self.x = tf.placeholder("float", [None, 784])  # input layer
        
        # hidden layers
        self.h1 = layer(output_dim=128, input_dim=784, inputs=self.x, layer="h1", dropout_rate=0, activation=tf.nn.elu, use_pretrained=True)
        self.h2 = layer(output_dim=128, input_dim=128, inputs=self.h1, layer="h2", dropout_rate=0, activation=tf.nn.elu, use_pretrained=True)
        self.h3 = layer(output_dim=128, input_dim=128, inputs=self.h2, layer="h3", dropout_rate=0, activation=tf.nn.elu, use_pretrained=True)
        self.h4 = layer(output_dim=128, input_dim=128, inputs=self.h3, layer="h4", dropout_rate=0, activation=tf.nn.elu, use_pretrained=True)
        self.h5 = layer(output_dim=128, input_dim=128, inputs=self.h4, layer="h5", dropout_rate=0, activation=tf.nn.elu, use_pretrained=True)
        
        # output layers
        self.y_predict = layer(output_dim=5, input_dim=128, inputs=self.h5, layer="output", dropout_rate=0, activation=None, use_pretrained=False)
        self.y_label = tf.placeholder("float", [None, 5])
        
model = ModelHw31()

### Training

In [40]:
from sklearn.model_selection import KFold  # import sklearn Kfold to implement cross-validation


# compare prediction and label
correct = tf.equal(tf.argmax(model.y_label, 1), tf.argmax(model.y_predict, 1))
accuracy = tf.reduce_mean(tf.cast(correct, "float"))

# merge training and validation data for cross validation
X_data = np.concatenate((X_train2, X_valid2), axis=0)
y_data = np.concatenate((y_train_one_hot, y_valid_one_hot), axis=0)

# set loss function: tf.nn.sparse_softmax_cross_entropy_with_logits required by the homework spec
loss_function = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=model.y_predict, labels=model.y_label))

# set optimizer
# TODO: train on softmax layer only
output_layer_vars = [tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Woutput'),
                     tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='boutput')]
optimizer = tf.train.AdamOptimizer(learning_rate=lr, name="training_op").minimize(loss_function, var_list=output_layer_vars)
    
# cross validation
kf = KFold(n_splits=fold_num, shuffle=True)
iterations = int(X_train2.shape[0] / batch_size)
for fold_index, fold_indices in enumerate(kf.split(X_data), 1):
    # get data of this fold
    train_fold_indices, test_fold_indices = fold_indices
    X_train_fold, X_valid_fold = X_data[train_fold_indices], X_data[test_fold_indices]
    y_train_fold, y_valid_fold = y_data[train_fold_indices], y_data[test_fold_indices]

    # start training
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        
        saturate_count = 0
        best_acc = 0.
        best_loss = 1000.
        best_epoch = -1
        for e in range(1, epochs + 1):
            for i in range(iterations):
                if(i == 0):
                    batch_x = X_train_fold[:batch_size]
                    batch_y = y_train_fold[:batch_size]
                else:
                    batch_x = X_train_fold[i * batch_size : (i + 1) * batch_size]
                    batch_y = y_train_fold[i * batch_size : (i + 1) * batch_size]

                sess.run(optimizer, feed_dict={model.x: batch_x, model.y_label: batch_y})

            # validate
            loss, acc = sess.run([loss_function, accuracy], feed_dict={model.x: X_valid_fold, model.y_label: y_valid_fold})
            #print('epoch', e, '-', 'loss:', loss, '/', 'acc:', acc)

            if best_loss > loss:
                # TODO: save weights!!
                best_acc = acc
                best_loss = loss
                best_epoch = e
                saturate_count = 0
            else:
                saturate_count += 1

                if saturate_count >= saturate_limit:  # stop if saturate
                    break

        # print the best result of this fold cross all epochs
        print('*' * 60)
        print('Fold:', fold_index)
        print('Best epoch:', best_epoch)
        print('Best accurancy:', best_acc)
        print('Best loss:', best_loss)
        print('*' * 60, '\n')
    
# TODO: restore the best model cross all folds
# TODO: test the best model on the whole test set
# TODO: print the testing result on the whole test set


************************************************************
Fold: 1
Best epoch: 525
Best accurancy: 0.753846
Best loss: 1.008
************************************************************ 

************************************************************
Fold: 2
Best epoch: 877
Best accurancy: 0.784615
Best loss: 0.793813
************************************************************ 

************************************************************
Fold: 3
Best epoch: 312
Best accurancy: 0.769231
Best loss: 0.864931
************************************************************ 

************************************************************
Fold: 4
Best epoch: 112
Best accurancy: 0.715385
Best loss: 0.867756
************************************************************ 

************************************************************
Fold: 5
Best epoch: 487
Best accurancy: 0.730769
Best loss: 0.898377
************************************************************ 



## HW 3.2 - Caching the 5th layer

## HW 3.3 - 4-layers-only transfer-learning

## HW 3.4 - Bonus