In [1]:
import tensorflow as tf
import numpy as np
import math
import timeit
import matplotlib.pyplot as plt
#%matplotlib inline

from cs231n.data_utils import load_CIFAR10

def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=10000):
    """
    Read CIFAR-10 from disk, carve out training / validation / test subsets
    and zero-center all three with the mean image of the training subset.

    The validation subset is taken from the raw training data, from the
    `num_validation` examples that directly follow the first `num_training`.

    Returns the tuple (X_train, y_train, X_val, y_val, X_test, y_test).
    """
    # Raw arrays exactly as the loader provides them
    raw_X, raw_y, raw_X_test, raw_y_test = load_CIFAR10(
        'cs231n/datasets/cifar-10-batches-py')

    # Index sets for each subset
    train_idx = range(num_training)
    val_idx = range(num_training, num_training + num_validation)
    test_idx = range(num_test)

    X_train, y_train = raw_X[train_idx], raw_y[train_idx]
    X_val, y_val = raw_X[val_idx], raw_y[val_idx]
    X_test, y_test = raw_X_test[test_idx], raw_y_test[test_idx]

    # Center every subset on the training mean image (in place)
    mean_image = np.mean(X_train, axis=0)
    for subset in (X_train, X_val, X_test):
        subset -= mean_image

    return X_train, y_train, X_val, y_val, X_test, y_test


# Load the preprocessed splits, then report the shape of each array.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()
for _label, _array in (
        ('Train data shape: ', X_train),
        ('Train labels shape: ', y_train),
        ('Validation data shape: ', X_val),
        ('Validation labels shape: ', y_val),
        ('Test data shape: ', X_test),
        ('Test labels shape: ', y_test)):
    print(_label, _array.shape)

def early_stopping(accuracy, EARLY_STOPPING):
    """Decide whether the accuracy history justifies stopping.

    Looks at the entries that come after the first occurrence of the best
    accuracy, restricted to the most recent EARLY_STOPPING entries, and counts
    how many are strictly below the best value.

    Returns 1 when that count reaches EARLY_STOPPING, otherwise 0.
    """
    best_pos = np.argmax(accuracy)
    history_len = len(accuracy)
    best_value = accuracy[best_pos]

    # Entries at or before this index are never inspected.
    floor = max(best_pos, history_len - 1 - EARLY_STOPPING)

    worse = sum(1 for k in range(floor + 1, history_len)
                if accuracy[k] < best_value)

    return 1 if worse >= EARLY_STOPPING else 0

def run_model(session, predict, loss_val, Xd, yd,
              epochs=1, batch_size=64, print_every=100,
              training=None, plot_losses=False):
    """Run the model over (Xd, yd) in minibatches, training if requested.

    The feed dictionary is built from the module-level placeholders `X`, `y`
    and `is_training`; they must belong to the same graph as `predict`.

    Args:
        session: object whose `run(fetches, feed_dict=...)` evaluates the graph.
        predict: per-class score tensor; argmax over axis 1 is the prediction.
        loss_val: loss tensor evaluated on every minibatch.
        Xd: input examples, indexed along axis 0.
        yd: labels aligned with Xd.
        epochs: maximum number of passes over the data.
        batch_size: number of examples per minibatch.
        print_every: iteration interval for progress output (the print itself
            is currently commented out).
        training: optional training op. When given, it is run on every
            minibatch and `is_training` is fed True.
        plot_losses: when True, plot the per-minibatch losses after each epoch.

    Returns:
        (total_loss, total_correct) of the last epoch that ran: the loss
        averaged per example and the fraction of correct predictions.
    """
    # have tensorflow compute accuracy
    correct_prediction = tf.equal(tf.argmax(predict, 1), y)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    num_examples = Xd.shape[0]

    # shuffle indices
    train_indices = np.arange(num_examples)
    np.random.shuffle(train_indices)

    training_now = training is not None

    # setting up variables we want to compute (and optimizing)
    # if we have a training function, add that to things we compute
    variables = [loss_val, correct_prediction, accuracy]
    if training_now:
        variables[-1] = training
    # counter
    iter_cnt = 0

    # early stopping on the per-epoch accuracy measured by this function
    EARLY_STOPPING = 5
    eStopAcc = []
    for e in range(epochs):
        # keep track of losses and accuracy
        correct = 0
        losses = []
        # make sure we iterate over the dataset once
        for i in range(int(math.ceil(num_examples / batch_size))):
            # generate indices for the batch (relative to Xd, not X_train)
            start_idx = (i * batch_size) % num_examples
            idx = train_indices[start_idx:start_idx + batch_size]

            # create a feed dictionary for this batch
            feed_dict = {X: Xd[idx, :],
                         y: yd[idx],
                         is_training: training_now}
            # number of examples actually in this batch; the last batch of an
            # epoch may be smaller than batch_size
            actual_batch_size = idx.shape[0]

            # have tensorflow compute loss and correct predictions
            # and (if given) perform a training step
            loss, corr, _ = session.run(variables, feed_dict=feed_dict)

            # aggregate performance stats (loss weighted by batch size so the
            # epoch total divides cleanly by the number of examples)
            losses.append(loss * actual_batch_size)
            correct += np.sum(corr)

            # print every now and then
            if training_now and (iter_cnt % print_every) == 0:
                pass
#                print("Iteration {0}: with minibatch training loss = {1:.3g} and accuracy of {2:.2g}".format(iter_cnt,loss,np.sum(corr)/actual_batch_size))
            iter_cnt += 1
        total_correct = correct / num_examples
        total_loss = np.sum(losses) / num_examples
#        print("Epoch {2}, Overall loss = {0:.3g} and accuracy of {1:.3g}".format(total_loss,total_correct,e+1))
        if plot_losses:
            plt.plot(losses)
            plt.grid(True)
            plt.title('Epoch {} Loss'.format(e+1))
            plt.xlabel('minibatch number')
            plt.ylabel('minibatch loss')
            plt.show()
        # stop early once the accuracy history says so
        eStopAcc.append(total_correct)
        if early_stopping(eStopAcc, EARLY_STOPPING) == 1:
            break
    return total_loss, total_correct

# Feel free to play with this cell

Train data shape:  (49000, 32, 32, 3)
Train labels shape:  (49000,)
Validation data shape:  (1000, 32, 32, 3)
Validation labels shape:  (1000,)
Test data shape:  (10000, 32, 32, 3)
Test labels shape:  (10000,)


## Train a _great_ model on CIFAR-10!

Now it's your job to experiment with architectures, hyperparameters, loss functions, and optimizers to train a model that achieves **>= 70% accuracy on the validation set** of CIFAR-10. You can use the `run_model` function from above.

### Things you should try:
- **Filter size**: Above we used 7x7; this makes pretty pictures but smaller filters may be more efficient
- **Number of filters**: Above we used 32 filters. Do more or fewer do better?
- **Pooling vs Strided Convolution**: Do you use max pooling or just stride convolutions?
- **Batch normalization**: Try adding spatial batch normalization after convolution layers and vanilla batch normalization after affine layers. Do your networks train faster?
- **Network architecture**: The network above has two layers of trainable parameters. Can you do better with a deep network? Good architectures to try include:
    - [conv-relu-pool]xN -> [affine]xM -> [softmax or SVM]
    - [conv-relu-conv-relu-pool]xN -> [affine]xM -> [softmax or SVM]
    - [batchnorm-relu-conv]xN -> [affine]xM -> [softmax or SVM]
- **Use TensorFlow Scope**: Use TensorFlow scope and/or [tf.layers](https://www.tensorflow.org/api_docs/python/tf/layers) to make it easier to write deeper networks. See [this tutorial](https://www.tensorflow.org/tutorials/layers) for how to use `tf.layers`.
- **Use Learning Rate Decay**: [As the notes point out](http://cs231n.github.io/neural-networks-3/#anneal), decaying the learning rate might help the model converge. Feel free to decay every epoch, when loss doesn't change over an entire epoch, or any other heuristic you find appropriate. See the [Tensorflow documentation](https://www.tensorflow.org/versions/master/api_guides/python/train#Decaying_the_learning_rate) for learning rate decay.
- **Global Average Pooling**: Instead of flattening and then having multiple affine layers, perform convolutions until your image gets small (7x7 or so) and then perform an average pooling operation to get a 1x1 image (1, 1, Filter#), which is then reshaped into a (Filter#) vector. This is used in [Google's Inception Network](https://arxiv.org/abs/1512.00567) (See Table 1 for their architecture).
- **Regularization**: Add l2 weight regularization, or perhaps use [Dropout as in the TensorFlow MNIST tutorial](https://www.tensorflow.org/get_started/mnist/pros)

### Tips for training
For each network architecture that you try, you should tune the learning rate and regularization strength. When doing this there are a couple important things to keep in mind:

- If the parameters are working well, you should see improvement within a few hundred iterations
- Remember the coarse-to-fine approach for hyperparameter tuning: start by testing a large range of hyperparameters for just a few training iterations to find the combinations of parameters that are working at all.
- Once you have found some sets of parameters that seem to work, search more finely around these parameters. You may need to train for more epochs.
- You should use the validation set for hyperparameter search, and we'll save the test set for evaluating your architecture on the best parameters as selected by the validation set.

### Going above and beyond
If you are feeling adventurous there are many other features you can implement to try and improve your performance. You are **not required** to implement any of these; however they would be good things to try for extra credit.

- Alternative update steps: For the assignment we implemented SGD+momentum, RMSprop, and Adam; you could try alternatives like AdaGrad or AdaDelta.
- Alternative activation functions such as leaky ReLU, parametric ReLU, ELU, or MaxOut.
- Model ensembles
- Data augmentation
- New Architectures
  - [ResNets](https://arxiv.org/abs/1512.03385) where the input from the previous layer is added to the output.
  - [DenseNets](https://arxiv.org/abs/1608.06993) where inputs into previous layers are concatenated together.
  - [This blog has an in-depth overview](https://chatbotslife.com/resnets-highwaynets-and-densenets-oh-my-9bb15918ee32)

If you do decide to implement something extra, clearly describe it in the "Extra Credit Description" cell below.

### What we expect
At the very least, you should be able to train a ConvNet that gets **>= 70% accuracy on the validation set**. This is just a lower bound - if you are careful it should be possible to get accuracies much higher than that! Extra credit points will be awarded for particularly high-scoring models or unique approaches.

You should use the space below to experiment and train your network. The final cell in this notebook should contain the training and validation set accuracies for your final trained network.

Have fun and happy training!

In [4]:
# Candidate values for each hyperparameter of the search.
params = dict(
    lr=[5e-3, 1e-3, 5e-4],
    decay=[0.99, 0.9, 0.8],
    momentum=[0, 0.1, 0.2],
    is_reg=[1, 0],
)

In [None]:
import itertools


def my_model(X, y, is_training):
    """Build the convolutional network and return its 10-way logits.

    Layout: batchnorm -> [conv-relu-bn, conv-relu-bn, maxpool] x2 ->
    [dense-relu-bn-dropout] x2 -> dense(10).

    Args:
        X: input image batch tensor.
        y: label tensor; unused here, kept so the signature stays
            (X, y, is_training).
        is_training: boolean tensor passed as `training` to the
            batch-normalization and dropout layers.
    """
    ba0 = tf.layers.batch_normalization(X, training=is_training)
    # [conv-relu-conv-relu-pool]  out=14x14
    conv1 = tf.layers.conv2d(ba0, 128, kernel_size=[3, 3], strides=(1, 1), activation=tf.nn.relu)
    ba1 = tf.layers.batch_normalization(conv1, training=is_training)
    conv2 = tf.layers.conv2d(ba1, 256, [3, 3], activation=tf.nn.relu)
    ba2 = tf.layers.batch_normalization(conv2, training=is_training)
    pool1 = tf.layers.max_pooling2d(ba2, pool_size=[2, 2], strides=2)
    # [conv-relu-conv-relu-pool]  out=5x5
    conv3 = tf.layers.conv2d(pool1, 512, [3, 3], activation=tf.nn.relu)
    ba3 = tf.layers.batch_normalization(conv3, training=is_training)
    conv4 = tf.layers.conv2d(ba3, 256, [3, 3], activation=tf.nn.relu)
    ba4 = tf.layers.batch_normalization(conv4, training=is_training)
    pool2 = tf.layers.max_pooling2d(ba4, pool_size=[2, 2], strides=2)
    # [dense-relu]x2 layer
    pool2_flat = tf.reshape(pool2, [-1, 5*5*256])
    dense1 = tf.layers.dense(pool2_flat, units=512, activation=tf.nn.relu)
    ba5 = tf.layers.batch_normalization(dense1, center=False, scale=False, training=is_training)
    dropout1 = tf.layers.dropout(ba5, training=is_training)
    dense2 = tf.layers.dense(dropout1, units=128, activation=tf.nn.relu)
    ba6 = tf.layers.batch_normalization(dense2, center=False, scale=False, training=is_training)
    dropout2 = tf.layers.dropout(ba6, training=is_training)
    # logit out
    logits = tf.layers.dense(dropout2, units=10)
    return logits


# Exhaustive grid search: build, train and evaluate one fresh model per
# hyperparameter combination (last key varies fastest).
for p_lr, p_decay, p_momentum, p_is_reg in itertools.product(
        params['lr'], params['decay'], params['momentum'], params['is_reg']):
    tf.reset_default_graph()

    # run_model reads X, y and is_training as module-level names, so they
    # have to be rebound here at top level for every new graph.
    X = tf.placeholder(tf.float32, [None, 32, 32, 3])
    y = tf.placeholder(tf.int64, [None])
    is_training = tf.placeholder(tf.bool)

    y_out = my_model(X, y, is_training)
    total_loss = tf.losses.softmax_cross_entropy(tf.one_hot(y, 10), y_out)
    if p_is_reg:
        # NOTE(review): no layer in my_model sets a kernel_regularizer, so this
        # term looks like it is always zero and is_reg has no effect - confirm.
        total_loss = total_loss + tf.losses.get_regularization_loss()
    mean_loss = tf.reduce_mean(total_loss)
    optimizer = tf.train.RMSPropOptimizer(p_lr, decay=p_decay, momentum=p_momentum)

    # batch normalization in tensorflow requires this extra dependency
    extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(extra_update_ops):
        train_step = optimizer.minimize(mean_loss)

    EARLY_STOPPING = 5
    MAX_EPOCH = 50
    eStopAcc = []
    epoch = 0
    # The context manager closes the session once this configuration is
    # done, so sessions do not pile up across the grid.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for e in range(MAX_EPOCH):
            # one training epoch, then one pass over the validation set
            train_loss, train_acc = run_model(sess, y_out, mean_loss, X_train, y_train, 1, 64, 100, train_step)
            val_loss, val_acc = run_model(sess, y_out, mean_loss, X_val, y_val, 1, 64)
            eStopAcc.append(val_acc)
            epoch = e
            # evaluate on the test set exactly once: when validation accuracy
            # triggers early stopping or the epoch budget is exhausted
            if early_stopping(eStopAcc, EARLY_STOPPING) == 1 or e == MAX_EPOCH - 1:
                test_loss, test_acc = run_model(sess, y_out, mean_loss, X_test, y_test, 1, 64)
                break
    print("lr = {0:.4g} decay = {1:.2g} momentum = {2:.1g} is_reg = {3} stop@Epoch {4},  train_accuracy = {5:.3g} val_acc = {6:.3g} test_acc = {7:.3g}".format(p_lr,p_decay,p_momentum,p_is_reg,epoch+1,train_acc,val_acc,test_acc))


lr = 0.005 decay = 0.99 momentum = 0 is_reg = 1 stop@Epoch 18,  train_accuracy = 0.969 val_acc = 0.835 test_acc = 0.822
lr = 0.005 decay = 0.99 momentum = 0 is_reg = 0 stop@Epoch 28,  train_accuracy = 0.981 val_acc = 0.826 test_acc = 0.825
lr = 0.005 decay = 0.99 momentum = 0.1 is_reg = 1 stop@Epoch 17,  train_accuracy = 0.965 val_acc = 0.814 test_acc = 0.81
lr = 0.005 decay = 0.99 momentum = 0.1 is_reg = 0 stop@Epoch 17,  train_accuracy = 0.965 val_acc = 0.829 test_acc = 0.821
lr = 0.005 decay = 0.99 momentum = 0.2 is_reg = 1 stop@Epoch 12,  train_accuracy = 0.945 val_acc = 0.813 test_acc = 0.809
lr = 0.005 decay = 0.99 momentum = 0.2 is_reg = 0 stop@Epoch 15,  train_accuracy = 0.96 val_acc = 0.818 test_acc = 0.812
lr = 0.005 decay = 0.9 momentum = 0 is_reg = 1 stop@Epoch 11,  train_accuracy = 0.944 val_acc = 0.805 test_acc = 0.791
lr = 0.005 decay = 0.9 momentum = 0 is_reg = 0 stop@Epoch 15,  train_accuracy = 0.962 val_acc = 0.817 test_acc = 0.79
lr = 0.005 decay = 0.9 momentum = 0.1