Deep Learning
=============

Assignment 4
------------

Previously in `2_fullyconnected.ipynb` and `3_regularization.ipynb`, we trained fully connected networks to classify [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) characters.

The goal of this assignment is make the neural network convolutional.

In [7]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import tensorflow.compat.v1 as tf
from six.moves import cPickle as pickle
from six.moves import range
from tqdm import tqdm_notebook as tqdm

In [8]:
print(tf.__version__)

2.0.0


In [9]:
pickle_file = '/home/chihao/nlp/NLP_assignments/assignment-09/notMNIST.pickle'

with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help gc free up memory
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a TensorFlow-friendly shape:
- convolutions need the image data formatted as a cube (width by height by #channels)
- labels as float 1-hot encodings.

In [10]:
image_size = 28
num_labels = 10
num_channels = 1 # gray scale

import numpy as np

def reformat(dataset, labels):
    dataset = dataset.reshape(
    (-1, image_size, image_size, num_channels)).astype(np.float32)
    labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
    return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28, 1) (200000, 10)
Validation set (10000, 28, 28, 1) (10000, 10)
Test set (10000, 28, 28, 1) (10000, 10)


In [5]:
def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes.

In [11]:
batch_size = 16
patch_size = 5
depth = 16
num_hidden = 64

graph = tf.Graph()
with graph.as_default():

  # Input data.
    tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size, image_size, num_channels))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
  
  # Variables.
    layer1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.1)) # patch_size -> kernel_size, num_channels-> last dim -> this dim
    layer1_biases = tf.Variable(tf.zeros([depth]))
    layer2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
    layer3_weights = tf.Variable(tf.truncated_normal(
      [image_size // 4 * image_size // 4 * depth, num_hidden], stddev=0.1))  # fully connected layer height = 16/2 16/2  所以是16//4 
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
    layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))
  
  # Model.
    def model(data):
        conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer1_biases)
        conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer2_biases)
        shape = hidden.get_shape().as_list()
        reshape = tf.reshape(hidden, [-1, shape[1] * shape[2] * shape[3]])
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        return tf.matmul(hidden, layer4_weights) + layer4_biases
  
  # Training computation.
    logits = model(tf_train_dataset)
    loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
    
  # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)
  
  # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
    test_prediction = tf.nn.softmax(model(tf_test_dataset))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



## same padding 
最低限度填充

new = 向上取整w/s

In [7]:

num_steps = 1001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if step % 50 == 0:
            print('Minibatch loss at step %d: %f' % (step, l))
            print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
            print('Validation accuracy: %.1f%%' % accuracy(
                valid_prediction.eval(), valid_labels))
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))


Initialized
Minibatch loss at step 0: 2.909925
Minibatch accuracy: 0.0%
Validation accuracy: 9.2%
Minibatch loss at step 50: 1.700878
Minibatch accuracy: 31.2%
Validation accuracy: 57.4%
Minibatch loss at step 100: 1.300187
Minibatch accuracy: 62.5%
Validation accuracy: 71.8%
Minibatch loss at step 150: 0.286697
Minibatch accuracy: 100.0%
Validation accuracy: 76.2%
Minibatch loss at step 200: 0.809725
Minibatch accuracy: 75.0%
Validation accuracy: 76.9%
Minibatch loss at step 250: 0.622038
Minibatch accuracy: 81.2%
Validation accuracy: 78.9%
Minibatch loss at step 300: 0.486390
Minibatch accuracy: 81.2%
Validation accuracy: 79.0%
Minibatch loss at step 350: 0.497803
Minibatch accuracy: 87.5%
Validation accuracy: 80.3%
Minibatch loss at step 400: 0.943425
Minibatch accuracy: 62.5%
Validation accuracy: 78.7%
Minibatch loss at step 450: 0.820247
Minibatch accuracy: 87.5%
Validation accuracy: 80.5%
Minibatch loss at step 500: 0.592091
Minibatch accuracy: 81.2%
Validation accuracy: 80.2%
Mi

---
Problem 1
---------

The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (`nn.max_pool()`) of stride 2 and kernel size 2.

---

In [39]:
class Model:
    def __init__(self, graph=None):
        if not graph:
            self.graph = tf.Graph()
        else:
            self.graph = graph
    
    def build_graph(self, image_size, channel, num_labels, kernel_size, depth, flat_units): # keep_prob should in the train process 
        with self.graph.as_default():
            initializer = tf.initializers.glorot_uniform()
            
            self.feature = tf.placeholder(tf.float32, shape=[None, image_size, image_size, channel])
            self.labels = tf.placeholder(tf.float32, shape = [None, num_labels])
            self.keep_prob = tf.placeholder(tf.float32, shape = ())
            
            filter1= tf.Variable(initializer(shape=[kernel_size,kernel_size, channel, depth]),name="f1")
            biases1 = tf.Variable(initializer(shape=[depth]),name = "b1")
            layer1 = tf.nn.relu(tf.nn.conv2d(self.feature,filter1,[1,2,2,1],"SAME")+biases1)
            dropout1 = tf.nn.dropout(layer1,rate = self.keep_prob)
            
            filter2 = tf.Variable(initializer(shape=[kernel_size,kernel_size,depth, depth]),name = "f2")
            biases2 = tf.Variable(initializer(shape=[depth]),name="b2")
            layer2 = tf.nn.relu(tf.nn.conv2d(dropout1,filter2,strides=[1,2,2,1],padding="SAME")+biases2)
                        

            dropout2 = tf.nn.dropout(layer2,rate= self.keep_prob)
            
            max_p = tf.nn.max_pool(dropout2,4,[1,2,2,1],padding="SAME")
            dropout3 = tf.nn.dropout(max_p,rate = self.keep_prob)
            transformed_size = image_size // 8 +1
            
            drop_shape = dropout3.shape
            reshaped = tf.reshape(dropout3, shape=[-1,drop_shape[1]*drop_shape[2]*drop_shape[3]])
            weight3 = tf.Variable(initializer(shape =[transformed_size*transformed_size*depth, flat_units]),name="w3")
            biases3 = tf.Variable(initializer(shape=[flat_units]))
            
            fully_connected= tf.nn.relu(tf.matmul(reshaped,weight3)+biases3)
            
            weight4 = tf.Variable(initializer(shape = [flat_units, num_labels]),name="w4")
            biases4 = tf.Variable(initializer(shape = [num_labels]),name="b4")
            
            output_logits = tf.matmul(fully_connected,weight4)+biases4
            
            
            self.loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(logits=output_logits,labels=self.labels)
            )
            self.optimizer = tf.train.AdadeltaOptimizer(0.1).minimize(self.loss)
            
            self.prediction = tf.nn.softmax(output_logits)
            
            self.accuracy = tf.metrics.accuracy(
                tf.argmax(self.labels,axis=1),
                tf.argmax(self.prediction,axis=1))
        
    def make_dataset(self, x, y , batch_size=50):
        with self.graph.as_default():
            origin_dict = (tf.constant(x),tf.constant(y))
            dataset = tf.data.Dataset.from_tensors(origin_dict)
            print("get_success")
            dataset = dataset.shuffle(buffer_size=100).batch(batch_size=batch_size).repeat()  # buffer_size 每次从多少个数据中抽样

            dataset_iter = dataset.make_initializable_iterator()
            next_iter = dataset_iter.get_next()
            return dataset_iter,next_iter
    
    def make_batch(self,x, y,batch_size=50):
        assert len(x) == len(y)
        point = 0
        while point<len(x):
            if point+batch_size <len(y):
                yield x[point:point+batch_size],y[point:point+batch_size]
            else:
                yield x[point:],y[point:]
            point += batch_size
            
    def train(self,data_dict, epoch_num, dropout_rate=0.3):

#         dd_iter,dev_iter = self.make_dataset(data_dict["dev_x"],data_dict["dev_y"])
#         tes_iter,test_iter = self.make_dataset(data_dict["test_x"],data_dict["test_y"])
        
        with tf.Session(graph=self.graph) as sess:
            tf.local_variables_initializer().run()
            tf.global_variables_initializer().run()
#             sess.run([td_iter.initializer,dd_iter.initializer,tes_iter.initializer])
            for i in tqdm(range(epoch_num)):
                for train_x, train_y in self.make_batch(data_dict["train_x"],data_dict["train_y"]):
                    if train_x is not None:
                        train_dict = {self.feature:train_x,
                          self.labels:train_y,
                          self.keep_prob:dropout_rate}
                        _, loss = sess.run([self.optimizer,self.loss],
                               feed_dict=train_dict)
                print("loss on train %s " %loss)
                if i %5 ==0:
                    dev_dict = {self.feature:data_dict["dev_x"],
                              self.labels:data_dict["dev_y"],
                              self.keep_prob:0}
                    dev_accuracy = sess.run(self.accuracy,feed_dict=dev_dict)
                    train_accuracy = sess.run(self.accuracy,feed_dict= train_dict)
                    print("accuracy on train dataset %s" %train_accuracy[1])
                    print("accuracy on dev dataset %s" %dev_accuracy[1])
            test_dict = {self.feature:data_dict["test_x"],
                              self.labels:data_dict['test_y'],
                              self.keep_prob:0}
            test_accuracy,test_prediction = sess.run([self.accuracy,self.prediction], feed_dict=test_dict)
            print("accuracy on test dataset %s" %test_accuracy[1])
            print("accuracy on test dataset another method %s" %accuracy(test_prediction,data_dict["test_y"]))
            
    
    def evaluate(self, X, Y):
        pass  

In [40]:
new_model = Model()
new_model.build_graph(image_size,num_channels,num_labels,patch_size,depth,num_hidden)
print("build graph successfully")
data_dict = {
    "train_x":train_dataset,
    "train_y":train_labels,
    "dev_x":valid_dataset,
    "dev_y":valid_labels,
    "test_x":test_dataset, 
    "test_y":test_labels
}
new_model.train(data_dict,20)

build graph successfully


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

loss on train 1.2102065 
accuracy on train dataset 0.74845773
accuracy on dev dataset 0.749
loss on train 0.9263614 
loss on train 0.8639719 
loss on train 0.6576946 
loss on train 0.6773569 
loss on train 0.9147366 
accuracy on train dataset 0.7984577
accuracy on dev dataset 0.79850376
loss on train 0.7471905 
loss on train 0.6894289 
loss on train 0.6331315 
loss on train 0.5946794 
loss on train 0.569268 
accuracy on train dataset 0.8197015
accuracy on dev dataset 0.8197342
loss on train 0.5360697 
loss on train 0.46748993 
loss on train 0.5129256 
loss on train 0.50084114 
loss on train 0.5006099 
accuracy on train dataset 0.83221394
accuracy on dev dataset 0.8321544
loss on train 0.35892925 
loss on train 0.47431728 
loss on train 0.44417244 
loss on train 0.42213944 

accuracy on test dataset 0.8523307
accuracy on test dataset another method 93.32


In [4]:
import tensorflow as tf2
import numpy as np
print(tf2.__version__)


2.0.0


---
Problem 2
---------

Try to get the best performance you can using a convolutional net. Look for example at the classic [LeNet5](http://yann.lecun.com/exdb/lenet/) architecture, adding Dropout, and/or adding learning rate decay.

---

In [42]:
def make_batch(x, y,batch_size=50):
    assert len(x) == len(y)
    point = 0
    while point<len(x):
        if point+batch_size <len(y):
            yield x[point:point+batch_size],y[point:point+batch_size]
        else:
            yield x[point:],y[point:]
        point += batch_size

In [43]:
train_ds = tf2.data.Dataset.from_tensor_slices(
    (train_dataset,train_labels)).shuffle(1000).batch(32)

In [96]:
class MyModel(tf2.keras.models.Model):
    def __init__(self):
        super(MyModel, self).__init__()
        self.conv1 = tf2.keras.layers.Conv2D(6,[5,5],(1,1),"same",activation="tanh")
        self.dropout1 = tf2.keras.layers.Dropout(0.3)
        self.avgpool1 = tf2.keras.layers.AveragePooling2D((2,2),(2,2))
        self.conv2 = tf2.keras.layers.Conv2D(16,[5,5],(1,1),"valid",activation="tanh")
        self.dropout2 = tf2.keras.layers.Dropout(0.3)
        self.avgpool2 = tf2.keras.layers.AveragePooling2D((2,2),(2,2))
        self.conv3 = tf2.keras.layers.Conv2D(120,[5,5],(1,1),"valid",activation="tanh")
        self.dropout3 =tf2.keras.layers.Dropout(0.3)
        self.flatten = tf2.keras.layers.Flatten()
        self.fullyconnected = tf2.keras.layers.Dense(84,activation="tanh")
        self.dropout4 =tf2.keras.layers.Dropout(0.3)
        self.outputs = tf2.keras.layers.Dense(10,activation="softmax")

        
        self.optimizer = tf2.keras.optimizers.Adadelta()
        self.accuracy = tf2.keras.metrics.SparseCategoricalAccuracy()
        self.loss = tf2.keras.losses.SparseCategoricalCrossentropy()

    @tf2.function 
    def call(self,x):
        x = self.conv1(x)
        x =self.dropout1(x)
        x = self.avgpool1(x)
        x = self.conv2(x)
        x = self.dropout2(x)
        x = self.avgpool2(x)
        x = self.conv3(x)
        x = self.dropout3(x)
        x = self.flatten(x)
        x = self.fullyconnected(x)
        x =self.dropout4(x)
        return self.outputs(x)
    
#     @tf2.function
#     def update_parameters(self,loss,tape):
#         gradients = tape.gradient(loss, self.trainable_variables)
#         self.optimizer.apply_gradients(zip(gradients,self.trainable_variables))

    def train_step(self, x, y):    
        with tf.GradientTape() as tape:
            y_hat = self(x)
            loss = self.loss(tf2.argmax(y,axis=1), y_hat)
#         self.update_parameters(loss,tape)
            gradients = tape.gradient(loss, self.trainable_variables)
    
            self.optimizer.apply_gradients(zip(gradients,self.trainable_variables))
        
        return loss
    
    def make_dataset(self, x, y , batch_size=32):
        return tf2.data.Dataset.from_tensor_slices(
            (x,y)
        ).shuffle(x.shape[0]//2).batch(batch_size)
     
        
    def evaluate(self,x,y):
        return self.accuracy(tf2.argmax(y,axis=1),self(x))
    
    
    def train(self,data_dict, epoch_num):
        
        self.train_loss = tf2.metrics.Mean()
        self.train_acc = tf2.metrics.Mean()
        
        test_x, test_y = data_dict["test_x"],data_dict["test_y"]
        dev_x, dev_y = data_dict["dev_x"],data_dict['dev_y']
        train_x,train_y = data_dict["train_x"], data_dict["train_y"]
        train_data = self.make_dataset(train_x,train_y)
        for epoch in tqdm(range(epoch_num)):
            for x_batch,y_batch in train_data:
                batch_loss = self.train_step(x_batch,y_batch)
                self.train_loss(batch_loss)
#                 self.train_acc(batch_acc)
            if epoch % 1 == 0:
                print("train_loss %s" %self.train_loss.result())
#                 print("train_acc %s" %self.train_acc.result())
            if epoch % 5 == 0:
                print("dev_acc %s" %self.evaluate(dev_x,dev_y))
            self.train_loss.reset_states()
            self.train_acc.reset_states()
        print("test_acc %s" %self.evaluate(test_x,test_y))
        
                      
                
                

In [97]:
my_model = MyModel()
data_dict = {
    "train_x":train_dataset,
    "train_y":train_labels,
    "dev_x":valid_dataset,
    "dev_y":valid_labels,
    "test_x":test_dataset, 
    "test_y":test_labels
}

# my_model.evaluate(data_dict["dev_x"],data_dict["dev_y"])
my_model.train(data_dict,10)


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

train_loss tf.Tensor(1.9568226, shape=(), dtype=float32)
dev_acc tf.Tensor(0.6344, shape=(), dtype=float32)
train_loss tf.Tensor(1.3328735, shape=(), dtype=float32)
train_loss tf.Tensor(1.0834665, shape=(), dtype=float32)
train_loss tf.Tensor(0.9727717, shape=(), dtype=float32)
train_loss tf.Tensor(0.91095674, shape=(), dtype=float32)
train_loss tf.Tensor(0.8711842, shape=(), dtype=float32)
dev_acc tf.Tensor(0.71075, shape=(), dtype=float32)
train_loss tf.Tensor(0.84279865, shape=(), dtype=float32)
train_loss tf.Tensor(0.8210653, shape=(), dtype=float32)
train_loss tf.Tensor(0.80361915, shape=(), dtype=float32)
train_loss tf.Tensor(0.78911966, shape=(), dtype=float32)

test_acc tf.Tensor(0.7636667, shape=(), dtype=float32)
