## Columbia University
### ECBM E4040 Neural Networks and Deep Learning. Fall 2019.

# Train dct

### In this file, we try a new method, DCT, which can compress the data substantially. We want to see how much accuracy we sacrifice in exchange for the savings in storage space and training time.

In [1]:
import os
from datetime import datetime
import time
import numpy as np
import h5py
import random
from PIL import Image
import tensorflow as tf
import json
from model import Model

### In this part, we define the evaluator to evaluate the accuracy. 
1. First, we extract the image information from the validation set. <br>
2. Second, we reshape the images into the required shape and build batches for validation. <br>
3. Then we define the functions that build the batches, pass them through the model, compute the predictions and accuracy, and report the validation accuracy. 
<br><br>PS: Since we use images that have been preprocessed by DCT, their shape differs from that of the original images.

In [2]:
def write_summary(path_to_eval_log_dir):
    """Create and return a ``tf.summary.FileWriter`` that logs to *path_to_eval_log_dir*."""
    return tf.summary.FileWriter(path_to_eval_log_dir)

def evaluate(writer, path_to_checkpoint, path_to_tfrecords_file, num_examples, global_step):
    """Restore a checkpoint and measure its accuracy on a TFRecords file.

    A fresh graph is built on every call. Examples are read through a
    queue-based input pipeline, batched in groups of 128, and passed through
    ``Model.inference`` with dropout disabled (``drop_rate=0.0``). A prediction
    counts as correct only when the string obtained by joining its five
    predicted digits equals the string obtained by joining the five label
    digits.

    Args:
        writer: Summary writer (e.g. the object returned by ``write_summary``)
            that receives the merged evaluation summary.
        path_to_checkpoint: Checkpoint path handed to ``Saver.restore``.
        path_to_tfrecords_file: TFRecords file whose examples carry the
            'image', 'length' and 'digits' features.
        num_examples: Number of examples in the file. Only
            ``num_examples // 128`` full batches are evaluated, so any
            remainder is ignored.
        global_step: Step under which the summary is recorded.

    Returns:
        The accumulated accuracy value after all batches have been processed.
    """
    batch_size = 128
    num_batches = num_examples // batch_size

    with tf.Graph().as_default():
        filename_queue_val = tf.train.string_input_producer([path_to_tfrecords_file], num_epochs=None)

        reader = tf.TFRecordReader()
        _, serialized_example = reader.read(filename_queue_val)
        features_val = tf.parse_single_example(
                serialized_example,
                features={
                    'image': tf.FixedLenFeature([], tf.string),
                    'length': tf.FixedLenFeature([], tf.int64),
                    'digits': tf.FixedLenFeature([5], tf.int64)
                })
        # Decode the raw bytes of the DCT-preprocessed image stored in the tfrecords.
        image_val = tf.decode_raw(features_val['image'], tf.uint8)
        # Convert to float32, then shift/scale with (x - 0.5) * 2 before reshaping
        # to a single-channel 64x64 image.
        image_val = tf.image.convert_image_dtype(image_val, dtype=tf.float32)
        image_val = tf.multiply(tf.subtract(image_val, 0.5), 2)
        image_val = tf.reshape(image_val, [64, 64, 1])
        # NOTE(review): a *random* 54x54 crop makes validation accuracy vary from
        # run to run; kept as-is to preserve the existing evaluation protocol.
        image_val = tf.random_crop(image_val, [54, 54, 1])
        # Labels: number of digits and the five digit slots.
        length_val = tf.cast(features_val['length'], tf.int32)
        digits_val = tf.cast(features_val['digits'], tf.int32)

        min_queue_examples_val = int(0.4 * num_examples)
        # Build batches for validation.
        image_batch_val, length_batch_val, digits_batch_val = tf.train.batch(
            [image_val, length_val, digits_val],
            batch_size=batch_size,
            num_threads=2,
            capacity=min_queue_examples_val + 3 * batch_size)
        length_logits_val, digits_logits_val = Model.inference(image_batch_val, drop_rate=0.0)
        # Pick the highest-scoring class for the length and for every digit slot.
        length_predictions_val = tf.argmax(length_logits_val, axis=1)
        digits_predictions_val = tf.argmax(digits_logits_val, axis=2)

        labels_val = digits_batch_val
        predictions_val = digits_predictions_val

        # Join the digits of each example into one string so that the accuracy
        # metric compares whole sequences rather than individual digits.
        labels_string_val = tf.reduce_join(tf.as_string(labels_val), axis=1)
        predictions_string_val = tf.reduce_join(tf.as_string(predictions_val), axis=1)

        accuracy_vali, update_accuracy_vali = tf.metrics.accuracy(
            labels=labels_string_val,
            predictions=predictions_string_val
        )

        tf.summary.image('image', image_batch_val)
        tf.summary.scalar('accuracy', accuracy_vali)
        tf.summary.histogram('variables',
                             tf.concat([tf.reshape(var, [-1]) for var in tf.trainable_variables()], axis=0))
        summary = tf.summary.merge_all()

        with tf.Session() as sess:
            # Local variables hold the running counters of tf.metrics.accuracy.
            sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            try:
                restorer = tf.train.Saver()
                restorer.restore(sess, path_to_checkpoint)

                for _ in range(num_batches):
                    sess.run(update_accuracy_vali)

                accuracy_val, summary_val = sess.run([accuracy_vali, summary])
                writer.add_summary(summary_val, global_step=global_step)
            finally:
                # Always stop the queue-runner threads, even if restoring or
                # running the graph fails, so they do not outlive this call.
                coord.request_stop()
                coord.join(threads)

    return accuracy_val



### In this part, we define the main train function. 
1. First, we extract the image information from the tfrecords file and build batches. <br>
2. Then we pass the batches through the model, get the predictions and calculate the training loss (validation accuracy is evaluated periodically). <br>
3. We use this function either to train for a small number of steps on the same dataset under different hyperparameter settings, or to train the final model once the best settings have been selected. 
<br><br>PS: Since it takes a long time, we do not demonstrate early stopping here, but we have tried it before.

In [3]:
#Build the training process

def train(path_to_train_tfrecords_file, num_train_examples, path_to_val_tfrecords_file, num_val_examples,
           path_to_train_log_dir, training_options, history_file_path):
    """Train the model on a TFRecords file with periodic validation.

    Every 10 global steps the current loss is printed. Every 100 global steps
    the training summary is written, a 'latest.ckpt' checkpoint is saved and
    evaluated with ``evaluate``, the resulting accuracy is appended to the
    history, and a 'model.ckpt-<step>' checkpoint is saved whenever the
    accuracy beats the best value seen so far. When training finishes, the
    accuracy history is written to a CSV file and the elapsed wall-clock time
    is printed.

    Args:
        path_to_train_tfrecords_file: TFRecords file with the training examples
            ('image', 'length' and 'digits' features).
        num_train_examples: Number of training examples; used to size the
            batching queue.
        path_to_val_tfrecords_file: TFRecords file passed to ``evaluate``.
        num_val_examples: Number of validation examples passed to ``evaluate``.
        path_to_train_log_dir: Directory for training summaries and
            checkpoints; validation summaries go to its 'eval/val' subfolder.
        training_options: Dict with the keys 'batch_size', 'learning_rate',
            'decay_steps' and 'decay_rate'. An optional 'num_steps' key sets
            the number of training steps (default 100000).
        history_file_path: Path of the CSV file that receives the validation
            accuracy history.
    """
    batch_size = training_options['batch_size']
    # Number of optimisation steps; defaults to the previously hard-coded value.
    num_steps = training_options.get('num_steps', 100000)
    num_steps_to_show_loss = 10
    num_steps_to_check = 100  # interval (in steps) between validation runs
    wall_clock_start = time.time()
    with tf.Graph().as_default():
        filename_queue_train = tf.train.string_input_producer([path_to_train_tfrecords_file], num_epochs=None)

        reader = tf.TFRecordReader()
        _, serialized_example = reader.read(filename_queue_train)
        features_train = tf.parse_single_example(
                serialized_example,
                features={
                    'image': tf.FixedLenFeature([], tf.string),
                    'length': tf.FixedLenFeature([], tf.int64),
                    'digits': tf.FixedLenFeature([5], tf.int64)
                })
        # Decode the raw bytes of the DCT-preprocessed image stored in the tfrecords.
        image_train = tf.decode_raw(features_train['image'], tf.uint8)
        # Convert to float32, shift/scale with (x - 0.5) * 2, reshape to a
        # single-channel 64x64 image and take a random 54x54 crop.
        image_train = tf.image.convert_image_dtype(image_train, dtype=tf.float32)
        image_train = tf.multiply(tf.subtract(image_train, 0.5), 2)
        image_train = tf.reshape(image_train, [64, 64, 1])
        image_train = tf.random_crop(image_train, [54, 54, 1])
        # Labels: number of digits and the five digit slots.
        length_train = tf.cast(features_train['length'], tf.int32)
        digits_train = tf.cast(features_train['digits'], tf.int32)

        min_queue_examples_train = int(0.4 * num_train_examples)
        # Build batches for training.
        image_batch_train, length_batch_train, digits_batch_train = tf.train.batch(
            [image_train, length_train, digits_train],
            batch_size=batch_size,
            num_threads=2,
            capacity=min_queue_examples_train + 3 * batch_size)
        length_logits_train, digits_logits_train = Model.inference(image_batch_train, drop_rate=0.2)
        loss = Model.loss(length_logits_train, digits_logits_train, length_batch_train, digits_batch_train)

        # Plain gradient descent with a staircase exponentially decaying learning rate.
        global_step = tf.Variable(0, name='global_step', trainable=False)
        learning_rate = tf.train.exponential_decay(training_options['learning_rate'], global_step=global_step,
                                                   decay_steps=training_options['decay_steps'],
                                                   decay_rate=training_options['decay_rate'], staircase=True)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        train_op = optimizer.minimize(loss, global_step=global_step)

        tf.summary.image('image', image_batch_train)
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('learning_rate', learning_rate)
        summary = tf.summary.merge_all()

        with tf.Session() as sess:
            summary_writer = tf.summary.FileWriter(path_to_train_log_dir, sess.graph)
            evaluator = write_summary(os.path.join(path_to_train_log_dir, 'eval/val'))

            sess.run(tf.global_variables_initializer())
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            saver = tf.train.Saver()

            print ('Start training')
            best_accuracy = 0.0   # best validation accuracy seen so far
            accuracy_history = []
            try:
                for _step in range(num_steps):
                    _, loss_val, summary_val, global_step_val, learning_rate_val = sess.run(
                        [train_op, loss, summary, global_step, learning_rate])

                    if global_step_val % num_steps_to_show_loss == 0:
                        print ('%s: step %d, loss = %f ' % (
                            datetime.now(), global_step_val, loss_val))

                    if global_step_val % num_steps_to_check != 0:
                        continue

                    summary_writer.add_summary(summary_val, global_step=global_step_val)

                    # evaluate() builds its own graph and session, so the current
                    # weights are handed over through a checkpoint on disk.
                    path_to_latest_checkpoint_file = saver.save(
                        sess, os.path.join(path_to_train_log_dir, 'latest.ckpt'))
                    accuracy = evaluate(evaluator, path_to_latest_checkpoint_file, path_to_val_tfrecords_file,
                                        num_val_examples,
                                        global_step_val)
                    print ('Validation accuracy is= %f, best accuracy %f' % (accuracy, best_accuracy))
                    accuracy_history.append(accuracy)
                    if accuracy > best_accuracy:
                        path_to_checkpoint_file = saver.save(sess, os.path.join(path_to_train_log_dir, 'model.ckpt'),
                                                             global_step=global_step_val)
                        print ('Save file to: %s' % path_to_checkpoint_file)
                        best_accuracy = accuracy
            finally:
                # Stop the input threads and flush/close both event files even
                # when training is interrupted or fails.
                coord.request_stop()
                coord.join(threads)
                summary_writer.close()
                evaluator.close()

            with open(history_file_path, 'w', encoding='utf-8') as f:
                f.write('times, acuuracy\n')
                for i, d in enumerate(accuracy_history):
                    f.write(str(i) + ',' + str(d) + '\n')
            end_time = time.time()
            print(end_time - wall_clock_start)
            print ('Training progess is finished')

### In the next few parts: trial and error
We train on the same dataset under different hyperparameter settings for a small number of training steps, to choose the best model we can get in limited time. 
#### The learning rates are 0.01 and 0.005; the batch sizes are 16 and 32.

In [5]:
# Hyperparameter trial on the DCT data: batch size 32, initial learning rate 5e-3.

# Inputs: DCT tfrecords for training/validation and the JSON file with the example counts.
train_tfrecords_file = 'data/train1_dct.tfrecords'
val_tfrecords_file = 'data/val1_dct.tfrecords'
tfrecords_meta_file = 'data/meta.json'

# Outputs: log/checkpoint directory and the accuracy history kept for later analysis.
log_dir = 'logs_hyper/train32_5e3_DCT'
history_file_path = 'data/history_hyper_DCT_32_5e3.csv'

opt = dict(
    batch_size=32,
    learning_rate=5e-3,
    decay_steps=10000,
    decay_rate=0.9,
)

with open(tfrecords_meta_file, 'r') as f:
    content = json.load(f)

# Example counts per split, as recorded in the meta file.
num_train_examples, num_val_examples, num_test_examples = (
    content['num_examples'][split] for split in ('train', 'val', 'test')
)

# Run the training.
train(
    path_to_train_tfrecords_file=train_tfrecords_file,
    num_train_examples=num_train_examples,
    path_to_val_tfrecords_file=val_tfrecords_file,
    num_val_examples=num_val_examples,
    path_to_train_log_dir=log_dir,
    training_options=opt,
    history_file_path=history_file_path,
)

Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensor_slices(string_tensor).shuffle(tf.shape(input_tensor, out_type=tf.int64)[0]).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensor_slices(input_tensor).shuffle(tf.shape(input_tensor, out_type=tf.int64)[0]).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensors(tensor).repeat(num_epochs)`.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.TFRecor

In [6]:
# Hyperparameter trial on the DCT data: batch size 16, initial learning rate 5e-3.

# Inputs: DCT tfrecords for training/validation and the JSON file with the example counts.
train_tfrecords_file = 'data/train1_dct.tfrecords'
val_tfrecords_file = 'data/val1_dct.tfrecords'
tfrecords_meta_file = 'data/meta.json'

# Outputs: log/checkpoint directory and the accuracy history file.
log_dir = 'logs_hyper/train16_5e3_DCT'
history_file_path = 'data/history_hyper_DCT_16_5e3.csv'

opt = dict(
    batch_size=16,
    learning_rate=5e-3,
    decay_steps=10000,
    decay_rate=0.9,
)

with open(tfrecords_meta_file, 'r') as f:
    content = json.load(f)

# Example counts per split, as recorded in the meta file.
num_train_examples, num_val_examples, num_test_examples = (
    content['num_examples'][split] for split in ('train', 'val', 'test')
)

# Run the training.
train(
    path_to_train_tfrecords_file=train_tfrecords_file,
    num_train_examples=num_train_examples,
    path_to_val_tfrecords_file=val_tfrecords_file,
    num_val_examples=num_val_examples,
    path_to_train_log_dir=log_dir,
    training_options=opt,
    history_file_path=history_file_path,
)

Start training
2019-12-13 07:14:59.074646: step 10, loss = 13.473948 
2019-12-13 07:14:59.646773: step 20, loss = 9.512476 
2019-12-13 07:15:00.284390: step 30, loss = 7.954482 
2019-12-13 07:15:00.921274: step 40, loss = 8.714312 
2019-12-13 07:15:01.556106: step 50, loss = 7.729713 
2019-12-13 07:15:02.196643: step 60, loss = 7.596278 
2019-12-13 07:15:02.832689: step 70, loss = 6.511542 
2019-12-13 07:15:03.473407: step 80, loss = 6.764091 
2019-12-13 07:15:04.100504: step 90, loss = 7.462478 
2019-12-13 07:15:04.737865: step 100, loss = 6.782179 
INFO:tensorflow:Restoring parameters from logs_hyper/train16_5e3_DCT/latest.ckpt
Validation accuracy is= 0.021034, best accuracy 0.000000
Save file to: logs_hyper/train16_5e3_DCT/model.ckpt-100
2019-12-13 07:15:15.643290: step 110, loss = 7.045624 
2019-12-13 07:15:16.234651: step 120, loss = 7.790131 
2019-12-13 07:15:16.810633: step 130, loss = 8.152784 
2019-12-13 07:15:17.387798: step 140, loss = 7.472957 
2019-12-13 07:15:17.952720: s

In [7]:
# Hyperparameter trial on the DCT data: batch size 32, initial learning rate 1e-2.

# Inputs: DCT tfrecords for training/validation and the JSON file with the example counts.
train_tfrecords_file = 'data/train1_dct.tfrecords'
val_tfrecords_file = 'data/val1_dct.tfrecords'
tfrecords_meta_file = 'data/meta.json'

# Outputs: log/checkpoint directory and the accuracy history file.
log_dir = 'logs_hyper/train32_1e2_DCT'
history_file_path = 'data/history_hyper_DCT_32_1e2.csv'

opt = dict(
    batch_size=32,
    learning_rate=1e-2,
    decay_steps=10000,
    decay_rate=0.9,
)

with open(tfrecords_meta_file, 'r') as f:
    content = json.load(f)

# Example counts per split, as recorded in the meta file.
num_train_examples, num_val_examples, num_test_examples = (
    content['num_examples'][split] for split in ('train', 'val', 'test')
)

# Run the training.
train(
    path_to_train_tfrecords_file=train_tfrecords_file,
    num_train_examples=num_train_examples,
    path_to_val_tfrecords_file=val_tfrecords_file,
    num_val_examples=num_val_examples,
    path_to_train_log_dir=log_dir,
    training_options=opt,
    history_file_path=history_file_path,
)

Start training
2019-12-13 08:24:52.148818: step 10, loss = 11.059249 
2019-12-13 08:24:53.069054: step 20, loss = 8.892923 
2019-12-13 08:24:53.992189: step 30, loss = 7.362084 
2019-12-13 08:24:54.906037: step 40, loss = 6.704824 
2019-12-13 08:24:55.817677: step 50, loss = 7.154491 
2019-12-13 08:24:56.726217: step 60, loss = 8.021320 
2019-12-13 08:24:57.637299: step 70, loss = 6.951486 
2019-12-13 08:24:58.551263: step 80, loss = 6.644588 
2019-12-13 08:24:59.465538: step 90, loss = 7.339825 
2019-12-13 08:25:00.381012: step 100, loss = 7.385385 
INFO:tensorflow:Restoring parameters from logs_hyper/train32_1e2_DCT/latest.ckpt
Validation accuracy is= 0.021935, best accuracy 0.000000
Save file to: logs_hyper/train32_1e2_DCT/model.ckpt-100
2019-12-13 08:25:11.902781: step 110, loss = 6.914909 
2019-12-13 08:25:12.624725: step 120, loss = 8.120162 
2019-12-13 08:25:13.360496: step 130, loss = 7.119632 
2019-12-13 08:25:14.084296: step 140, loss = 7.579732 
2019-12-13 08:25:14.826667: s

In [5]:
# Hyperparameter trial on the DCT data: batch size 16, initial learning rate 1e-2.

# Inputs: DCT tfrecords for training/validation and the JSON file with the example counts.
train_tfrecords_file = 'data/train1_dct.tfrecords'
val_tfrecords_file = 'data/val1_dct.tfrecords'
tfrecords_meta_file = 'data/meta.json'

# Outputs: log/checkpoint directory and the accuracy history file.
log_dir = 'logs_hyper/train16_1e2_DCT'
history_file_path = 'data/history_hyper_DCT_16_1e2.csv'

opt = dict(
    batch_size=16,
    learning_rate=1e-2,
    decay_steps=10000,
    decay_rate=0.9,
)

with open(tfrecords_meta_file, 'r') as f:
    content = json.load(f)

# Example counts per split, as recorded in the meta file.
num_train_examples, num_val_examples, num_test_examples = (
    content['num_examples'][split] for split in ('train', 'val', 'test')
)

# Run the training.
train(
    path_to_train_tfrecords_file=train_tfrecords_file,
    num_train_examples=num_train_examples,
    path_to_val_tfrecords_file=val_tfrecords_file,
    num_val_examples=num_val_examples,
    path_to_train_log_dir=log_dir,
    training_options=opt,
    history_file_path=history_file_path,
)

Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensor_slices(string_tensor).shuffle(tf.shape(input_tensor, out_type=tf.int64)[0]).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensor_slices(input_tensor).shuffle(tf.shape(input_tensor, out_type=tf.int64)[0]).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensors(tensor).repeat(num_epochs)`.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.TFRecor

In [7]:
# Run with the selected settings on the DCT data: batch size 32, initial
# learning rate 1e-2, logged under the "_BEST" names.

# Inputs: DCT tfrecords for training/validation and the JSON file with the example counts.
train_tfrecords_file = 'data/train1_dct.tfrecords'
val_tfrecords_file = 'data/val1_dct.tfrecords'
tfrecords_meta_file = 'data/meta.json'

# Outputs: log/checkpoint directory and the accuracy history file.
log_dir = 'logs_hyper/train32_1e2_DCT_BEST'
history_file_path = 'data/history_hyper_DCT_32_1e2_BEST.csv'

opt = dict(
    batch_size=32,
    learning_rate=1e-2,
    decay_steps=10000,
    decay_rate=0.9,
)

with open(tfrecords_meta_file, 'r') as f:
    content = json.load(f)

# Example counts per split, as recorded in the meta file.
num_train_examples, num_val_examples, num_test_examples = (
    content['num_examples'][split] for split in ('train', 'val', 'test')
)

# Run the training.
train(
    path_to_train_tfrecords_file=train_tfrecords_file,
    num_train_examples=num_train_examples,
    path_to_val_tfrecords_file=val_tfrecords_file,
    num_val_examples=num_val_examples,
    path_to_train_log_dir=log_dir,
    training_options=opt,
    history_file_path=history_file_path,
)

Start training
2019-12-13 22:17:50.504859: step 10, loss = 12.002223 
2019-12-13 22:17:51.443129: step 20, loss = 12.392581 
2019-12-13 22:17:52.378854: step 30, loss = 8.400614 
2019-12-13 22:17:53.318791: step 40, loss = 6.871929 
2019-12-13 22:17:54.256651: step 50, loss = 6.963082 
2019-12-13 22:17:55.187663: step 60, loss = 8.132598 
2019-12-13 22:17:56.124670: step 70, loss = 6.903077 
2019-12-13 22:17:57.064381: step 80, loss = 6.644784 
2019-12-13 22:17:57.998415: step 90, loss = 7.348834 
2019-12-13 22:17:58.932458: step 100, loss = 7.454751 
INFO:tensorflow:Restoring parameters from logs_hyper/train32_1e2_DCT_BEST/latest.ckpt
Validation accuracy is= 0.018930, best accuracy 0.000000
Save file to: logs_hyper/train32_1e2_DCT_BEST/model.ckpt-100
2019-12-13 22:18:11.372055: step 110, loss = 6.905195 
2019-12-13 22:18:12.171631: step 120, loss = 8.209729 
2019-12-13 22:18:12.950487: step 130, loss = 7.149606 
2019-12-13 22:18:13.733021: step 140, loss = 7.671917 
2019-12-13 22:18:1