In [None]:
# https://nthu-datalab.github.io/ml/labs/13-2_Image-Caption/13-2_Image-Caption.html

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

In [2]:
import tensorflow as tf
from tensorflow.keras import layers

# You'll generate plots of attention in order to see which parts of an image
# our model focuses on during captioning
import matplotlib.pyplot as plt

# Scikit-learn includes many helpful utilities
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import re
import numpy as np
import os
import time
import json
from glob import glob
from PIL import Image
import pickle
from tqdm import tqdm

from tensorflow import keras

In [3]:
# Restrict TensorFlow to the first physical GPU and let it allocate memory on demand
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Grow GPU memory as needed instead of reserving it all up front;
        # only the first GPU is configured because it is the only one made visible below
        tf.config.experimental.set_memory_growth(gpus[0], True)
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

2 Physical GPUs, 1 Logical GPUs


2021-12-16 17:08:30.712871: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-16 17:08:30.713213: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-16 17:08:30.894900: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-16 17:08:30.895233: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-16 17:08:30.895531: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from S

In [4]:
!nvidia-smi

Thu Dec 16 17:08:31 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.29.05    Driver Version: 495.29.05    CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
|  0%   45C    P2    45W / 250W |    248MiB /  8119MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  On   | 00000000:02:00.0 Off |                  N/A |
|  0%   45C    P8    11W / 250W |      8MiB /  8119MiB |      0%      Default |
|       

In [5]:
IMAGE_DIR = './words_captcha/'
annotation_file = './words_captcha/spec_train_val.txt'

# The first TRAIN_SIZE entries of the annotation file form the training set,
# every entry after them belongs to the validation set (20000 in the provided file)
TRAIN_SIZE = 100000

# Read spec_train_val.txt; the `with` block closes the file on exit
with open(annotation_file, 'r') as f:
    lines = f.readlines()

# Image names and their ground-truth words, per split
train_img_name = []
val_img_name = []
train_annotation = []
val_annotation = []
# Number of entries parsed so far
num = 0

for line in lines:
    # split() without arguments also strips '\n' / '\r\n' and tolerates repeated spaces
    fields = line.split()
    if not fields:
        # Skip blank lines (e.g. a trailing newline at the end of the file)
        continue
    if len(fields) < 2:
        raise ValueError('Malformed annotation line (expected "<image> <word>"): {!r}'.format(line))

    img_name, word = fields[0], fields[1]
    if num < TRAIN_SIZE:
        train_img_name.append(img_name)
        train_annotation.append(word)
    else:
        val_img_name.append(img_name)
        val_annotation.append(word)
    num += 1

In [6]:
# Print the number of training and validation samples to check that the split succeeded
print(len(train_img_name), len(val_img_name))

100000 20000


In [7]:
def cal_max_length(annotations):
    """Return the length of the longest annotation, or 0 when there are none."""
    return max((len(annotation) for annotation in annotations), default=0)

In [8]:
# Character <-> id lookup tables; id 0 is reserved for the padding token
character_to_idx = {'<pad>': 0}
idx_to_character = {0: '<pad>'}
index = 1

# Assign ids to characters in order of first appearance in the training words
for annotation in train_annotation:
    for character in annotation:
        if character in character_to_idx:
            continue
        character_to_idx[character] = index
        idx_to_character[index] = character
        index += 1

In [9]:
# <start> and <end> use the fixed ids 27 and 28, which other cells of this notebook
# also hard-code. That is only valid when the training words contain exactly 26
# distinct characters (ids 1..26), so verify it instead of silently overwriting a
# real character. The check is idempotent: re-running this cell is harmless.
for token, token_idx in (('<start>', 27), ('<end>', 28)):
    owner = idx_to_character.get(token_idx, token)
    if owner != token:
        raise ValueError('id {} is already assigned to {!r}; cannot use it for {}'.format(
            token_idx, owner, token))
    character_to_idx[token] = token_idx
    idx_to_character[token_idx] = token

# vocab_size is derived from len(character_to_idx), so the ids must be exactly
# 0..len-1; a gap would produce ids outside the embedding / output range
if max(idx_to_character) != len(character_to_idx) - 1:
    raise ValueError('character ids are not contiguous: {} entries but largest id is {}'.format(
        len(character_to_idx), max(idx_to_character)))

In [10]:
# Length of every encoded sequence: the longest word plus the <start> and <end> tokens.
# Both splits are considered so a longer validation word cannot yield a ragged list.
max_len = max(cal_max_length(train_annotation), cal_max_length(val_annotation)) + 2


def encode_annotation(annotation, length):
    """Encode a word as [<start>, char ids..., <end>, <pad>...] with exactly `length` ids.

    Args:
        annotation: the ground-truth word (a string of known characters).
        length: total length of the returned list, padding included.

    Returns:
        A list of `length` integer ids.

    Raises:
        KeyError: if the word contains a character missing from `character_to_idx`.
    """
    ids = [character_to_idx['<start>']]
    ids.extend(character_to_idx[character] for character in annotation)
    ids.append(character_to_idx['<end>'])
    # Right-pad so every sequence has the same length
    ids.extend([character_to_idx['<pad>']] * (length - len(ids)))
    return ids


train_annotation_idx = [encode_annotation(annotation, max_len) for annotation in train_annotation]
val_annotation_idx = [encode_annotation(annotation, max_len) for annotation in val_annotation]

In [11]:
# Feel free to change these parameters according to your system's configuration
BATCH_SIZE = 8
BUFFER_SIZE = 5000
embedding_dim = 256
units = 512
# One id per entry of the character table (characters plus the special tokens)
# Number of full batches per epoch, used for progress display and loss averaging
vocab_size = len(character_to_idx)
num_steps = len(train_img_name) // BATCH_SIZE
val_num_steps = len(val_img_name) // BATCH_SIZE

# NOTE(review): these two values describe InceptionV3 features of shape (64, 2048) and
# look like leftovers from the original tutorial; nothing in the visible cells reads them
features_shape = 2048
attention_features_shape = 64

LEARNING_RATE = 5e-5

In [12]:
def load_image(image_name, annotation):
    """Load one captcha image from IMAGE_DIR and pass its annotation through unchanged.

    Args:
        image_name: file name without extension (string tensor); '.png' is appended.
        annotation: encoded target sequence, returned as-is.

    Returns:
        (img, annotation) where img is a float32 tensor of shape (300, 160, 3).
    """
    img = tf.io.read_file(IMAGE_DIR + image_name + '.png')
    # The files are PNGs, so use the PNG decoder rather than decode_jpeg
    img = tf.image.decode_png(img, channels=3)
    # resize height to 300, width to 160
    img = tf.image.resize(img, (300, 160))
    # NOTE: this maps pixel values to [-1, 0] (not [-1, 1]); it is kept unchanged
    # because previously saved checkpoints were trained with this scaling
    img = img/255 - 1.
    return img, annotation

In [13]:
# Input pipelines: (image name, encoded word) -> decoded image -> shuffled batches.
# Both splits use the same steps: parallel image loading, shuffling with a
# BUFFER_SIZE-element buffer, batching and prefetching.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_img_name, train_annotation_idx))
    .map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(200)
)

val_dataset = (
    tf.data.Dataset.from_tensor_slices((val_img_name, val_annotation_idx))
    .map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(200)
)

In [14]:
train_dataset

<PrefetchDataset shapes: ((None, 300, 160, 3), (None, 7)), types: (tf.float32, tf.int32)>

In [15]:
class BahdanauAttention(tf.keras.Model):
    """Additive (Bahdanau-style) attention over a set of image feature vectors.

    Args:
        units: width of the two projection layers that feed the scoring layer.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Compute the attention-weighted summary of `features` for the given state.

        Args:
            features: encoder output, shape (batch_size, locations, feature_dim).
            hidden: decoder state, shape (batch_size, hidden_size).

        Returns:
            context_vector: shape (batch_size, feature_dim).
            attention_weights: shape (batch_size, locations, 1), softmax over locations.
        """
        # Add a length-1 axis so the state broadcasts against every location
        query = tf.expand_dims(hidden, 1)

        # energy shape == (batch_size, locations, units)
        energy = tf.nn.tanh(self.W1(features) + self.W2(query))

        # self.V reduces the last axis to 1; normalise across the locations axis
        attention_weights = tf.nn.softmax(self.V(energy), axis=1)

        # Weighted sum of the feature vectors over the locations axis
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)

        return context_vector, attention_weights

In [16]:

class conv_leaky_relu(tf.keras.layers.Layer):
    """Conv2D ('same' padding) -> BatchNormalization -> LeakyReLU(0.1) building block.

    Args:
        filters: number of convolution filters.
        size: convolution kernel size.
        stride: convolution stride.
    """

    def __init__(self, filters, size, stride):
        super().__init__()
        self.conv = tf.keras.layers.Conv2D(
            filters,
            size,
            stride,
            padding="same",
            kernel_initializer=tf.keras.initializers.TruncatedNormal(),
        )
        self.batchnorm = tf.keras.layers.BatchNormalization()
        self.lkrelu = tf.keras.layers.LeakyReLU(0.1)

    def call(self, inputs, training):
        """Apply the block; `training` selects the batch-norm mode."""
        convolved = self.conv(inputs)
        normalised = self.batchnorm(convolved, training=training)
        return self.lkrelu(normalised)

In [17]:
class Feature_Extracter(tf.keras.Model):
    """Convolutional backbone trained from scratch that turns an image into a feature map.

    The network is five stages of conv_leaky_relu blocks (2, 2, 4, 4 and 4 blocks with
    64, 128, 256, 512 and 512 filters), each followed by 2x2 max pooling, and then three
    1024-filter blocks. All convolutions are 3x3, stride 1, 'same' padding, so only the
    poolings shrink the spatial size: a (batch, 300, 160, 3) input becomes (batch, 9, 5, 1024).

    NOTE: the attribute names (cr1..cr19, max_pooling1..5) are part of the checkpoint
    object graph; renaming them would make existing checkpoints unloadable.
    """

    def __init__(self):
        super(Feature_Extracter, self).__init__()
        # Stage 1: 64 filters
        self.cr1 = conv_leaky_relu(64,3,1)
        self.cr2 = conv_leaky_relu(64,3,1)
        self.max_pooling1 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))
        # Stage 2: 128 filters
        self.cr3 = conv_leaky_relu(128,3,1)
        self.cr4 = conv_leaky_relu(128,3,1)
        self.max_pooling2 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))
        # Stage 3: 256 filters
        self.cr5 = conv_leaky_relu(256,3,1)
        self.cr6 = conv_leaky_relu(256,3,1)
        self.cr7 = conv_leaky_relu(256,3,1)
        self.cr8 = conv_leaky_relu(256,3,1)
        self.max_pooling3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))
        # Stage 4: 512 filters
        self.cr9 = conv_leaky_relu(512,3,1)
        self.cr10 = conv_leaky_relu(512,3,1)
        self.cr11 = conv_leaky_relu(512,3,1)
        self.cr12 = conv_leaky_relu(512,3,1)
        self.max_pooling4 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))
        # Stage 5: 512 filters
        self.cr13 = conv_leaky_relu(512,3,1)
        self.cr14 = conv_leaky_relu(512,3,1)
        self.cr15 = conv_leaky_relu(512,3,1)
        self.cr16 = conv_leaky_relu(512,3,1)
        self.max_pooling5 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))
        # Head: 1024 filters, no further pooling
        self.cr17 = conv_leaky_relu(1024,3,1)
        self.cr18 = conv_leaky_relu(1024,3,1)
        self.cr19 = conv_leaky_relu(1024,3,1)

    def call(self, inputs, training):
        """Run the backbone; `training` is forwarded to every block's batch norm."""
        x = self.cr1(inputs,training)
        x = self.cr2(x,training)
        x = self.max_pooling1(x)
        x = self.cr3(x,training)
        x = self.cr4(x,training)
        x = self.max_pooling2(x)
        x = self.cr5(x,training)
        x = self.cr6(x,training)
        x = self.cr7(x,training)
        x = self.cr8(x,training)
        x = self.max_pooling3(x)
        x = self.cr9(x,training)
        x = self.cr10(x,training)
        x = self.cr11(x,training)
        x = self.cr12(x,training)
        x = self.max_pooling4(x)
        x = self.cr13(x,training)
        x = self.cr14(x,training)
        x = self.cr15(x,training)
        x = self.cr16(x,training)
        x = self.max_pooling5(x)
        x = self.cr17(x,training)
        x = self.cr18(x,training)
        x = self.cr19(x,training)
        return x

In [18]:
class CNN_Encoder(tf.keras.Model):
    """Projects each feature vector to `embedding_dim` dimensions and applies ReLU.

    Args:
        embedding_dim: output size of the fully connected projection.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        # Dense acts on the last axis, so an input of shape (batch_size, locations, channels)
        # becomes (batch_size, locations, embedding_dim)
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        """Return relu(fc(x))."""
        return tf.nn.relu(self.fc(x))

In [19]:
# Create the convolutional backbone and build it for 300x160 RGB inputs
# so that summary() can list the parameter count of every layer
feature_extracter = Feature_Extracter()
feature_extracter.build((None,300,160,3))
feature_extracter.summary()

Model: "feature__extracter"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv_leaky_relu (conv_leaky  multiple                 2048      
 _relu)                                                          
                                                                 
 conv_leaky_relu_1 (conv_lea  multiple                 37184     
 ky_relu)                                                        
                                                                 
 max_pooling2d (MaxPooling2D  multiple                 0         
 )                                                               
                                                                 
 conv_leaky_relu_2 (conv_lea  multiple                 74368     
 ky_relu)                                                        
                                                                 
 conv_leaky_relu_3 (conv_lea  multiple          

In [20]:
class RNN_Decoder(tf.keras.Model):
    """Attention-based GRU decoder that predicts one character per call.

    Args:
        embedding_dim: size of the character embedding vectors.
        units: width of the GRU, of the attention projections and of the hidden Dense layer.
        vocab_size: number of character ids; also the size of the output logits.
        carry_gru_state: when True, the `hidden` argument of `call` is also fed to the GRU
            as its initial state, so the recurrent state is carried across decoding steps.
            The default (False) reproduces the original behaviour, where the GRU starts
            from its default zero state on every call and `hidden` only drives the
            attention; keep it False when using checkpoints trained that way.
    """

    def __init__(self, embedding_dim, units, vocab_size, carry_gru_state=False):
        super(RNN_Decoder, self).__init__()
        self.units = units
        self.carry_gru_state = carry_gru_state

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc1 = tf.keras.layers.Dense(self.units)
        self.fc2 = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.units)

    def call(self, x, features, hidden):
        """Decode one step.

        Args:
            x: previous character ids, shape (batch_size, 1).
            features: encoder output, shape (batch_size, locations, feature_dim).
            hidden: previous decoder state, shape (batch_size, units).

        Returns:
            logits: shape (batch_size, vocab_size) for a single-step input.
            state: new GRU state, shape (batch_size, units).
            attention_weights: shape (batch_size, locations, 1).
        """
        # defining attention as a separate model
        context_vector, attention_weights = self.attention(features, hidden)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, feature_dim + embedding_dim)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU; the previous state is only
        # injected when explicitly requested (see carry_gru_state)
        if self.carry_gru_state:
            output, state = self.gru(x, initial_state=hidden)
        else:
            output, state = self.gru(x)

        # shape == (batch_size, 1, units)
        x = self.fc1(output)

        # collapse the time axis: shape == (batch_size * 1, units)
        x = tf.reshape(x, (-1, x.shape[2]))

        # output shape == (batch_size * 1, vocab_size)
        x = self.fc2(x)

        return x, state, attention_weights

    def reset_state(self, batch_size):
        """Return an all-zero initial decoder state of shape (batch_size, units)."""
        return tf.zeros((batch_size, self.units))

In [21]:
# Projection applied to the backbone features, and the attention decoder that
# emits one character id per step
encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

In [22]:
optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)
# reduction='none' keeps one loss value per sample so padding can be masked out below
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy between target ids and logits, with padded positions zeroed.

    Positions whose target id is 0 contribute 0 to the loss; the result is the
    mean over all positions of the batch (masked ones included in the count).
    """
    per_sample = loss_object(real, pred)
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_sample.dtype)
    return tf.reduce_mean(per_sample * keep)

In [23]:
# Track all three models and the optimizer in one checkpoint;
# the manager keeps only the 5 most recent checkpoints in checkpoint_path
checkpoint_path = "./checkpoints/train_vgg19"
ckpt = tf.train.Checkpoint(feature_extracter=feature_extracter,
                           encoder=encoder,
                           decoder=decoder,
                           optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

In [24]:
# Resume from the most recent checkpoint, if any. One checkpoint is saved per epoch,
# so the numeric suffix of its name ("ckpt-<n>") is the number of completed epochs.
start_epoch = 0
if ckpt_manager.latest_checkpoint:
    start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
    # Actually load the saved weights and optimizer state; without this the training
    # loop would continue from epoch `start_epoch` with freshly initialised models
    ckpt.restore(ckpt_manager.latest_checkpoint)

In [25]:
# Kept in its own cell so that re-running the training cell does not reset
# the per-epoch training-loss history collected so far
loss_plot = []

In [26]:
@tf.function
def train_step(img_tensor, target):
    """Run one teacher-forced optimisation step on a batch.

    Args:
        img_tensor: batch of preprocessed images.
        target: encoded target sequences, shape (batch_size, sequence_length).

    Returns:
        loss: the loss summed over all decoding steps (this is what is optimised).
        total_loss: `loss` divided by the sequence length, used for reporting.
    """
    loss = 0

    # Size everything from the batch itself rather than the global BATCH_SIZE,
    # so a smaller final batch cannot cause a shape mismatch
    batch_size = target.shape[0]

    # initializing the hidden state for each batch
    # because the captions are not related from image to image
    hidden = decoder.reset_state(batch_size=batch_size)

    dec_input = tf.expand_dims([character_to_idx['<start>']] * batch_size, 1)

    with tf.GradientTape() as tape:
        features = feature_extracter(img_tensor,True)
        # Flatten the spatial grid into a list of locations: (batch, locations, channels)
        features = tf.reshape(features,(features.shape[0], -1, features.shape[3]))
        features = encoder(features)

        for i in range(1, target.shape[1]):
            # passing the features through the decoder
            predictions, hidden, _ = decoder(dec_input, features, hidden)

            loss += loss_function(target[:, i], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(target[:, i], 1)

    total_loss = (loss / int(target.shape[1]))

    trainable_variables = feature_extracter.trainable_variables + encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, trainable_variables)

    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return loss, total_loss

In [27]:
EPOCHS = 10

# Special-token ids, looked up once instead of hard-coding their numeric values
start_id = character_to_idx['<start>']
end_id = character_to_idx['<end>']

for epoch in range(start_epoch, EPOCHS):
    start = time.time()
    total_loss = 0

    # ---- training ----
    for (batch, (img_tensor, target)) in enumerate(train_dataset):
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss
        print ('Epoch {} {}/{} Train Loss {:.6f}'.format(epoch + 1,batch+1,num_steps,total_loss/(batch+1)),end='\r')
    print('')

    # ---- validation: greedy decoding (no teacher forcing) ----
    equal_num = 0        # number of words predicted exactly right
    total_val_loss = 0
    val_seen = 0         # number of validation samples processed so far
    for (batch, (img_tensor, target)) in enumerate(val_dataset):
        # Use the real batch size so a smaller final batch is handled correctly
        batch_size = target.shape[0]
        val_loss = 0
        hidden = decoder.reset_state(batch_size=batch_size)
        dec_input = tf.expand_dims([start_id] * batch_size, 1)
        features = feature_extracter(img_tensor,False)
        features = tf.reshape(features,(features.shape[0], -1, features.shape[3]))
        features = encoder(features)
        result = np.full((batch_size, 1), start_id)
        for i in range(1, target.shape[1]):
            # passing the features through the decoder
            predictions, hidden, _ = decoder(dec_input, features, hidden)
            predicted_id = tf.argmax(predictions,axis=1).numpy()
            val_loss += loss_function(target[:, i], predictions)
            result = np.concatenate((result, predicted_id.reshape((batch_size,1))), axis=1)
            # feed the model's own prediction back in
            dec_input = tf.expand_dims(predicted_id, 1)
        target_array = target.numpy()
        total_val_loss += (val_loss / int(target.shape[1]))
        val_seen += batch_size
        for i in range(batch_size):
            for j in range(target_array.shape[1]):
                # A word is correct when <end> is predicted at the same position as in
                # the target and every character before it matches
                if result[i][j] == end_id and target_array[i][j] == end_id:
                    if (result[i][1:j] == target_array[i][1:j]).all():
                        equal_num+=1
                    break
        print ('Validation Accuracy {:.6f}, Validation Loss {:.6f}'.format(float(equal_num)/val_seen,total_val_loss/(batch+1)),end='\r')

    print('')

    loss_plot.append(total_loss / num_steps)

    ckpt_manager.save()

    # Accuracy over the samples actually evaluated instead of a hard-coded 20000
    val_accuracy = float(equal_num) / max(val_seen, 1)
    output_string = 'Epoch {} Train Loss {:.6f} Validation Accuracy {:.6f} Validation Loss {:.6f}\n'.format(epoch + 1,
                                                             total_loss/num_steps,val_accuracy,total_val_loss/val_num_steps)
    # The `with` block closes the log file on exit
    with open('./lab13-2_v4.log','a') as f:
        f.write(output_string)
    print ('Epoch {} Train Loss {:.6f} Validation Accuracy {:.6f}'.format(epoch + 1,
                                                             total_loss/num_steps,val_accuracy))
    print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [28]:
# Load the weights saved after the 10th epoch for the evaluation and inference cells below
# NOTE(review): the path is hard-coded; it must stay in sync with checkpoint_path and EPOCHS
ckpt.restore('./checkpoints/train_vgg19/ckpt-10')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f06d2d6bb20>

In [29]:
# Evaluate the restored model on the validation set with greedy decoding
start_id = character_to_idx['<start>']
end_id = character_to_idx['<end>']

equal_num = 0        # number of words predicted exactly right
total_val_loss = 0
val_seen = 0         # number of validation samples processed so far
for (batch, (img_tensor, target)) in enumerate(val_dataset):
    # Use the real batch size so a smaller final batch is handled correctly
    batch_size = target.shape[0]
    val_loss = 0
    hidden = decoder.reset_state(batch_size=batch_size)
    dec_input = tf.expand_dims([start_id] * batch_size, 1)
    features = feature_extracter(img_tensor,False)
    features = tf.reshape(features,(features.shape[0], -1, features.shape[3]))
    features = encoder(features)
    result = np.full((batch_size, 1), start_id)
    for i in range(1, target.shape[1]):
        # passing the features through the decoder
        predictions, hidden, _ = decoder(dec_input, features, hidden)
        predicted_id = tf.argmax(predictions,axis=1).numpy()
        val_loss += loss_function(target[:, i], predictions)
        result = np.concatenate((result, predicted_id.reshape((batch_size,1))), axis=1)
        # feed the model's own prediction back in
        dec_input = tf.expand_dims(predicted_id, 1)
    target_array = target.numpy()
    total_val_loss += (val_loss / int(target.shape[1]))
    val_seen += batch_size
    for i in range(batch_size):
        for j in range(target_array.shape[1]):
            # A word is correct when <end> is predicted at the same position as in
            # the target and every character before it matches
            if result[i][j] == end_id and target_array[i][j] == end_id:
                if (result[i][1:j] == target_array[i][1:j]).all():
                    equal_num+=1
                break
    print ('Validation Accuracy {:.6f}, Validation Loss {:.6f}'.format(float(equal_num)/val_seen,total_val_loss/(batch+1)),end='\r')

2021-12-16 17:08:45.457081: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8202
2021-12-16 17:08:46.894762: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Validation Accuracy 0.989300, Validation Loss 0.009505

In [30]:
# Test images are named a120000 ... a139999
test_img_name = ['a' + str(image_number) for image_number in range(120000, 140000)]

In [31]:
def load_test_image(image_name):
    """Load one unlabeled test image with the same preprocessing as training.

    Delegates to load_image so that decoding, resizing and scaling can never
    drift apart between the training and test pipelines.

    Args:
        image_name: file name without extension (string tensor).

    Returns:
        The preprocessed image tensor.
    """
    # There is no label for test images; load_image passes the annotation through untouched
    img, _ = load_image(image_name, None)
    return img

In [32]:
# Test pipeline: no shuffling, so predictions come out in the order of test_img_name
test_dataset = (
    tf.data.Dataset.from_tensor_slices(test_img_name)
    .map(load_test_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(200)
)

In [33]:
test_dataset

<PrefetchDataset shapes: (None, 300, 160, 3), types: tf.float32>

In [None]:
# Greedy-decode every test image and write "<image name> <predicted word>" lines
num = 0  # number of test samples written so far
start_id = character_to_idx['<start>']
end_id = character_to_idx['<end>']

# Open the output once, in write mode, so re-running this cell regenerates the file
# instead of appending a second copy of every prediction
with open('./Lab13-2_110062401.txt','w') as f:
    for batch, img_tensor in enumerate(test_dataset):
        # Use the real batch size so a smaller final batch is handled correctly
        batch_size = img_tensor.shape[0]
        hidden = decoder.reset_state(batch_size=batch_size)
        dec_input = tf.expand_dims([start_id] * batch_size, 1)
        features = feature_extracter(img_tensor,False)
        features = tf.reshape(features,(features.shape[0], -1, features.shape[3]))
        features = encoder(features)
        result = np.full((batch_size, 1), start_id)
        for i in range(1, max_len):
            # passing the features through the decoder
            predictions, hidden, _ = decoder(dec_input, features, hidden)
            predicted_id = tf.argmax(predictions,axis=1).numpy()
            result = np.concatenate((result, predicted_id.reshape((batch_size,1))), axis=1)
            dec_input = tf.expand_dims(predicted_id, 1)
        for i in range(batch_size):
            output_str = ''
            num = num+1
            # Collect characters until the first <end> token (or max_len is reached)
            for j in range(1,max_len):
                if result[i][j] == end_id:
                    break
                output_str = output_str + idx_to_character[result[i][j]]

            # The test pipeline is not shuffled, so the num-th prediction belongs to
            # the num-th entry of test_img_name
            f.write(test_img_name[num - 1] + ' ' + output_str+'\n')

## Brief Report

- 一開始使用了 idx_to_character 和 character_to_idx 來對 annotation 做編碼，字元表是 26 個字母
- 這裡中間直接使用了vgg19來做feature extractor， weight都是重train的，所以這部分是CNN_encoder。
- RNN_decoder就用回助教給的範例code，沒有更動
- 就結果來看，其實還蠻不錯的，vgg簡單暴力就有0.98的acc，有機會的話可以試試看DenseNet，也許會有更好的效果。
