# ch3.caption using RNN & EMBEDDING 

In [38]:
import os
annotation_dir = '/Users/jiae/Documents/tf2_real_project/ch3/Flickr8k_text'

In [85]:
def read_file(file_name):
    with open(os.path.join(annotation_dir,file_name),'rb') as file_handle:
        file_lines = file_handle.read().splitlines()
    return file_lines

In [86]:
train_image_paths = read_file('Flickr_8k.trainImages.txt')
test_image_paths = read_file('Flickr_8k.testImages.txt')
captions = read_file('Flickr8k.token.txt')

print(len(train_image_paths))
print(len(test_image_paths))
print(len(captions))

6000
1000
40460


In [51]:
def read_file_str(file_name):
    with open(os.path.join(annotation_dir,file_name),'r') as file_handle:
        file_lines = file_handle.read().splitlines()
    return file_lines
train_image_paths_str = read_file_str('Flickr_8k.trainImages.txt')
test_image_paths_str = read_file_str('Flickr_8k.testImages.txt')

In [48]:
def get_vocab():
    image_caption_map = {}
    unique_words1 = set()
    max_words = 0
    for caption in captions:
        caption = caption.decode("utf-8")
        image_name = caption.split("#")[0]
        
        image_caption = caption.split("#")[1].split('\t')[1]
        
        if image_name not in image_caption_map.keys():
            image_caption_map[image_name] = [image_caption]
        else:
            image_caption_map[image_name].append(image_caption)
        caption_words = image_caption.split()
        max_words = max(max_words, len(caption_words))
        [unique_words1.add(caption_word) for caption_word in caption_words]
        
        unique_words = list(unique_words1)
        word_to_index_map = {}
        index_to_word_map = {}
        for index, unique_word in enumerate(unique_words):
            word_to_index_map[unique_word] = index
            index_to_word_map[index] = unique_words
        #print(max_words)
    return image_caption_map,max_words,unique_words,word_to_index_map,index_to_word_map

In [42]:
import tensorflow as tf
import pickle
import os
import numpy as np

In [66]:
class ImageModel:
    
    def __init__(self):
        vgg_model = tf.keras.applications.vgg16.VGG16(
    include_top=True, weights='imagenet')
#         inputs = tf.keras.Input(shape=(224,224))
#         outputs = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x)
        self.model = tf.keras.Model(inputs=vgg_model.input, 
                                    outputs=vgg_model.get_layer('fc2').output)
    
    @staticmethod
    def load_preprocess_image(image_path):
        image_array = tf.keras.preprocessing.image.load_img(image_path,
                                                           target_size=(224,224))
        image_array = tf.keras.preprocessing.image.img_to_array(image_array)
        image_array = np.expand_dims(image_array, axis=0)
        image_array = tf.keras.applications.resnet50.preprocess_input(image_array)
        return image_array
    
    def extract_feature_from_image_path(self,image_path):
        image_array = self.load_preprocess_image(image_path)
        features = self.model.predict(image_array)
        return features.reshape((4096,1))
    
    def extract_feature_from_image_paths(self,work_dir,image_names):
        features = {}
        for image_name in image_names:
            image_path = os.path.join(work_dir,image_name)
            feature = self.extract_feature_from_image_path(image_path)
            image_id = image_name.decode('utf-8')
            features[image_id] = feature
        return features
    
    def extract_features_and_save(self,work_dir,image_names,file_name):
        features = self.extract_feature_from_image_paths(work_dir,image_names)
        
        with open(file_name,'wb') as p:
            pickle.dump(features,p)
        
    

In [67]:
I = ImageModel()
I.extract_features_and_save(b'Flicker8k_Dataset',train_image_paths,
                           'train_image_features.p')
I.extract_features_and_save(b'Flicker8k_Dataset',test_image_paths,
                           'test_image_features.p')

In [70]:
with open('train_image_features.p', 'rb') as p:
    sample = pickle.load(p)

### dataset 만들기 

In [87]:
image_caption_map,max_words,unique_words,word_to_index_map,index_to_word_map = get_vocab()
vocabulary_size = len(unique_words)

In [94]:
import string

def clean_description(descriptions):
    table = str.maketrans('','',string.punctuation)
    for key, desc_list in descriptions.items():
        for i in range(len(desc_list)):
            desc = desc_list[i]
            desc = desc.split()
            desc = [word.lower() for word in desc]
            desc = [w.translate(table) for w in desc]
            desc = [word for word in desc if len(word)>1]
            desc = [word for word in desc if word.isalpha()]
            desc_list[i] = ' '.join(desc)
    return descriptions

clean_image_caption_map = clean_description(image_caption_map)

In [97]:
set(clean_image_caption_map.items())

TypeError: unhashable type: 'list'

In [95]:
def to_vocabulary(descriptions):
    all_desc = set()
    for key in descriptions.keys():
        [all_desc.update(d.spilt()) for d in descriptions[key]]
    return all_desc
vocabulary = to_vocabulary(clean_image_caption_map)

AttributeError: 'str' object has no attribute 'spilt'

In [56]:
train_y = {}
for img_idx in train_image_paths_str:
    captions = image_caption_map[img_idx]
    dic = {}
    for i,caption in enumerate(captions):
        words = caption.split()
        y = [word_to_index_map[w] for w in words]
        y_one_hot = tf.one_hot(y,vocabulary_size)
#         dic[i] = y

#     train_y[img_idx] = dic

In [61]:
y_one_hot

<tf.Tensor: shape=(16, 9627), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [59]:
print(y_one_hot.shape)
print(len(y))

(16, 9627)
16


In [91]:
image_model = tf.keras.Sequential()
image_model.add(tf.keras.layers.Dense(128, input_dim = 4096, activation = 'relu'))
image_model.add(tf.keras.layers.RepeatVector(max_words))
image_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               524416    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 38, 128)           0         
Total params: 524,416
Trainable params: 524,416
Non-trainable params: 0
_________________________________________________________________


In [92]:
lang_model = tf.keras.Sequential()
lang_model.add(tf.keras.layers.Embedding(vocabulary_size, 256, input_length=max_words))
lang_model.add(tf.keras.layers.LSTM(256, return_sequences = True))
lang_model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(128)))
lang_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 38, 256)           2464512   
_________________________________________________________________
lstm (LSTM)                  (None, 38, 256)           525312    
_________________________________________________________________
time_distributed (TimeDistri (None, 38, 128)           32896     
Total params: 3,022,720
Trainable params: 3,022,720
Non-trainable params: 0
_________________________________________________________________


In [93]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Concatenate([image_model, lang_model]))
model.add(tf.keras.layers.LSTM(1000,return_sequences = False))
model.add(tf.keras.layers.Dense(vocabulary_size,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer = tf.keras.optimizers.RMSprop(learning_rate = 0.1),metrics = ['accuracy'])

model.summary()

ValueError: This model has not yet been built. Build the model first by calling `build()` or calling `fit()` with some data, or specify an `input_shape` argument in the first layer(s) for automatic build.