*Đọc kĩ markdown*

In [1]:
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt
import string
import os
from PIL import Image
import glob
from pickle import dump, load
from time import time


In [2]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.layers import Bidirectional
from keras.layers import add
from keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.utils import load_img, img_to_array
from keras.models import Model
from keras import Input, layers
from keras import optimizers
from keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [3]:
path_to_desc = "Flickr8k/Flickr8k.token.txt"

In [4]:
def load_doc(filename):
    """Read a whole text file and return its contents as a single string.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete text of the file.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as file:
        return file.read()

# Raw text of the caption file located at path_to_desc.
doc = load_doc(path_to_desc)

In [5]:
# Store captions as key/value pairs:
# image_id -> ['caption 1', 'caption 2', 'caption 3', 'caption 4', 'caption 5']
def load_descriptions(doc):
    """Group captions by image id.

    Every line of *doc* is split on whitespace. The first token is the image
    identifier, truncated at its first '.'; the remaining tokens are joined
    with single spaces to form the caption.

    Args:
        doc: Caption file contents, one "<image id> <caption>" entry per line.

    Returns:
        dict mapping image id to the list of its captions, in file order.
    """
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # Skip lines shorter than two characters and whitespace-only lines;
        # the latter have no tokens and would fail on tokens[0].
        if len(line) < 2 or not tokens:
            continue
        # Drop everything from the first '.' onwards in the identifier.
        image_id = tokens[0].split('.')[0]
        mapping.setdefault(image_id, []).append(' '.join(tokens[1:]))
    return mapping

# Mapping of image id -> list of raw captions parsed from `doc`.
descriptions = load_descriptions(doc)

In [6]:
# Caption preprocessing: lower-case every word and drop non-alphabetic tokens.
def clean_descriptions(descriptions):
    """Normalise all captions in place.

    Each caption is split on whitespace, every word is lower-cased, and only
    words for which ``str.isalpha()`` is true are kept, so tokens containing
    digits or punctuation are removed entirely. The surviving words are joined
    with single spaces and written back into the same list slot.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
            The lists are modified in place.

    Returns:
        None.
    """
    for desc_list in descriptions.values():
        for i, desc in enumerate(desc_list):
            words = [word.lower() for word in desc.split()]
            desc_list[i] = ' '.join(word for word in words if word.isalpha())

# Clean the captions in place; the call returns nothing.
clean_descriptions(descriptions)

In [None]:
def tokenize(descriptions):
    """Tokenise every caption with the module-level ``tokenizer``.

    Relies on ``tokenize_function`` and ``tokenizer`` being defined at module
    level before this function is called.

    Args:
        descriptions: dict mapping image id -> iterable of caption strings.

    Returns:
        List of ``(image_id, input_ids)`` tuples, one per caption, where
        ``input_ids`` is the "input_ids" entry returned by
        ``tokenize_function`` for a maximum length of 200.
    """
    new_descriptions = []
    for key, val in descriptions.items():
        for caption in val:
            # tokenize_function passes is_split_into_words=True, so it must be
            # given a list of words; list(caption) would split the caption
            # into single characters instead.
            new_val = tokenize_function(caption.split(), tokenizer, 200)
            new_descriptions.append((key, new_val["input_ids"]))
    return new_descriptions

# Needs `tokenizer` and `tokenize_function` to be defined before this runs.
new_descriptions = tokenize(descriptions)


In [7]:
def save_descriptions(descriptions, filename):
    """Write all captions to a text file, one "<image id> <caption>" per line.

    Lines are separated by '\\n' with no trailing newline after the last one.

    Args:
        descriptions: dict mapping image id -> list of caption strings.
        filename: Path of the file to (over)write.

    Returns:
        None.
    """
    lines = [
        key + ' ' + desc
        for key, desc_list in descriptions.items()
        for desc in desc_list
    ]
    # The context manager flushes and closes the file even if write() raises.
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))

# Persist the cleaned captions so later cells can reload them from disk.
save_descriptions(descriptions, 'descriptions.txt')

In [8]:
# Files listing the image file names of the train / dev / test splits,
# and the folder holding the image files themselves.
path_to_train_id_file = 'Flickr8k/Flickr_8k.trainImages.txt'
path_to_val_id_file = 'Flickr8k/Flickr_8k.devImages.txt'
path_to_test_id_file ='Flickr8k/Flickr_8k.testImages.txt'
path_to_image_folder = 'Flickr8k/Flicker8k_Dataset/'

In [69]:
from transformers import BertTokenizerFast

# Load the pretrained 'bert-base-uncased' fast tokenizer.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Define max_length
max_len = 256

# Data-processing helper
def tokenize_function(en_texts, tokenizer, max_len):
    """Call *tokenizer* on pre-split text with fixed-length padding/truncation.

    Args:
        en_texts: Input forwarded as the tokenizer's first positional argument.
        tokenizer: Callable tokenizer accepting the keyword options below.
        max_len: Length used for both padding and truncation.

    Returns:
        Whatever *tokenizer* returns.
    """
    options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_len,
        "is_split_into_words": True,
    }
    return tokenizer(en_texts, **options)


In [9]:
def load_set(filename):
    """Return the set of image identifiers listed in *filename*.

    The file is read with ``load_doc``; each non-empty line contributes the
    part of the line that precedes its first '.'.

    Args:
        filename: Path of a text file with one image file name per line.

    Returns:
        set of identifier strings.
    """
    rows = load_doc(filename).split('\n')
    # Empty lines carry no identifier and are ignored.
    return {row.split('.')[0] for row in rows if len(row) >= 1}

# Identifier sets (file names cut at the first '.') for each split.
train_id = load_set(path_to_train_id_file)
val_id = load_set(path_to_val_id_file)
test_id = load_set(path_to_test_id_file)

In [10]:
# Collect the paths of all .jpg files in the folder that holds every Flickr image
# (all splits, despite the variable name).
all_train_images_in_folder = glob.glob(path_to_image_folder + '*.jpg')

In [11]:
# File names belonging to the training split. The context manager closes the
# id file instead of leaving the handle open.
with open(path_to_train_id_file, 'r') as id_file:
    train_images = set(id_file.read().strip().split('\n'))
# train_img collects the full paths of those images
train_img = []
for image in all_train_images_in_folder:  # Walk over every file in the folder
    image_name = os.path.basename(image)
    if image_name in train_images:  # Its file name belongs to the training set
        train_img.append(image)  # Keep it in the list of images used for training

In [12]:
# File names belonging to the test split. The context manager closes the
# id file instead of leaving the handle open.
with open(path_to_test_id_file, 'r') as id_file:
    test_images = set(id_file.read().strip().split('\n'))
# test_img collects the full paths of those images
test_img = []
for image in all_train_images_in_folder:
    image_name = os.path.basename(image)
    if image_name in test_images:
        test_img.append(image)

In [13]:
# File names belonging to the validation split. The context manager closes
# the id file instead of leaving the handle open.
with open(path_to_val_id_file, 'r') as id_file:
    val_images = set(id_file.read().strip().split('\n'))
# val_img collects the full paths of those images
val_img = []
for image in all_train_images_in_folder:
    image_name = os.path.basename(image)
    if image_name in val_images:
        val_img.append(image)

In [14]:
# Read the saved captions from file, keeping only the images in `dataset`.
def load_clean_descriptions(filename, dataset):
    """Load captions for the image ids contained in *dataset*.

    The file is read with ``load_doc``. On each line the first
    whitespace-separated token is the image id and the remaining tokens,
    joined with single spaces, form the caption. No start/end markers are
    added to the captions here.

    Args:
        filename: Path of the captions file ("<image id> <caption>" per line).
        dataset: Container of image ids to keep (tested with ``in``).

    Returns:
        dict mapping image id -> list of caption strings, in file order.
    """
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # Blank lines have no tokens and would fail on tokens[0].
        if not tokens:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        # Skip images that are not part of the requested set.
        if image_id not in dataset:
            continue
        descriptions.setdefault(image_id, []).append(' '.join(image_desc))
    return descriptions

# Captions of each split, reloaded from the file written by save_descriptions.
train_descriptions = load_clean_descriptions('descriptions.txt', train_id)
val_descriptions = load_clean_descriptions('descriptions.txt', val_id)
test_descriptions = load_clean_descriptions('descriptions.txt', test_id)

In [15]:
# Build the flat list of captions.
# NOTE(review): despite the name, this list also receives the validation and
# test captions, so anything derived from it sees all three splits - confirm
# that this is intended.
all_train_captions = []
for key, val in train_descriptions.items():
    for cap in val:
        all_train_captions.append(cap)
for key, val in val_descriptions.items():
    for cap in val:
        all_train_captions.append(cap)
for key, val in test_descriptions.items():
    for cap in val:
        all_train_captions.append(cap)
len(all_train_captions)

40000

In [16]:
# Keep the words whose count is at least word_count_threshold.
# NOTE(review): the original comment said "only words appearing more than
# once", but with a threshold of 1 and `>=` every counted word is kept -
# confirm which one is intended.
word_count_threshold = 1
word_counts = {}
nsents = 0  # number of captions processed
for sent in all_train_captions:
    nsents += 1
    for w in sent.split(' '):
        word_counts[w] = word_counts.get(w, 0) + 1

vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]

In [17]:
# Build the dictionaries mapping word -> index and index -> word
ixtoword = {}
wordtoix = {}

# Indices start at 1; decode_sequence treats index 0 as padding.
ix = 1
for w in vocab:
    wordtoix[w] = ix
    ixtoword[ix] = w
    ix += 1

# Compute the vocabulary size from the words in `vocab`
vocab_size = len(ixtoword) + 1 # Add 1 for the entry used for padding

In [18]:
# Add the special tokens
start_token = '<START>'
end_token = '<END>'
for special in (start_token, end_token):
    # Guard so that re-running this cell does not hand out new indices and
    # leave stale entries behind in ixtoword.
    if special not in wordtoix:
        wordtoix[special] = len(wordtoix) + 1  # next unused index
        ixtoword[wordtoix[special]] = special
# vocab_size was computed before the special tokens existed; refresh it so
# that it stays larger than every index stored in wordtoix (0 is padding).
vocab_size = len(wordtoix) + 1

In [19]:
# Convert all_train_captions to integer sequences
train_sequences = []  # Holds the captions in numeric form
for caption in all_train_captions:
    seq = []
    seq.append(wordtoix[start_token])  # Prepend the start token
    for word in caption.split(' '):  # Walk over each word of the caption
        if word in wordtoix:  # The word is in the dictionary
            seq.append(wordtoix[word])
        else:
            continue  # Skip words that are not in the dictionary
    seq.append(wordtoix[end_token])  # Append the end token
    train_sequences.append(seq)


In [20]:
def captions_to_sequences(descriptions, wordtoix, start_token="<start>", end_token="<end>"):
    """Encode every caption as a list of word indices framed by start/end ids.

    Args:
        descriptions: dict mapping image id -> iterable of caption strings.
        wordtoix: dict mapping word -> integer index. It must contain
            *start_token* and *end_token* whenever there is a caption to encode.
        start_token: Key in *wordtoix* whose index opens each sequence.
        end_token: Key in *wordtoix* whose index closes each sequence.

    Returns:
        dict mapping image id -> list of index lists, one per caption. Words
        missing from *wordtoix* are left out.

    Raises:
        KeyError: If a caption is encoded and a marker token is not in
            *wordtoix*.
    """
    def encode_caption(text):
        # Start marker, then the known words in order, then the end marker.
        ids = [wordtoix[start_token]]
        ids.extend(wordtoix[tok] for tok in text.split() if tok in wordtoix)
        ids.append(wordtoix[end_token])
        return ids

    return {
        image_id: [encode_caption(text) for text in captions]
        for image_id, captions in descriptions.items()
    }


In [21]:
# Replace the caption strings of each split with their index sequences.
train_descriptions = captions_to_sequences(train_descriptions, wordtoix,start_token, end_token)
val_descriptions = captions_to_sequences(val_descriptions, wordtoix, start_token, end_token)
test_descriptions = captions_to_sequences(test_descriptions, wordtoix, start_token, end_token)


In [22]:
# Longest encoded caption (start/end tokens included) within each split,
# then the overall maximum across the three splits.
max_length_train = max(len(seq) for captions in train_descriptions.values() for seq in captions)
max_length_test = max(len(seq) for captions in test_descriptions.values() for seq in captions)
max_length_val = max(len(seq) for captions in val_descriptions.values() for seq in captions)
max_length = max(max_length_train, max_length_test, max_length_val)

In [23]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
def add_padding(descriptions, max_length):
    """Pad or truncate every image's sequences to *max_length*.

    Each image's list of sequences is passed to ``pad_sequences`` with
    ``padding='post'`` and ``truncating='post'``.

    Args:
        descriptions: dict mapping image id -> list of integer sequences.
        max_length: Target length handed to ``pad_sequences`` as ``maxlen``.

    Returns:
        New dict mapping image id -> the ``pad_sequences`` result for that image.
    """
    return {
        image_id: pad_sequences(
            captions, maxlen=max_length, padding='post', truncating='post'
        )
        for image_id, captions in descriptions.items()
    }

In [24]:
def decode_sequence(sequence, ixtoword = ixtoword):
    """Turn a sequence of indices back into a space-separated caption.

    Index 0 (padding) is skipped, and the words '<START>' and '<END>' are
    left out of the result.

    Args:
        sequence: Iterable of integer indices.
        ixtoword: dict mapping index -> word; defaults to the module-level
            mapping as it exists when this function is defined.

    Returns:
        The decoded words joined by single spaces.

    Note:
        An index missing from *ixtoword* yields ``None``, which ``str.join``
        rejects with a TypeError.
    """
    markers = ['<START>', '<END>']
    # Look up every non-padding index first, then filter out the markers.
    looked_up = [ixtoword.get(idx) for idx in sequence if not idx == 0]
    return ' '.join([word for word in looked_up if word not in markers])


In [25]:
# Pad the sequences of every split to the common max_length.
train_descriptions = add_padding(train_descriptions, max_length)
val_descriptions = add_padding(val_descriptions, max_length)
test_descriptions = add_padding(test_descriptions, max_length)

In [26]:
# Save train_descriptions
with open('train_descriptions.pkl', 'wb') as train_file:
    dump(train_descriptions, train_file)

# Save val_descriptions
with open('val_descriptions.pkl', 'wb') as val_file:
    dump(val_descriptions, val_file)

# Save test_descriptions
with open('test_descriptions.pkl', 'wb') as test_file:
    dump(test_descriptions, test_file)


In [32]:
# Sanity check: reload the pickled test captions and decode the first
# sequence of the first image back into words.
with open('test_descriptions.pkl', 'rb') as file:
    data = load(file)
print(decode_sequence(list(data.values())[0][0]))

a blond woman in a blue shirt appears to wait for a ride


*Run đến đây thôi nha*

In [181]:
# Flatten the descriptions into a list of individual lines
def to_lines(descriptions):
    """Return every description of every key as one flat list.

    Args:
        descriptions: dict mapping a key -> iterable of descriptions.

    Returns:
        List containing all descriptions, in dict order and then in each
        key's own order.
    """
    all_desc = list()
    for desc_list in descriptions.values():
        # extend() replaces a list comprehension that was used only for its
        # append side effect and built a throwaway list of None.
        all_desc.extend(desc_list)
    return all_desc

In [None]:
# Compute the length of the longest description
def max_length(descriptions):
    """Return the largest whitespace-separated word count of any description.

    Args:
        descriptions: dict accepted by ``to_lines``; each description must
            provide ``split()``.

    Returns:
        The maximum number of words found in a single description.

    Raises:
        ValueError: If there are no descriptions at all.
    """
    word_totals = [len(text.split()) for text in to_lines(descriptions)]
    return max(word_totals)

# NOTE(review): this rebinds the name `max_length` from the function to its
# result, so the function can no longer be called afterwards and re-running
# this line fails - confirm this is acceptable.
max_length = max_length(train_descriptions)

In [None]:
# Load an image and resize it to the input size Inception v3 requires.
def preprocess(image_path):
    """Load an image file and prepare it as a one-image model input batch.

    Args:
        image_path: Path of the image file.

    Returns:
        The result of ``preprocess_input`` applied to the 299x299 image array
        with a leading axis added.
    """
    # Resize to 299x299 while loading, then convert to an array.
    pixels = img_to_array(load_img(image_path, target_size=(299, 299)))
    # Insert one extra leading dimension.
    batch = np.expand_dims(pixels, axis=0)
    return preprocess_input(batch)

In [None]:
# Initialise the Inception v3 model (ImageNet weights) used to produce features for our images
model = InceptionV3(weights='imagenet')
# Feature extractor: same input, but output taken from the second-to-last layer.
model_new = Model(model.input, model.layers[-2].output)

In [None]:
# Turn an input image into a feature vector, e.g. (2048, )
def encode(image):
    """Compute the feature vector of one image with ``model_new``.

    Args:
        image: Path of the image file, as accepted by ``preprocess``.

    Returns:
        1-D array: the model prediction reshaped from (1, N) to (N,).
    """
    batch = preprocess(image)
    prediction = model_new.predict(batch)
    # Drop the leading batch axis: (1, N) -> (N,)
    return np.reshape(prediction, prediction.shape[1])

In [None]:
# Loop over the training images and turn each one into a feature vector,
# keyed by the image's file name.
encoding_train = {}
for img in train_img:
    encoding_train[os.path.basename(img)] = encode(img)
# Save the training image embeddings
with open("encoded_train_images.pkl", "wb") as encoded_pickle:
    dump(encoding_train, encoded_pickle)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 185ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [None]:
# Loop over the test images and turn each one into a feature vector,
# keyed by the image's file name.
encoding_test = {}
for img in test_img:
    encoding_test[os.path.basename(img)] = encode(img)
with open("encoded_test_images.pkl", "wb") as encoded_pickle:
    dump(encoding_test, encoded_pickle)

# Reload the saved training features; the context manager closes the file
# handle instead of leaving it open.
with open("encoded_train_images.pkl", "rb") as encoded_pickle:
    train_features = load(encoded_pickle)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 146ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 184ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [None]:
# Loop over the validation images and turn each one into a feature vector,
# keyed by the image's file name, then save the result.
encoding_val = {}
for img in val_img:
    encoding_val[os.path.basename(img)] = encode(img)
with open("encoded_val_images.pkl", "wb") as encoded_pickle:
    dump(encoding_val, encoded_pickle)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [None]:
# Load the GloVe vectors used to embed words
glove_dir = ''
embeddings_index = {}  # word -> float32 coefficient vector

# The context manager closes the file even if a line fails to parse.
with open(os.path.join(glove_dir, 'glove.6B.200d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        # First field is the word; the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

embedding_dim = 200

In [None]:
embedding_dim = 200
# Build the embedding matrix for the vocabulary: one 200-dimensional row per index
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Loop over the words in the word -> index dictionary
for word, i in wordtoix.items():
    # Look up the word's GloVe embedding
    embedding_vector = embeddings_index.get(word)
    # Words without a GloVe vector keep their all-zero row
    # NOTE(review): the assignment below requires i < vocab_size - confirm
    # that vocab_size covers every index stored in wordtoix.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector