In [1]:
# Captions file read by getRawImageWith5Captions(). Each non-empty line is
# expected to be an image reference, a tab, then the caption text; anything
# after a '#' in the image reference is discarded by the loader.
CAPTIONS_PATH = "Flickr8k/Flickr8k.token.txt"

In [2]:
def getRawImageWith5Captions(captions_path=None):
    """Load a captions file and group the cleaned captions by image name.

    Every non-empty line must contain an image reference, a tab, and the
    caption text. The image reference is cut at the first '#', and the
    caption is lower-cased and stripped of leading/trailing spaces and
    periods.

    Args:
        captions_path: Path of the captions file. Defaults to the
            module-level ``CAPTIONS_PATH`` when omitted.

    Returns:
        dict mapping image name -> list of cleaned caption strings, in the
        order they appear in the file.

    Raises:
        ValueError: if a non-empty line contains no tab separator.
    """
    if captions_path is None:
        captions_path = CAPTIONS_PATH

    items = dict()
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(captions_path, "r") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                continue
            # Split on the first tab only, so a caption that itself contains
            # a tab does not break the two-value unpacking.
            img_path, caption = line.split("\t", 1)
            img_path = img_path.split("#")[0]
            caption = caption.lower().strip(" .")
            items.setdefault(img_path, []).append(caption)
    return items

In [3]:
# Every image named in the captions file, mapped to its list of cleaned captions.
raw_data = getRawImageWith5Captions()

In [4]:
# Files listing the images of the train / test / validation splits.
# Forward slashes are used (as in CAPTIONS_PATH): they work on every platform,
# whereas "\F" is an invalid escape sequence and a backslash is not a path
# separator outside Windows.
all_train_img_paths = "Flickr8k/Flickr_8k.trainImages.txt"
all_test_img_paths = "Flickr8k/Flickr_8k.testImages.txt"
all_val_img_paths = "Flickr8k/Flickr_8k.devImages.txt"

In [6]:
def _read_image_list(path):
    """Return the non-empty lines of the file at ``path``, in file order.

    Blank lines are dropped so that a trailing newline at the end of the file
    does not produce a spurious empty entry in the list.
    """
    with open(path, 'r') as f:
        return [line for line in f.read().split("\n") if line]


# One list of entries per split, read from the split-list files.
all_train_img = _read_image_list(all_train_img_paths)
all_test_img = _read_image_list(all_test_img_paths)
all_val_img = _read_image_list(all_val_img_paths)

In [7]:
# For each split, keep only the listed images that have captions in raw_data.
train_raw_data = {
    name: raw_data[name] for name in all_train_img if name in raw_data
}

test_raw_data = {
    name: raw_data[name] for name in all_test_img if name in raw_data
}

val_raw_data = {
    name: raw_data[name] for name in all_val_img if name in raw_data
}

In [8]:
# Flatten the training split into a single list of caption strings,
# preserving image order and, within an image, caption order.
train_captions = [
    text
    for texts in train_raw_data.values()
    for text in texts
]

In [18]:
# Words seen fewer than min_freq times in the training captions are left out
# of the vocabulary.
min_freq = 5
special_tokens = ["<PAD>", "<START>", "<END>", "<UNK>"]

# Count word occurrences over the training captions. split() without an
# argument collapses runs of whitespace, so consecutive spaces do not create
# an empty-string "word" that could end up in the vocabulary.
word_freq = {}
for caption in train_captions:
    for word in caption.split():
        word_freq[word] = word_freq.get(word, 0) + 1

# Special tokens take the first ids, in the order listed above.
vocab = {token: idx for idx, token in enumerate(special_tokens)}
decode_vocab = {idx: token for idx, token in enumerate(special_tokens)}

# Regular words get the following ids, in first-seen order.
idx = len(special_tokens)
for word, freq in word_freq.items():
    if freq >= min_freq and word not in special_tokens:
        vocab[word] = idx
        decode_vocab[idx] = word
        idx += 1

In [44]:
# Scratch check of the truncate / terminate / pad rule on a toy sequence:
# the result always has exactly test_len items, with a 2 closing the content
# and 0s filling the remainder.
test_len = 5
test = [1, 2]
if len(test) < test_len:
    test.append(2)
    test.extend([0] * (test_len - len(test)))
else:
    test = test[:test_len - 1]
    test.append(2)
test

[1, 2, 2, 0, 0]

In [45]:
# Fixed length of every encoded caption (START and END tokens included).
max_len = 15


def make_sequence(raw_data, max_length=None, word_to_idx=None):
    """Encode every caption as a fixed-length list of token ids.

    Each caption becomes ``<START>``, its word ids, ``<END>``, then ``<PAD>``
    up to ``max_length``. Captions that are too long are cut so that
    ``<END>`` is always the last non-padding id. Words missing from the
    vocabulary are encoded as ``<UNK>``.

    Args:
        raw_data: dict mapping image name -> list of caption strings.
        max_length: length of every output sequence. Defaults to the
            module-level ``max_len``.
        word_to_idx: dict mapping token -> id; must contain the four special
            tokens. Defaults to the module-level ``vocab``.

    Returns:
        dict mapping image name -> list of id sequences, one per caption,
        in the same order as the input captions.
    """
    if max_length is None:
        max_length = max_len
    if word_to_idx is None:
        word_to_idx = vocab

    pad_id = word_to_idx["<PAD>"]
    start_id = word_to_idx["<START>"]
    end_id = word_to_idx["<END>"]
    unk_id = word_to_idx["<UNK>"]

    sequences = {}
    for image, captions in raw_data.items():
        encoded = []
        for caption in captions:
            sequence = [start_id]
            # split() without an argument ignores repeated spaces, so no
            # empty-string token is produced (it would be encoded as <UNK>).
            sequence.extend(
                word_to_idx.get(word, unk_id) for word in caption.split()
            )
            # Leave room for <END>: keep at most max_length - 1 ids.
            del sequence[max_length - 1:]
            sequence.append(end_id)
            # Pad short sequences; the multiplier is 0 when already full.
            sequence.extend([pad_id] * (max_length - len(sequence)))
            encoded.append(sequence)
        sequences[image] = encoded
    return sequences
            

In [46]:
# Encode the captions of the three splits (train, then validation, then test).
train_sequences, val_sequences, test_sequences = [
    make_sequence(split_data)
    for split_data in (train_raw_data, val_raw_data, test_raw_data)
]

In [54]:
def decode(sequence, idx_to_word=None, hidden_tokens=None):
    """Turn a sequence of token ids back into a caption string.

    Args:
        sequence: iterable of token ids.
        idx_to_word: dict mapping id -> token. Defaults to the module-level
            ``decode_vocab``.
        hidden_tokens: tokens to leave out of the text. Defaults to every
            entry of the module-level ``special_tokens`` except ``<UNK>``,
            which stays visible.

    Returns:
        The visible tokens joined by single spaces, with no leading or
        trailing space (an empty string when nothing is visible).

    Raises:
        KeyError: if an id is not present in ``idx_to_word``.
    """
    if idx_to_word is None:
        idx_to_word = decode_vocab
    if hidden_tokens is None:
        hidden_tokens = [token for token in special_tokens if token != "<UNK>"]
    hidden = set(hidden_tokens)

    words = [idx_to_word[token_id] for token_id in sequence]
    # join() puts spaces only between words; the earlier version called
    # result.strip() without using its return value, so a trailing space
    # was always returned.
    return ' '.join(word for word in words if word not in hidden)

In [55]:
# Sanity check: decode the encoded captions of one training image back to text.
sample_sequences = train_sequences['3457856049_2de173e818.jpg']
for decoded_caption in map(decode, sample_sequences):
    print(decoded_caption)

a boy is creating large splashes whilst swimming in the ocean 
a boy with black hair and dark <UNK> is swimming in murky water 
a child splashes in a lake 
a little boy jumped into the water and made a big splash 
a young boy falling into a body of water 


In [56]:
import json
import pickle
import os
# Create the output folder if needed. exist_ok=True avoids the separate
# "check, then create" step and is a no-op when the folder already exists.
os.makedirs('Processed Data', exist_ok=True)

# Save the vocabulary (token -> id).
with open('Processed Data/vocab.json', 'w') as f:
    json.dump(vocab, f)

# Save the reverse mapping (id -> token).
# NOTE: JSON object keys are always strings, so the integer ids are written
# as "0", "1", ...; convert them back with int() after loading this file.
with open('Processed Data/decode_vocab.json', 'w') as f:
    json.dump(decode_vocab, f)

In [57]:
# Save the processed sequences, one pickle file per split.
for pkl_path, split_sequences in (
    ('Processed Data/train_sequences.pkl', train_sequences),
    ('Processed Data/val_sequences.pkl', val_sequences),
    ('Processed Data/test_sequences.pkl', test_sequences),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(split_sequences, f)

In [58]:
# Summary of the preprocessing run. The *_size entries are the number of
# images (dictionary keys) in each split, not the number of captions.
metadata = dict([
    ('vocab_size', len(vocab)),
    ('max_length', max_len),
    ('min_freq', min_freq),
    ('train_size', len(train_sequences)),
    ('val_size', len(val_sequences)),
    ('test_size', len(test_sequences)),
])

with open('Processed Data/metadata.json', 'w') as f:
    json.dump(metadata, f)