In [1]:
import json
import random
from collections import Counter
import torch
from torch.nn.utils.rnn import pad_sequence

In [2]:
# Path of the raw DAE dataset JSON file.
DAE_JSON = '/kaggle/input/dae-dataset/dae_dataset.json'

# Read the file as UTF-8 text and parse it in one go; the context manager
# closes the handle as soon as the content has been read.
with open(DAE_JSON, mode='r', encoding='utf8') as dae_file:
    dae_data = json.loads(dae_file.read())

In [3]:
# Vocabulary used by the tokenizer below (token string -> id).
# The encoding is given explicitly: without it open() falls back to the
# platform locale, which can mis-decode non-ASCII vocabulary entries.
with open('/kaggle/input/food-iac-fine-tune-dataset/preprocessed_dataset/wordmap_all.json', 'r', encoding='utf8') as f:
    word_map = json.load(f)

In [4]:
def tokenize(sentence, word_map):
    """Convert a sentence into a list of vocabulary ids.

    The sentence is lower-cased and split on whitespace only (no punctuation
    handling or other pre-cleaning). Every token that is not a key of
    ``word_map`` is replaced by the id stored under ``'<unk>'``.

    Args:
        sentence: Text to encode.
        word_map: Mapping from token string to id; must contain ``'<unk>'``
            whenever the sentence has at least one token.

    Returns:
        A list with one id per whitespace-separated token.
    """
    ids = []
    for token in sentence.lower().split():
        fallback = word_map['<unk>']
        ids.append(word_map.get(token, fallback))
    return ids

In [5]:
# Encode every sample as a pair of id tensors: the concatenated aspect
# captions (input) and the first reference caption (target).
input_seqs, target_seqs = [], []
for sample in dae_data:
    captions = sample['captions']
    # Join the aspect captions in a fixed order, each prefixed by a bracketed
    # aspect tag.
    # NOTE(review): the tags go through tokenize() like any other word, so
    # they become <unk> unless word_map has entries such as '[subject]' - confirm.
    input_txt = ' '.join(
        f"[{aspect}] {captions[aspect]}"
        for aspect in (
            'general_impression',
            'subject',
            'use_of_camera',
            'color_light',
            'composition',
            'dof_and_focus',
        )
    )
    # Target: the first reference caption, or an empty string when there is none.
    references = sample['reference']
    target_txt = references[0] if references else ''

    # Wrap each id sequence as <start> ... <end> and store it as a long tensor.
    bos_id, eos_id = word_map['<start>'], word_map['<end>']
    for text, bucket in ((input_txt, input_seqs), (target_txt, target_seqs)):
        wrapped_ids = [bos_id, *tokenize(text, word_map), eos_id]
        bucket.append(torch.tensor(wrapped_ids, dtype=torch.long))

In [6]:
# Right-pad inputs and targets (each to its own longest sequence) with the
# <pad> id, giving two batch-first 2-D tensors.
input_seqs_pad, target_seqs_pad = (
    pad_sequence(seqs, batch_first=True, padding_value=word_map['<pad>'])
    for seqs in (input_seqs, target_seqs)
)

In [7]:
# Persist the padded tensors together with the vocabulary in a single file.
# The path is relative, so the file lands in the current working directory.
torch.save(
    dict(
        input_seqs=input_seqs_pad,
        target_seqs=target_seqs_pad,
        word_map=word_map,
    ),
    'dae_preprocessed.pt',
)

In [8]:
import torch

# Reload the preprocessed dataset from disk.
# Note: this rebinds dae_data, input_seqs, target_seqs and word_map to the
# contents of the saved file.
dae_data = torch.load('/kaggle/working/dae_preprocessed.pt')
input_seqs, target_seqs, word_map = (
    dae_data[key] for key in ('input_seqs', 'target_seqs', 'word_map')
)
# Reverse vocabulary (id -> token) used by decode().
inv_word_map = dict(zip(word_map.values(), word_map.keys()))

print(f"Input tensor shape: {input_seqs.shape}")   # (num_samples, max_input_len)
print(f"Target tensor shape: {target_seqs.shape}") # (num_samples, max_target_len)

# Helper to turn a sequence of token ids back into text
def decode(tokens):
    """Decode a sequence of id elements into a space-joined string.

    Each element must expose ``.item()`` (e.g. the elements of a 1-D tensor).
    Ids are looked up in the module-level ``inv_word_map``; unknown ids are
    rendered as ``'<unk>'``. Decoding stops at the first ``'<end>'`` token,
    and ``'<pad>'`` / ``'<start>'`` tokens are skipped.
    """
    pieces = []
    for element in tokens:
        token = inv_word_map.get(element.item(), '<unk>')
        if token == '<end>':
            break
        if token in ('<pad>', '<start>'):
            continue
        pieces.append(token)
    return ' '.join(pieces)

# Debug: show up to 5 random examples.
# The sample size is clamped to the dataset size because random.sample()
# raises ValueError when asked for more items than the population holds.
import random
idxs = random.sample(range(len(input_seqs)), min(5, len(input_seqs)))
for i in idxs:
    print(f"\nSAMPLE #{i}")
    # Raw ids (including trailing padding), followed by the decoded text.
    print("Input token ids:  ", input_seqs[i].tolist())
    print("Target token ids: ", target_seqs[i].tolist())
    print("Decoded Input:    ", decode(input_seqs[i]))
    print("Decoded Target:   ", decode(target_seqs[i]))

# Quick statistics
total = input_seqs.shape[0]
# Total number of <unk> tokens in each tensor, computed once. The previous
# version summed the same per-row counts `total` times, which was
# quadratic work producing values that were never used.
unk_input = int((input_seqs == word_map['<unk>']).sum())
unk_target = int((target_seqs == word_map['<unk>']).sum())
print(f"\nJumlah sample: {total}")
print(f"Jumlah total <unk> di input:  {unk_input}")
print(f"Jumlah total <unk> di target: {unk_target}")
# Average length = number of non-pad tokens divided by the number of samples.
print(f"Rata-rata panjang input:  {float((input_seqs != word_map['<pad>']).sum()/total):.2f}")
print(f"Rata-rata panjang target: {float((target_seqs != word_map['<pad>']).sum()/total):.2f}")

# Look for empty/odd sequences: flag every sample whose input or target row
# lacks a <start> token, and likewise for <end>.
start_id, end_id = word_map['<start>'], word_map['<end>']
# Per-row flags, computed once for the whole dataset.
lacks_start = ~((input_seqs == start_id).any(dim=1) & (target_seqs == start_id).any(dim=1))
lacks_end = ~((input_seqs == end_id).any(dim=1) & (target_seqs == end_id).any(dim=1))
for i in range(total):
    if lacks_start[i]:
        print(f"[WARNING] Sample {i} ada sequence tanpa <start> token.")
    if lacks_end[i]:
        print(f"[WARNING] Sample {i} ada sequence tanpa <end> token.")

print("\nDebug selesai!")

Input tensor shape: torch.Size([7221, 178])
Target tensor shape: torch.Size([7221, 757])

SAMPLE #789
Input token ids:   [8840, 8839, 3826, 4470, 152, 3845, 255, 152, 8839, 4117, 8015, 5038, 7982, 4792, 8839, 4140, 6899, 3826, 4470, 152, 8839, 5295, 152, 3310, 4117, 1, 183, 5001, 5295, 152, 3310, 4117, 1, 183, 5001, 5295, 152, 3310, 255, 152, 833, 4117, 1, 183, 5001, 8839, 3826, 4470, 152, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 8839, 3826, 4617, 152, 8839, 8839, 152, 4480, 4117, 1, 694, 3595, 8839, 3826, 4470, 152, 3845, 5295, 152, 8839, 255, 152, 8839, 4117, 1, 183, 2084, 8839, 3826, 4470, 152, 1434, 255, 152, 1434, 355, 1, 183, 1924, 255, 152, 3827, 4117, 1, 183, 8015, 5038, 8841, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Target token ids:  [884

In [9]:
# Per-sample statistics for inputs and targets.
unk_id, pad_id = word_map['<unk>'], word_map['<pad>']

# Number of <unk> tokens in each row.
unk_input_counts = input_seqs.eq(unk_id).sum(dim=1)
unk_target_counts = target_seqs.eq(unk_id).sum(dim=1)
# Number of non-pad tokens in each row (the sequence length).
input_lens = input_seqs.ne(pad_id).sum(dim=1)
target_lens = target_seqs.ne(pad_id).sum(dim=1)

# The percentage is the mean of the per-sample <unk> ratios, not the global
# ratio of <unk> tokens to all tokens.
print(f"Rata-rata <unk> di input: {unk_input_counts.float().mean():.2f} per sample ({(unk_input_counts/input_lens).float().mean()*100:.2f}%)")
print(f"Rata-rata <unk> di target: {unk_target_counts.float().mean():.2f} per sample ({(unk_target_counts/target_lens).float().mean()*100:.2f}%)")

Rata-rata <unk> di input: 11.08 per sample (10.72%)
Rata-rata <unk> di target: 0.76 per sample (2.36%)


In [10]:
# Report min / max / mean sequence length (non-pad token count) for the
# inputs and then the targets.
for label, lens in (("Input", input_lens), ("Target", target_lens)):
    print(f"{label} min/max/mean length: {lens.min()}, {lens.max()}, {lens.float().mean():.1f}")

Input min/max/mean length: 52, 178, 104.9
Target min/max/mean length: 7, 757, 34.5


In [12]:
# Drop samples whose target has more than max_target_len non-pad tokens
# (the count includes <start> and <end>), then shrink both tensors to the
# longest remaining sequence.
max_target_len = 128
pad_id = word_map['<pad>']


def _trim_trailing_pad(seqs, pad_value):
    """Return `seqs` (2-D, batch-first) without trailing all-padding columns.

    The width is taken from the last column holding any non-pad value, so a
    pad id occurring inside a sequence does not cut it short. An input with
    no non-pad value at all (including zero rows) yields width 0.
    """
    used_cols = (seqs != pad_value).any(dim=0).nonzero()
    width = int(used_cols.max()) + 1 if used_cols.numel() else 0
    # contiguous() copies the slice so the result does not keep the full-width
    # storage alive.
    return seqs[:, :width].contiguous()


# Row mask of the samples to keep.
keep_mask = (target_seqs != pad_id).sum(dim=1) <= max_target_len
# Kept rows at their original (untrimmed) width, as lists of 1-D tensors.
new_input = list(input_seqs[keep_mask])
new_target = list(target_seqs[keep_mask])

# The rows come from already padded tensors and all share one width, so
# calling pad_sequence on them again would leave that width unchanged.
# Trim the trailing padding explicitly instead.
input_seqs_pad = _trim_trailing_pad(input_seqs[keep_mask], pad_id)
target_seqs_pad = _trim_trailing_pad(target_seqs[keep_mask], pad_id)
print("Setelah filtering, jumlah sample:", input_seqs_pad.shape[0])

Setelah filtering, jumlah sample: 6891
