# QnA GRU

In [None]:
!pip install -q rouge_score evaluate

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.12.0 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.8.4.1 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cudnn-cu12 9.3.0.75 which is incomp

In [2]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, get_scheduler
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split

import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import evaluate
from tqdm.auto import tqdm

2025-05-11 12:19:53.384412: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746965993.625126      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746965993.688516      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Dataset
Fungsi-fungsi ini untuk membantu memuat dan membagi data untuk pelatihan dan evaluasi. 
1. `load_data`: memuat dataset dari Hugging Face Hub dengan filter kolom 'lang' dan dibatasi jumlah baris. Hasil dari fungsi ini adalah Dataframe.
2. `split_dataframe`: Membagi data menjadi data latih dan validasi secara acak sesuai proporsi yang ditentukan.
3. `QADataset`: Mengubah DataFrame menjadi dataset PyTorch yang siap digunakan untuk DataLoader. Setiap sampel terdiri dari pertanyaan, konteks (300 karakter pertama), dan jawaban, yang kemudian di-tokenisasi sesuai format generatif.

In [3]:
  
def load_data(
    dataset_name: str = "lib3m/lib3m_qa_dataset_v1",
    split: str = "train",
    lang: str = "en",
    row:int = 100000
) -> pd.DataFrame:
    ds = load_dataset(dataset_name, split=split)
    df = ds.to_pandas()
    df = df[df.language == lang].reset_index(drop=True)[:row]
    return df


def split_dataframe(
    df,
    test_size: float = 0.2,
    random_state: int = 42
) -> tuple:

    train_df, val_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        shuffle=True
    )
    
    return train_df.reset_index(drop=True), val_df.reset_index(drop=True)

class QADataset(Dataset):
    def __init__(
        self,
        dataframe,
        tokenizer: AutoTokenizer,
        max_length: int = 512,
    ):
        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        question = row['question']
        content = row['content'][:300] # take 300 first characters 
        answer = row['answer']

        # Generative QA
        text = f"<question> {question} <context> {content} <answer>"
        tokenized = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        labels = self.tokenizer(
            answer,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        ).input_ids
        labels[labels == self.tokenizer.pad_token_id] = -100
        return {
            'input_ids': tokenized.input_ids.squeeze(),
            'attention_mask': tokenized.attention_mask.squeeze(),
            'labels': labels.squeeze()
        }

## Model
Model GRUGenerator untuk tugas Question Answering generatif. Model ini terdiri dari tiga komponen utama:
- Embedding Layer: Mengubah token input menjadi representasi vektor berdimensi embed_dim.
- GRU Layer: Memproses urutan embedding menggunakan beberapa layer GRU untuk menangkap informasi sekuensial.
- Linear Layer: Mengubah output GRU menjadi prediksi token pada setiap langkah waktu.

In [None]:
class GRUGenerator(nn.Module):
    def __init__(self, vocab_size, embed_dim=768, hidden_dim=768, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_ids):
        x = self.embedding(input_ids)
        outputs, _ = self.gru(x)
        logits = self.fc(outputs)
        return logits
    
    def forward_step(self, last_token_ids, hidden):
        """Untuk inference: hanya satu timestep"""
        emb = self.embedding(last_token_ids)                  # [batch, 1, embed_dim]
        out, hidden = self.gru(emb, hidden)                   # out: [batch,1,hidden]
        logits = self.fc(out[:, -1, :])                       # [batch, vocab]
        return logits, hidden

## Configurations

In [5]:
MODEL_DIR = '/kaggle/working/gru_model'
BATCH_SIZE = 16
EPOCHS = 2
LR = 1e-3
MAX_LEN = 256
NUM_LAYERS=5
EMBED_DIM=512
HIDDEN_DIM=768
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Use both GPUs
if torch.cuda.device_count() > 1:
    MULTI_GPU = True
else:
    MULTI_GPU = False

## Data Preparation
- Mempersiapkan dataset untuk pelatihan dan evaluasi.
- Menambahkan juga token khusus untuk format QnA (`<question>`, `<context>`, `<answer>`).
- Menyiapkan DataLoader agar data dapat di-batch dan di-shuffle secara efisien selama proses pelatihan dan evaluasi.

In [6]:
df = load_data()
train_df, val_df = split_dataframe(df)
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({'additional_special_tokens': ['<question>', '<context>', '<answer>']})

train_ds = QADataset(train_df, tokenizer, max_length=MAX_LEN)
val_ds = QADataset(val_df, tokenizer, max_length=MAX_LEN)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True,num_workers=4)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True, num_workers=4)

README.md:   0%|          | 0.00/9.64k [00:00<?, ?B/s]

qa_pairs.parquet:   0%|          | 0.00/724M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/337525 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## Training
Kode ini menjalankan proses pelatihan untuk model generatif berbasis GRU (GRUGenerator). 
- Inisialisasi Model dan Optimizer: Model GRU diinisialisasi dan dijalankan di GPU. Jika multi-GPU diaktifkan, model dibungkus dengan DataParallel. Optimizer yang digunakan adalah AdamW, dan scheduler linear digunakan untuk mengatur learning rate selama pelatihan.
- Perhitungan Loss: Digunakan CrossEntropyLoss dengan ignore_index=-100 untuk menghindari penalti pada token padding saat menghitung loss.
- Pelatihan per Epoch: Untuk setiap epoch, model dijalankan dalam mode training, dan setiap batch diproses untuk menghitung loss, backward propagation, dan update parameter. Rata-rata loss dicetak di akhir setiap epoch.
- Penyimpanan Model: Checkpoint disimpan setiap epoch dan model akhir disimpan di MODEL_DIR, bersama dengan tokenizer agar dapat digunakan kembali pada tahap inferensi.

In [7]:
model = GRUGenerator(len(tokenizer), EMBED_DIM, HIDDEN_DIM, NUM_LAYERS).to(DEVICE)
if MULTI_GPU:
    model = nn.DataParallel(model)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(ignore_index=-100)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0,
                          num_training_steps=EPOCHS * len(train_loader))

os.makedirs(MODEL_DIR, exist_ok=True)
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False)
    for batch in loop:
        input_ids = batch['input_ids'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        optimizer.zero_grad()
        logits = model(input_ids)
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {avg_loss:.4f}")
    torch.save(model.module.state_dict() if MULTI_GPU else model.state_dict(),
               f"{MODEL_DIR}/checkpoint_epoch{epoch+1}.pt")
# Save final
torch.save(model.module.state_dict() if MULTI_GPU else model.state_dict(), f"{MODEL_DIR}/final.pt")
tokenizer.save_pretrained(MODEL_DIR)

Epoch 1/2:   0%|          | 0/5000 [00:00<?, ?it/s]

Epoch 1/2, Loss: 6.9887


Epoch 2/2:   0%|          | 0/5000 [00:00<?, ?it/s]

Epoch 2/2, Loss: 6.8934


('/kaggle/working/gru_model/tokenizer_config.json',
 '/kaggle/working/gru_model/special_tokens_map.json',
 '/kaggle/working/gru_model/vocab.json',
 '/kaggle/working/gru_model/merges.txt',
 '/kaggle/working/gru_model/added_tokens.json',
 '/kaggle/working/gru_model/tokenizer.json')

## Evaluation
Kode ini melakukan evaluasi batch-wise terhadap model GRU generatif untuk tugas QnA dengan strategi top-k sampling dan menghitung kualitas output menggunakan metrik ROUGE. Prosesnya mencakup pemrosesan prompt sekaligus batch, caching hidden state GRU, sampling token berikutnya secara random dari k token teratas, dan menghentikan generasi begitu semua sampel mengeluarkan token akhir (eos_token_id) 

In [None]:
metric = evaluate.load('rouge')
model.eval()
preds, refs = [], []
top_k = 100

for batch in tqdm(val_loader, desc="Evaluasi Batch"):
    input_ids = batch['input_ids'].to(DEVICE)      # [B, L]
    labels    = batch['labels'].to(DEVICE)

    B = input_ids.size(0)
    generated = torch.full((B, 0), tokenizer.pad_token_id,
                           device=DEVICE, dtype=torch.long)
    finished = torch.zeros(B, dtype=torch.bool, device=DEVICE)

    with torch.no_grad():
        emb = model.module.embedding(input_ids)
        _, hidden = model.module.gru(emb)

    last_tokens = input_ids[:, -1].unsqueeze(1)    

    for _ in range(MAX_LEN):
        logits, hidden = model.module.forward_step(last_tokens, hidden)
        topk_logits, topk_idx = torch.topk(logits, top_k, dim=-1)
        probs   = F.softmax(topk_logits, dim=-1)
        sampled = torch.multinomial(probs, num_samples=1)
        next_tokens = topk_idx.gather(-1, sampled)   # [B,1]

        generated = torch.cat([generated, next_tokens], dim=1)
        eos_mask  = next_tokens.squeeze(1) == tokenizer.eos_token_id
        finished |= eos_mask
        if finished.all():                          
            break

        last_tokens = next_tokens

    for i in range(B):
        gen_ids = generated[i].tolist()
        if tokenizer.eos_token_id in gen_ids:
            gen_ids = gen_ids[:gen_ids.index(tokenizer.eos_token_id)]
        preds.append(tokenizer.decode(gen_ids, skip_special_tokens=True))
        refs.append(tokenizer.decode(labels[i][labels[i]!=-100].cpu(),
                                     skip_special_tokens=True))

# 5. Hitung ROUGE
res = metric.compute(predictions=preds, references=refs)
print("ROUGE:", res)


Evaluasi Batch:   0%|          | 0/1250 [00:00<?, ?it/s]

ROUGE: {'rouge1': 0.2004913654522985, 'rouge2': 0.014785439582633854, 'rougeL': 0.13716848946214236, 'rougeLsum': 0.13718423050049133}


## Testing
Kode ini bertujuan untuk menguji kemampuan model GRU dalam menghasilkan jawaban berdasarkan pertanyaan dan konteks yang diberikan, menggunakan teknik top-k sampling untuk meningkatkan variasi dalam hasil generasi.

In [None]:
sample = val_df.iloc[3]
prompt = f"<question> {sample['question']} <context> {sample['content'][:300]} <answer>"

inputs = tokenizer(
    prompt,
    return_tensors='pt',
    padding='max_length',
    max_length=MAX_LEN
).to(DEVICE)

model.eval()
generated_ids = []
input_ids = inputs['input_ids']
# Using top-k sampling for variety of token
top_k = 100
for _ in range(MAX_LEN):
    with torch.no_grad():
        logits = model(input_ids)
    next_token_logits = logits[:, -1, :]
    
    topk_logits, topk_indices = torch.topk(next_token_logits, k=top_k, dim=-1)
    probs = F.softmax(topk_logits, dim=-1)
    sampled_index = torch.multinomial(probs, num_samples=1)
    next_token_id = topk_indices.gather(-1, sampled_index)
    generated_ids.append(next_token_id.item())
    if next_token_id[0].item() == tokenizer.eos_token_id:
        break

print("Prompt: ", prompt)

answer = tokenizer.decode(generated_ids, skip_special_tokens=True)
print("\nGenerated Answer:", answer)

print("\nReal Answer:", sample['answer'])

Prompt:  <question> What are some of the key publications and contributions of Sudha R. Shenoy in the field of economic history? <context> # About The Author

![553_Image_0.Png](553_Image_0.Png)  
Sudha R. Shenoy (1943–2008) was a lecturer in economic history at the University of Newcastle, Australia. She has held visiting posts at California State University, Hayward; Ohio University, Athens; George Mason University; and the Mises Ins <answer>

Generated Answer:  and for into of, on, of it them. the may, market leading a and. the of, thes. in between. and value to time are, and, ins prices- from to and its in of and the. is the be the than. individual and of's the as of,. of interest in capital and, production. and system, that this that and Additionally this to.. they. and may. with and is significant a also than economic to is or it the for social This the and. have that. in prices and the as not for with is the which, for.', or their in that ' significant and they that. are society