Dataset Loading

In [2]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset('csv', data_files='../dataset/QA_data.csv')
print(dataset)
dataset = dataset['train']

DatasetDict({
    train: Dataset({
        features: ['Question', 'Context', 'Answer', 'Answer_Start', 'Answer_End'],
        num_rows: 197379
    })
})


In [3]:
def remove_empty_rows(example):
    return all([
        example['Question'] and example['Question'].strip(),
        example['Context'] and example['Context'].strip(),
        example['Answer'] and example['Answer'].strip()
    ])

dataset = dataset.filter(remove_empty_rows)

In [4]:
dataset = dataset.shuffle(seed=42).select(range(len(dataset) // 1000))

In [5]:
dataset = dataset.train_test_split(test_size=0.2)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Question', 'Context', 'Answer', 'Answer_Start', 'Answer_End'],
        num_rows: 127
    })
    test: Dataset({
        features: ['Question', 'Context', 'Answer', 'Answer_Start', 'Answer_End'],
        num_rows: 32
    })
})


Formatting for T5

Dynamic Padding and Collation

In [2]:
from torch.utils.data import DataLoader
import torch
def collate_fn(batch):
    # Dynamically pad sequences
    input_ids = [item['input_ids'] for item in batch]
    attention_masks = [item['attention_mask'] for item in batch]
    labels = [item['labels'] for item in batch]
    return {
        'input_ids': torch.tensor(input_ids, dtype=torch.long),
        'attention_mask': torch.tensor(attention_masks, dtype=torch.long),
        'labels': torch.tensor(labels, dtype=torch.long)
    }

# Create DataLoader
train_loader = DataLoader(processed_dataset['train'], batch_size=8, collate_fn=collate_fn)

NameError: name 'processed_dataset' is not defined

 Training Pipeline

Model Initialization

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")  
model = AutoModelForSeq2SeqLM.from_pretrained("VietAI/vit5-base")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'torch' is not defined

Optimizer and Scheduler

In [None]:
from torch.optim import AdamW
from transformers import get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=5, num_training_steps=40
)

Loss Function

In [None]:
from torch.nn import CrossEntropyLoss

# Loss function
loss_fn = CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

Training Script

In [10]:
from torch.utils.data import DataLoader
from tqdm import tqdm

# Training loop
model.train()
for epoch in range(3):  # Adjust epochs as needed
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device)
        )
        loss = outputs.loss
        loss.backward()
        
        # Gradient accumulation
        optimizer.step()
        lr_scheduler.step()
        
        # Update progress bar
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

: 

Distributed and Mixed-Precision Training

Using Accelerators

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.optim import AdamW
from transformers import get_scheduler
from torch.utils.data import DataLoader
from tqdm import tqdm
from datasets import load_dataset

# Load dataset
dataset = load_dataset('csv', data_files='/kaggle/input/dataset/add_answer_data.csv')['train']

def remove_empty_rows(example):
    return all([
        example['Question'] and example['Question'].strip(),
        example['Context'] and example['Context'].strip(),
        example['Answer'] and example['Answer'].strip()
    ])

dataset = dataset.filter(remove_empty_rows)
dataset = dataset.shuffle(seed=42).select(range(len(dataset) // 100))
dataset = dataset.train_test_split(test_size=0.2)

# Tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base") 
model = AutoModelForSeq2SeqLM.from_pretrained("VietAI/vit5-base")

# Move model to GPU and wrap in DataParallel
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = nn.DataParallel(model)
model = model.to(device)

# Preprocessing
def preprocess(example):
    inputs = f"question: {example['Question']} context: {example['Context']}"
    targets = example['Answer']
    tokenized_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    tokenized_targets = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    return {
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask'],
        'labels': tokenized_targets['input_ids']
    }

processed_dataset = dataset.map(preprocess)

# Collate function
def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_masks = [item['attention_mask'] for item in batch]
    labels = [item['labels'] for item in batch]
    return {
        'input_ids': torch.tensor(input_ids, dtype=torch.long),
        'attention_mask': torch.tensor(attention_masks, dtype=torch.long),
        'labels': torch.tensor(labels, dtype=torch.long)
    }

train_loader = DataLoader(processed_dataset['train'], batch_size=8, collate_fn=collate_fn)

# Optimizer & scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=500, num_training_steps=10000)

loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Training loop
model.train()
for epoch in range(3):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss.mean() 
        loss.backward()

        optimizer.step()
        lr_scheduler.step()

        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())


In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model_path = "./t5-base-qa"  # folder đã unzip

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()


  from .autonotebook import tqdm as notebook_tqdm


T5ForConditionalGeneration(
  (shared): Embedding(36096, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(36096, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [7]:
def generate_answer(question, context):
    input_text = f"question: {question} context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    output = model.generate(**inputs, max_length=128, num_beams=4)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test thử
question = "Mang thai tử cung bị hở không dùng thuốc ngay có sao không?"
context = """ """
print(generate_answer(question, context))


Mang thai tử cung bị hở không dùng thuốc ngay có sao không? context: Mang thai tử cung bị hở không dùng thuốc ngay có sao không?


In [22]:
import google.generativeai as genai

# Đặt API Key
genai.configure(api_key="AIzaSyAPJ2Lov11dutVcpITE_61GiOn8ZXVSu14")

# Tạo model Gemini (Gemini 2.0 Flash)
model = genai.GenerativeModel("gemini-2.0-flash")

def rewrite_question_with_gemini(history, question):
    prompt = f"""
    Bạn là trợ lý AI. Dựa vào lịch sử hội thoại và câu hỏi hiện tại, hãy thực hiện 2 nhiệm vụ:

    1. Viết lại câu hỏi sao cho đầy đủ, rõ ràng, có thể hiểu được mà không cần xem lại lịch sử.
    2. Phân loại câu hỏi đó vào một trong các nhóm sau:
       - y tế
       - realtime (thời gian thực, ví dụ: thời tiết, giá cả, lịch sự kiện,...)
       - khác

    Trả lời kết quả theo định dạng JSON như sau:
    {{
        "rewritten_question": "<câu hỏi đã viết lại>",
        "category": "<y tế | realtime | khác>"
    }}

    Lịch sử hội thoại: "{history}"
    Câu hỏi hiện tại: "{question}"
    """

    response = model.generate_content(prompt)
    return response.text.strip()


In [26]:
history = "OKe, cảm ơn bạn"
question = "bạn có biết hôm nay là ngày mấy không"

print(rewrite_question_with_gemini(history, question))


```json
{
  "rewritten_question": "Hôm nay là ngày tháng năm nào?",
  "category": "realtime"
}
```
