In [None]:
from google.colab import drive
# Mount Google Drive into the Colab file system at /content/drive.
drive.mount('/content/drive')
# Paths used later in this notebook live under /content/drive/MyDrive/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install datasets



**1. Import Dependencies**

In [None]:
import logging
import os

import matplotlib.pyplot as plt
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BartTokenizer, BartForConditionalGeneration

# Data Loading and Preprocessing

In [None]:
data_path = '/content/drive/MyDrive/ChatBox/data/processed_output.csv'  # Update with your data path
df = pd.read_csv(data_path)

# Preview the first rows. display() is required because a bare expression is
# only rendered when it is the last statement of a cell.
display(df.head())

# The 'Content' column is assumed to hold the conversation text.
# Missing rows are dropped and values coerced to str so that the tokenizer
# only ever receives strings (NaN cells would otherwise arrive as floats).
all_texts = df['Content'].dropna().astype(str).tolist()

# Hold out 10% of the texts for validation; the fixed seed keeps the split reproducible.
train_texts, val_texts = train_test_split(all_texts, test_size=0.1, random_state=42)


# Tokenization

In [None]:
# Load the tokenizer that belongs to the facebook/bart-base checkpoint.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

# Both splits are encoded with identical settings: truncate to at most 512
# tokens and pad every sequence of a split to that split's longest sequence.
_encode_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **_encode_kwargs)
val_encodings = tokenizer(val_texts, **_encode_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Prepare Data Loaders

In [None]:
class ConversationDataset(Dataset):
    """Map-style dataset over tokenizer output.

    Args:
        encodings: Mapping from field name (e.g. ``'input_ids'``,
            ``'attention_mask'``) to a sequence with one entry per example.
            Any mapping with ``.items()`` and key access works, so both a
            tokenizer result and a plain ``dict`` are accepted.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per encoding field."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples (length of the ``'input_ids'`` field)."""
        # Key access instead of attribute access, so plain dicts work as well.
        return len(self.encodings['input_ids'])

# Wrap each tokenized split in a ConversationDataset.
train_dataset, val_dataset = (
    ConversationDataset(split_encodings)
    for split_encodings in (train_encodings, val_encodings)
)

# Batch the datasets. Only the training loader is shuffled; adjust batch_size
# to whatever fits in GPU memory.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=4)
val_loader = DataLoader(val_dataset, batch_size=4)

# Initialize the BART Model

In [None]:
# Select the compute device: GPU when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained BART checkpoint and move it to the selected device.
# use_cache=False is forwarded to from_pretrained together with the checkpoint name.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base', use_cache=False).to(device)

# Switch to training mode. As the last expression of the cell, this also
# displays the model architecture.
model.train()

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): Laye

# Define Training Loop

In [None]:
# Training hyper-parameters
epochs = 3               # number of passes over the training set
logging_steps = 50       # print the training loss every N batches
accumulation_steps = 1   # batches to accumulate before each optimizer update

# Gradient scaler for mixed-precision training. It is only enabled when CUDA
# is available, so CPU runs do not request a scaler that cannot be used.
scaler = GradScaler(enabled=torch.cuda.is_available())



In [None]:
# Step 6: Define Training Loop

# Set up the optimizer and the learning-rate scheduler.
# torch.optim.AdamW is used in place of the deprecated transformers.AdamW;
# weight_decay=0.0 matches the default of the class it replaces.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)

# The scheduler advances once per optimizer update, not once per batch, so the
# batch count is divided by accumulation_steps (rounded up, so the schedule
# never ends before training does).
updates_per_epoch = -(-len(train_loader) // accumulation_steps)
total_steps = updates_per_epoch * epochs  # total number of optimizer updates

# Linear decay from the initial learning rate to zero, without warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

def compute_loss(logits, labels):
    """Token-level cross-entropy between ``logits`` and ``labels``.

    Args:
        logits: Float tensor whose last dimension indexes the classes
            (vocabulary entries); all leading dimensions are flattened.
        labels: Integer tensor with one class index per logit row, i.e. the
            same shape as ``logits`` without its last dimension. Positions
            equal to -100 are ignored (``CrossEntropyLoss`` default).

    Returns:
        Scalar tensor holding the mean loss over the non-ignored positions.
    """
    loss_fct = torch.nn.CrossEntropyLoss()
    # The class count is read from the logits themselves, so the function does
    # not depend on a global model object; reshape also accepts
    # non-contiguous tensors.
    return loss_fct(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))



# Train the Model

In [None]:
# Training loop
use_amp = device.type == 'cuda'   # mixed precision is only used on CUDA
num_batches = len(train_loader)
batch_losses = []                 # un-scaled training loss of every batch
epoch_losses = []                 # mean training loss of every epoch

for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')
    model.train()
    optimizer.zero_grad(set_to_none=True)
    epoch_iterator = tqdm(train_loader, desc="Iteration")
    running_loss = 0.0

    for step, batch in enumerate(epoch_iterator):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The targets are the inputs themselves, but padded positions are set
        # to -100 so they are excluded from the loss instead of rewarding the
        # model for predicting padding.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        with autocast(enabled=use_amp):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # Divide so that gradients accumulated over several batches average out.
            loss = outputs.loss / accumulation_steps

        scaler.scale(loss).backward()

        # Update after every full accumulation window, and also on the last
        # batch so gradients of a trailing partial window are not discarded.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % accumulation_steps == 0 or is_last_batch:
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad(set_to_none=True)

        # Record and report the un-scaled loss so that the numbers do not
        # depend on accumulation_steps.
        step_loss = outputs.loss.item()
        batch_losses.append(step_loss)
        running_loss += step_loss

        if step % logging_steps == 0:
            print(f'Step {step}: Loss {step_loss}')

    epoch_losses.append(running_loss / max(num_batches, 1))

    # Evaluate on the validation set
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = input_ids.masked_fill(attention_mask == 0, -100)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            eval_loss += outputs.loss.item()

    # max(..., 1) avoids a ZeroDivisionError when the validation loader is empty.
    eval_loss /= max(len(val_loader), 1)
    print(f'Validation Loss: {eval_loss}')

    # # Optional memory clean-up between epochs
    # gc.collect()
    # torch.cuda.empty_cache()

Epoch 1/3


Iteration:   1%|▏         | 1/67 [00:59<1:05:57, 59.96s/it]

Step 0: Loss 0.34540289640426636


Iteration:  76%|███████▌  | 51/67 [37:32<11:48, 44.27s/it]

Step 50: Loss 0.0022737684193998575


Iteration: 100%|██████████| 67/67 [49:18<00:00, 44.16s/it]


Validation Loss: 0.06065468109829908
Epoch 2/3


Iteration:   1%|▏         | 1/67 [00:45<49:54, 45.37s/it]

Step 0: Loss 0.07900087535381317


Iteration:  76%|███████▌  | 51/67 [37:08<11:36, 43.53s/it]

Step 50: Loss 0.002710364991798997


Iteration:  79%|███████▉  | 53/67 [38:33<10:02, 43.01s/it]

# Save Model

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
output_dir = '/content/drive/MyDrive/ChatBox/model/fine_tuned_bart'
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

# Generate Response

In [None]:
# Maximum number of tokens for both the encoded input and the generated output
max_length = 512

def generate_response(input_text: str, do_sample: bool = False) -> str:
    """
    Generate a response for the input text.

    Args:
        input_text (str): Input text for generating the response.
        do_sample (bool, optional): When True, sample with temperature, top-k
            and top-p on top of beam search. When False (default), run plain
            beam search; the sampling settings are then omitted because they
            have no effect without sampling.

    Returns:
        str: The generated response by the Chatbot.
    """
    # Encode the input text; calling the tokenizer directly also yields the
    # attention mask, which is handed to generate() explicitly.
    encoded = tokenizer(input_text, return_tensors='pt',
                        truncation=True, max_length=max_length)
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    generation_kwargs = dict(
        max_length=max_length,
        num_beams=5,             # Beam search width
        no_repeat_ngram_size=2,  # Avoid repeating n-grams
        num_return_sequences=1,  # Number of sequences to return
        early_stopping=True,
        use_cache=True,          # The model was loaded with use_cache=False; re-enable caching for decoding
    )
    if do_sample:
        generation_kwargs.update(
            do_sample=True,
            temperature=0.7,     # Control randomness
            top_k=50,            # Limit sampling pool
            top_p=0.95,          # Nucleus sampling
        )

    # Generate in eval mode (dropout off) and restore the previous mode
    # afterwards, so calling this between training steps leaves the model as it was.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model.generate(
                input_ids,
                attention_mask=attention_mask,
                **generation_kwargs,
            )
    finally:
        if was_training:
            model.train()

    # Decode the generated output
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Quick manual check: run one prompt through generate_response and show both sides.
input_text = "dấu hiệu bệnh nhiễm trùng"
response = generate_response(input_text)
for caption, text in (("Input:", input_text), ("Response:", response)):
    print(caption, text)

# Plot Training Performance

In [None]:
"""
## Plot Training Performance

Visualize the loss evolution over epochs and batches.
"""

# Placeholder loss histories so that the plotting code can be demonstrated.
# Histories that already exist in the kernel (for example, ones recorded while
# training) are kept; the placeholders are only used when none exist.
if 'epoch_losses' not in globals():
    epoch_losses = [0.9, 0.7, 0.5, 0.3]
if 'batch_losses' not in globals():
    batch_losses = [0.95, 0.85, 0.75, 0.65, 0.6, 0.55, 0.5, 0.45]

def _plot_loss_curve(values, *, xlabel, ylabel, title, marker, label, color=None):
    """Draw one loss curve on its own 8x6 figure, with 1-based x positions."""
    fig, ax = plt.subplots(figsize=(8, 6))
    line_style = {'marker': marker, 'label': label}
    if color is not None:
        # Only override the color when one was requested; otherwise the
        # default color cycle is used.
        line_style['color'] = color
    ax.plot(range(1, len(values) + 1), values, **line_style)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True)
    ax.legend()
    plt.show()


def plot_performance(show_epoch_loss: bool = True, show_batch_loss: bool = True,
                     epoch_loss_values=None, batch_loss_values=None):
    """
    Plot the training performance.

    Args:
        show_epoch_loss (bool, optional): Whether to plot the epoch-wise average loss. Defaults to True.
        show_batch_loss (bool, optional): Whether to plot the batch-wise loss. Defaults to True.
        epoch_loss_values (sequence of float, optional): Losses to plot per epoch.
            Defaults to the notebook-level ``epoch_losses``.
        batch_loss_values (sequence of float, optional): Losses to plot per batch.
            Defaults to the notebook-level ``batch_losses``.
    """
    if show_epoch_loss:
        # The notebook-level list is looked up only when this plot is requested
        # and no explicit data was passed in.
        values = epoch_losses if epoch_loss_values is None else epoch_loss_values
        _plot_loss_curve(values, xlabel="Epoch", ylabel="Average Loss",
                         title="Loss Evolution over Epochs",
                         marker='o', label='Epoch Loss')

    if show_batch_loss:
        values = batch_losses if batch_loss_values is None else batch_loss_values
        _plot_loss_curve(values, xlabel="Batch", ylabel="Loss",
                         title="Loss Evolution over Batches",
                         marker='x', label='Batch Loss', color='orange')

# Render both curves: the per-epoch averages and the per-batch losses.
plot_performance(show_epoch_loss=True, show_batch_loss=True)