# Import Library

In [1]:
import numpy as np
import pandas as pd

from tokenizers import Tokenizer

import json
import os
import math
from tqdm.auto import tqdm

from torch.nn.modules.transformer import _generate_square_subsequent_mask
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

# Load DaTokenizer

In [2]:
tokenizer = Tokenizer.from_file("/kaggle/input/datokenizer/other/default/1/tokenizer.json")

# Sample use
encoded = tokenizer.encode("Viêm nang lông thường thấy khi da bị ...")
decoded = tokenizer.decode(encoded.ids, skip_special_tokens=False)

print("Encoded IDs:", encoded.ids)
print("Decoded Text:", decoded)

Encoded IDs: [1559, 1647, 1827, 999, 1267, 990, 997, 1025, 1821]
Decoded Text: Viêm nang lông thường thấy khi da bị ...


# Load CSV

In [3]:
df_train = pd.read_csv("/kaggle/input/dataset-for-dagpt/pretrain_data.csv")
df_train.head(2)

Unnamed: 0,text
0,"Theo báo cáo của Tổ chức Y tế Thế giới, tim mạ..."
1,Các serovars khác nhau của Chlamydia trachomat...


In [4]:
df_test = pd.read_csv("/kaggle/input/dataset-for-dagpt/pretrain_test.csv")
df_test.head(2)

Unnamed: 0,text
0,"Bệnh ghẻ, một căn bệnh da liễu khá phổ biến, t..."
1,Sinh thiết là một xét nghiệm quan trọng trong ...


# Preprocessing Data & Building Dataset

Data đã được xử lí một số trước đó, nên hiện tại ta chỉ thay kí tự xuống hàng thành ký tự đặc biệt `NEWLINE` và tokenize

In [5]:
def preprocessing_str(s):
    return tokenizer.encode(s.replace("\n","[NEWLINE]")).ids

In [6]:
class MyDataset(Dataset):
    def __init__(self, df, max_len=256):
        self.data = []
        self.label = []
        for text in df['text']:
            checked = True
            tokens = preprocessing_str(text)
            while len(tokens) <= max_len:
                checked = False
                tokens = tokens + [tokenizer.token_to_id("[PAD]")]
            for i in range(0, len(tokens)- max_len, max_len):
                self.data.append(tokens[i:i+max_len])
                self.label.append(tokens[i+1:i+1+max_len])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx], dtype=torch.long), torch.tensor(self.label[idx], dtype=torch.long)

In [7]:
data_train = MyDataset(df_train)
len(data_train)

83805

# Model Setup

In [8]:
class TokenEmbedding(nn.Module):
    
    def __init__(self, vocab_size:int, d_model:int):
        super(TokenEmbedding, self).__init__()
        self.embedding_layer = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model
    
    def forward(self, tokens: torch.Tensor):
        return self.embedding_layer(tokens.long()) * math.sqrt(self.d_model)
    
class PositionalEncoding(nn.Module):
    
    def __init__(self, max_length_seq:int, d_model:int, dropout_rate:float):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, d_model, 2)* math.log(10000) / d_model)
        pos = torch.arange(0, max_length_seq).reshape(max_length_seq, 1)
        PE = torch.zeros((max_length_seq, d_model))
        PE[:, 0::2] = torch.sin(pos * den)
        PE[:, 1::2] = torch.cos(pos * den)
        PE = PE.unsqueeze(-2)
        self.register_buffer("PE", PE)
        self.dropout_layer = nn.Dropout(dropout_rate)
        
    def forward(self, token_embedding: torch.Tensor):
        return self.dropout_layer(token_embedding + self.PE[:token_embedding.size(0), :])
    
class DecoderOnlyBlock(nn.Module):
    
    def __init__(self, d_model, num_heads, d_ff,dropout_rate=0.0, batch_first=True):
        super(DecoderOnlyBlock, self).__init__()
        
        # Masked Multihead Attention
        self.mmha = nn.MultiheadAttention(embed_dim=d_model, num_heads=num_heads, dropout=dropout_rate, batch_first=batch_first)
        
        # Normalize 1
        self.norm1 = nn.LayerNorm(normalized_shape= d_model, eps=1e-05, elementwise_affine=True, bias=True)
        
        # Feed forward
        self.feed_forward = nn.Sequential(
            nn.Linear(in_features=d_model, out_features=d_ff, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=d_ff, out_features=d_model, bias=True),
        )
        
        # Normalize 2
        self.norm2 = nn.LayerNorm(normalized_shape= d_model, eps=1e-05, elementwise_affine=True, bias=True)
        
        # Dropout
        self.dropout = nn.Dropout(dropout_rate)
        
        
    def forward(self, x, attn_mask=None, key_padding_mask=None):
        # Block 1
        mmha_output, _ = self.mmha(query=x,
                                   key=x,
                                   value=x,
                                   key_padding_mask=key_padding_mask, need_weights=False, 
                                   attn_mask=attn_mask, 
                                   average_attn_weights=True, is_causal=False)
        
        x = self.norm1(x + self.dropout(mmha_output))
        
        # Block 2
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        
        return x

In [9]:
class DaGPT(nn.Module):
    def __init__(self, d_model, vocab_size, max_len_seq, num_heads, d_ff, num_of_block, dropout_rate=0.0, batch_first=True):
        super(DaGPT, self).__init__()
        self.token_embedding = TokenEmbedding(vocab_size, d_model)
        self.position_embedding = PositionalEncoding(max_len_seq, d_model, dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)
        
        self.decoder_blocks = nn.ModuleList([
            DecoderOnlyBlock(d_model=d_model, num_heads=num_heads, d_ff=d_ff,dropout_rate=dropout_rate, batch_first=batch_first) 
            for _ in range(num_of_block)
        ])
        
        self.linear_out = nn.Linear(in_features=d_model, out_features=vocab_size, bias=True)
        
    def forward(self, x, attn_mask=None, key_padding_mask=None):
        x = self.token_embedding(x)
        x = self.position_embedding(x)
        
        for block in self.decoder_blocks:
            x = block(x, attn_mask=attn_mask, key_padding_mask=key_padding_mask)
            
        logits = self.linear_out(x)
        
        return logits  

In [10]:
model = DaGPT(d_model=768,
               vocab_size=tokenizer.get_vocab_size(),
               max_len_seq=256,
               num_heads=16,
               d_ff=3072, 
               num_of_block=24, 
               dropout_rate=0.1, 
               batch_first=True)

total_params = sum(p.numel() for p in model.parameters())
print(f"Tổng số tham số: {total_params}")

Tổng số tham số: 208100494


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

# Load model weight

In [12]:
model.load_state_dict(torch.load('/kaggle/input/dagpt-pretrain-epoch-1-5/model_weights.pth'))

  model.load_state_dict(torch.load('/kaggle/input/dagpt-pretrain-epoch-1-5/model_weights.pth'))


<All keys matched successfully>

# Greedy Inference Function

In [13]:
def greedy_inference(prompt, max_len_seq=256, max_len_gen=50, temperature=1):
    model.eval()
    result = []
    with torch.no_grad():
        tokens_idx = preprocessing_str(prompt)
        for i in range(max_len_gen):
            tgt = torch.tensor(tokens_idx, dtype=torch.long).unsqueeze(0).to(device)
            logits = model(tgt)
            probs = torch.softmax(logits[:,-1,:] / temperature, dim=-1)
            next_token = torch.argmax(probs, dim=-1).item()
            result.append(next_token)
            if len(tokens_idx) < max_len_seq:
                tokens_idx = tokens_idx+[next_token] 
            else:
                tokens_idx = tokens_idx[1:]+[next_token]
        return tokenizer.decode(result)

In [14]:
greedy_inference("Chốc là")

'một loại nấm gây nấm Candida . Nấm tóc , nấm tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc tóc'

# Model Training

In [15]:
batch_size = 20
max_len = 256
epochs = 5

# DataLoader
train_loader = DataLoader(data_train, batch_size=batch_size, shuffle=True)
len_loader = len(train_loader)  # Tính độ dài của DataLoader
total_steps = epochs * len_loader

# Optimize
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-8)

# Loss 
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id("[PAD]"))

In [16]:
# Training
for epoch in range(epochs):
    model.train()

    total_loss = 0
    steps = 0

    for i, batch in tqdm(enumerate(train_loader), desc=f"EPOCH {epoch+1}/{epochs}"):
        data, label = batch
        data, label = data.to(device), label.to(device)
        data_padding_mask = (data == tokenizer.token_to_id("[PAD]")).to(torch.float)
        data_mmha_mask = _generate_square_subsequent_mask(data.size(1)).to(device)
        
        optimizer.zero_grad()

        logits = model(data, data_mmha_mask, data_padding_mask)
        
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        label = label.view(B*T)

        loss = criterion(logits, label)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        steps += 1
    
    final_loss = total_loss / steps
    print(f"Final Loss: {final_loss}")
    print("Test Greedy Inference: Theo báo cáo của | ",end="")
    print(greedy_inference("Theo báo cáo của"))

EPOCH 1/5: 0it [00:00, ?it/s]

Final Loss: 3.483508892771567
Test Greedy Inference: Theo báo cáo của | Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế . 1 . 2 . 2 . 2 . 2 . 2 . 2 . 2 . 2 . 4 . 4 .


EPOCH 2/5: 0it [00:00, ?it/s]

Final Loss: 3.3697676788335538
Test Greedy Inference: Theo báo cáo của | Bộ Y tế ban hành ban hành ban hành nghề dược , dược , dược , dược , dược , dược , dược , dược , dược liệu , dược liệu , dược liệu , dược liệu , dược liệu , dược liệu , dược liệu , dược liệu ,


EPOCH 3/5: 0it [00:00, ?it/s]

Final Loss: 3.274896780137827
Test Greedy Inference: Theo báo cáo của | Bộ Y tế ban hành Thông tư này , Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ


EPOCH 4/5: 0it [00:00, ?it/s]

Final Loss: 3.194199988222498
Test Greedy Inference: Theo báo cáo của | Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế . 1 . 1 . 1 . 1 . 1


EPOCH 5/5: 0it [00:00, ?it/s]

Final Loss: 3.1231513422731854
Test Greedy Inference: Theo báo cáo của | Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế , Bộ Y tế . 1 . 1 . 1 . 1 . 1 . 1 . 2 . 2 . 2


# Save model weights

In [17]:
torch.save(model.state_dict(), '/kaggle/working/model_weights.pth')