In [1]:
import pandas as pd
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch import nn
from torchcrf import CRF

import torch.nn.functional as F
from typing import Optional
from transformers import AutoModel, AutoTokenizer

import sys, os

project_root = os.path.abspath(os.path.join(os.getcwd(), "../../.."))
if project_root not in sys.path:
    sys.path.append(project_root)

from shared_functions.gg_sheet_drive import *

### Support Modules

In [16]:
class PhoBertEmbedding(nn.Module):
    def __init__(self, model_name="vinai/phobert-base", device=None, freeze=True, max_length = 85):
        super().__init__()
        
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        
        self.max_length = max_length
        
        if freeze:
            for p in self.model.parameters():
                p.requires_grad = False

    def encode(self, texts):
        toks = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length = self.max_length)
        input_ids = toks["input_ids"].to(self.device)
        attention_mask = toks["attention_mask"].to(self.device)
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        # last_hidden_state: (batch, seq_len, hidden)
        return outputs.last_hidden_state, attention_mask, toks
    
    #only different by name but for Module usage
    def forward(self, texts):
        toks = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length = self.max_length)
        input_ids = toks["input_ids"].to(self.device)
        attention_mask = toks["attention_mask"].to(self.device)

        with torch.set_grad_enabled(not self.model.training):
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)

        return outputs.last_hidden_state, attention_mask, toks

In [3]:
class SimpleTokenizer:
    def __init__(self):
        self.pad_token = "<pad>"
        self.cls_token = "<cls>"
        self.sep_token = "<sep>"
        self.unk_token = "<unk>"
        self.pad_token_id = 0
        self.cls_token_id = 1
        self.sep_token_id = 2
        self.unk_token_id = 3

    def tokenize(self, text):
        return text.split()

    def encode(self, text, max_length=256, padding=True, truncation=True, return_tensors=None):
        tokens = [self.cls_token] + text.split()[: max_length - 2] + [self.sep_token]
        input_ids = list(range(len(tokens)))  # dummy token ids
        attention_mask = [1] * len(input_ids)

        if padding and len(input_ids) < max_length:
            pad_len = max_length - len(input_ids)
            input_ids += [self.pad_token_id] * pad_len
            attention_mask += [0] * pad_len

        if return_tensors == "pt":
            import torch
            input_ids = torch.tensor([input_ids])
            attention_mask = torch.tensor([attention_mask])

        return {"input_ids": input_ids, "attention_mask": attention_mask}

    def __call__(self, text, **kwargs):
        return self.encode(text, **kwargs)

In [4]:
use_phobert = True

if use_phobert:
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')
    print('PhoBERT enabled. Using PhoBERTEmbedder()')
else:
    tokenizer = SimpleTokenizer()
    phobert = None
    print("PhoBERT disabled. Using SimpleTokenizer().")

PhoBERT enabled. Using PhoBERTEmbedder()


In [5]:
class MaskedAttention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, lstm_out, mask):
        # lstm_out: (batch, seq_len, hidden_dim)
        scores = self.attn(lstm_out).squeeze(-1)
        scores = scores.masked_fill(mask == 0, -1e9)
        attn_weights = torch.softmax(scores, dim=1).unsqueeze(-1)
        context = torch.sum(lstm_out * attn_weights, dim=1, keepdim=True)
        return lstm_out + context.expand_as(lstm_out)

In [19]:
class BiLSTM_CRF(nn.Module):
    def __init__(self, tagset_size, hidden_dim=256, num_layers=2, dropout=0.25,
                 phobert_model_name="vinai/phobert-base-v2", freeze_phobert=True, max_length=85):
        super().__init__()

        # Use your PhoBertEmbedding wrapper
        self.phobert_embedding = PhoBertEmbedding(
            model_name=phobert_model_name,
            freeze=freeze_phobert,
            max_length=max_length
        )

        phobert_dim = self.phobert_embedding.model.config.hidden_size  # usually 768

        self.embedding_dropout = nn.Dropout(dropout)

        # BiLSTM over PhoBERT embeddings
        self.bilstm = nn.LSTM(
            input_size=phobert_dim,
            hidden_size=hidden_dim // 2,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

        self.layer_norm = nn.LayerNorm(hidden_dim)

        # Optional attention mechanism (define separately)
        self.attention = MaskedAttention(hidden_dim)

        self.hidden_fc = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        self.fc = nn.Linear(hidden_dim, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)

    def forward(self, texts=None, input_ids=None, attention_mask=None, tags=None):
        """
        You can either:
          - Pass raw 'texts' (strings or list of strings)
          - OR pass pre-tokenized 'input_ids' and 'attention_mask'
        """

        # Case 1: raw texts â€” use PhoBertEmbedding forward()
        if texts is not None:
            embeddings, attention_mask, toks = self.phobert_embedding(texts)

        # Case 2: already tokenized tensors
        else:
            outputs = self.phobert_embedding.model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
            embeddings = outputs.last_hidden_state

        embeddings = self.embedding_dropout(embeddings)

        # LSTM layer
        lstm_out, _ = self.bilstm(embeddings)
        lstm_out = self.layer_norm(lstm_out)

        # Attention + projection
        lstm_out = self.attention(lstm_out, attention_mask)
        lstm_out = self.hidden_fc(lstm_out)

        # Linear to tag space
        emissions = self.fc(lstm_out)

        # Compute CRF loss or decode
        if tags is not None:
            log_likelihood = self.crf(emissions, tags, mask=attention_mask.bool())
            return -log_likelihood.mean()
        else:
            return self.crf.decode(emissions, mask=attention_mask.bool())

### Training

In [33]:
def parse_label_idx(x):
    if isinstance(x, str):
        # split on whitespace and cast each token to int
        return [int(i) for i in x.strip().split()]
    return x

df = gs_to_df_pandas('training_ner')

df["label_idx"] = df["label_idx"].apply(parse_label_idx)

In [35]:
from torch.utils.data import Dataset

class trainDataset(Dataset):
    def __init__(self, df, tokenizer, label2id, max_length=85):
        self.df = df
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row["input_text"]

        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=self.max_length
        )

        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)

        # Convert label indices to tensor and pad/truncate
        label_ids = torch.tensor(row["label_idx"], dtype=torch.long)
        if len(label_ids) < self.max_length:
            pad_len = self.max_length - len(label_ids)
            label_ids = torch.cat([label_ids, torch.zeros(pad_len, dtype=torch.long)])
        else:
            label_ids = label_ids[:self.max_length]

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": label_ids
        }

In [36]:
dataset = trainDataset(df, tokenizer, label_ids, max_length=85)

sample = dataset[0]
print(sample["input_ids"].shape)      # torch.Size([max_len])
print(sample["attention_mask"].shape) # torch.Size([max_len])
print(sample["labels"].shape)         # torch.Size([max_len])

torch.Size([85])
torch.Size([85])
torch.Size([85])


In [13]:
import json

with open('D:/Study/Education/Projects/Group_Project/rag_model/model/NER/artifact/label2idx.json', 'r') as f:
    label_ids = json.load(f)

In [38]:
from torch.utils.data import random_split, DataLoader

total_size = len(dataset)
test_size = int(0.2 * total_size)
train_size = total_size - test_size

train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

model = BiLSTM_CRF(len(label_ids), freeze_phobert=True)
# for batch in dataloader:
#     input_ids = batch["input_ids"]
#     attention_mask = batch["attention_mask"]
#     labels = batch["labels"]
#     print(input_ids.shape, labels.shape)
#     break

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total params: {total/1e6:.1f}M, Trainable: {trainable/1e6:.1f}M")

Total params: 136.4M, Trainable: 1.4M


In [40]:
df.to_csv('training.csv')

In [39]:
from torch import optim
from torch.utils.data import DataLoader

# --- Setup ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)  # 1e-3 is safe for frozen PhoBERT
num_epochs = 50
batch_size = 8  # adjust as you like

# Use DataLoader for batching
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

model.train()

for epoch in range(num_epochs):
    total_loss = 0.0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        tags = batch["labels"].to(device) 

        optimizer.zero_grad()

        # Forward pass (CRF expects tags for training)
        loss = model(input_ids=input_ids, attention_mask=attention_mask, tags=tags)

        loss.backward()

        # Gradient clipping (important for stability)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)

        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

Epoch 1/50, Loss: 339.7435
Epoch 2/50, Loss: 233.1921


KeyboardInterrupt: 