In [1]:
# Mount Google Drive at /content/drive; the dataset and embedding paths used
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
import numpy as np
import torch.optim as optim
import pandas as pd

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class CustomDataset(Dataset):
    """Sentence-level dataset read from a tab-separated CoNLL-style file.

    Every non-blank line is expected to hold ``word<TAB>pos<TAB>ner`` (extra
    trailing columns are ignored); a blank line terminates the current
    sentence.  Items are returned as lists of integer ids looked up in the
    supplied mappings.

    Args:
        filepath: Path of the UTF-8 encoded data file.
        word2idx: Mapping word -> id. Must contain ``"<UNK>"``.
        pos2idx: Mapping POS tag -> id. Must contain ``"<UNK>"``.
        tag2idx: Mapping NER tag -> id. Must contain ``"<UNK>"``.

    The mappings are stored by reference (not copied), so entries added to the
    dicts after construction are visible to the dataset.
    """

    def __init__(self, filepath, word2idx, pos2idx, tag2idx):
        self.word2idx = word2idx
        self.pos2idx = pos2idx
        self.tag2idx = tag2idx
        self.sentences, self.pos_tags, self.ner_tags = self.load_data(filepath)

    def load_data(self, filepath):
        """Parse ``filepath`` into parallel lists of sentences, POS tags and NER tags.

        Returns:
            A tuple ``(sentences, pos_tags, ner_tags)``; each element is a list
            with one inner list of strings per sentence.
        """
        sentences, pos_tags, ner_tags = [], [], []
        sentence, pos_seq, ner_seq = [], [], []
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    parts = line.split("\t")
                    # Lines with fewer than three columns are skipped.
                    if len(parts) >= 3:
                        # Slice before unpacking: the guard admits lines with
                        # more than three columns, which would otherwise raise
                        # "too many values to unpack".
                        word, pos, ner = parts[:3]
                        sentence.append(word)
                        pos_seq.append(pos)
                        ner_seq.append(ner)
                else:
                    # Blank line: close the sentence collected so far, if any.
                    if sentence:
                        sentences.append(sentence)
                        pos_tags.append(pos_seq)
                        ner_tags.append(ner_seq)
                        sentence, pos_seq, ner_seq = [], [], []
            # The file may end without a trailing blank line.
            if sentence:
                sentences.append(sentence)
                pos_tags.append(pos_seq)
                ner_tags.append(ner_seq)
        return sentences, pos_tags, ner_tags

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return sentence ``idx`` as a dict of id lists.

        Symbols missing from a mapping are replaced by that mapping's
        ``"<UNK>"`` id.  Keys: ``"input_ids"``, ``"pos_ids"``, ``"ner_ids"``.
        """
        word_ids = [self.word2idx.get(w, self.word2idx["<UNK>"]) for w in self.sentences[idx]]
        pos_ids = [self.pos2idx.get(p, self.pos2idx["<UNK>"]) for p in self.pos_tags[idx]]
        tag_ids = [self.tag2idx.get(t, self.tag2idx["<UNK>"]) for t in self.ner_tags[idx]]
        return {
                "input_ids": word_ids,
                "pos_ids": pos_ids,
                "ner_ids": tag_ids}

def collate_fn(batch):
    """Pad a list of dataset samples to a common length and turn them into tensors.

    Each sequence is right-padded with id 0 up to the length of the longest
    ``"input_ids"`` sequence in the batch.

    Args:
        batch: List of dicts with keys ``"input_ids"``, ``"pos_ids"`` and
            ``"ner_ids"``, each holding a list of ints.

    Returns:
        ``(input_ids, pos_ids, ner_ids, attention_mask)``: three ``torch.long``
        tensors and one ``torch.bool`` mask that is True at real-token
        positions and False at padding.
    """
    pad = 0
    # The generator is consumed in key order, so lookups happen in the same
    # sequence as three separate comprehensions would.
    words, pos, ner = (
        [sample[key] for sample in batch]
        for key in ("input_ids", "pos_ids", "ner_ids")
    )

    max_len = max(map(len, words))

    def right_pad(sequences):
        # Pad every sequence based on its own length.
        return [seq + [pad] * (max_len - len(seq)) for seq in sequences]

    padded_words = right_pad(words)
    padded_pos = right_pad(pos)
    padded_ner = right_pad(ner)
    masks = [[position < len(seq) for position in range(max_len)] for seq in words]

    return (
        torch.tensor(padded_words, dtype=torch.long),
        torch.tensor(padded_pos, dtype=torch.long),
        torch.tensor(padded_ner, dtype=torch.long),
        torch.tensor(masks, dtype=torch.bool),
    )




In [8]:
# Locations of the train / validation / test CoNLL files on the mounted Drive.
train_path = "/content/drive/MyDrive/Datasets/train_v5.conll"
val_path = "/content/drive/MyDrive/Datasets/val_v5.conll"
test_path = "/content/drive/MyDrive/Datasets/test_v5.conll"

Load Dataset

In [5]:
# Force-reinstall compatible versions (uncomment the lines below on a fresh runtime)
# !pip install gensim
# !pip install numpy==1.24.3 --force-reinstall
# !pip install pytorch-crf



In [6]:
from gensim.models import KeyedVectors
# Load the pretrained FastText vectors from the text-format (.vec) file.
fasttext_model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Datasets/cc.my.300.vec', binary=False)
# Source: https://fasttext.cc/docs/en/crawl-vectors.html — choose Burmese and download the text (.vec) file.

In [14]:
# Seed every mapping with the reserved padding (0) and unknown (1) entries.
vocab = {"<PAD>": 0, "<UNK>": 1}
ner_tag_to_ix = {"<PAD>": 0, "<UNK>": 1}
pos_tag_to_ix = {"<PAD>": 0, "<UNK>": 1}

# The datasets keep references to these dicts (they are not copied), so
# entries added to the dicts afterwards are visible to the datasets.
train_data = CustomDataset(train_path, vocab, pos_tag_to_ix, ner_tag_to_ix)
val_data = CustomDataset(val_path, vocab, pos_tag_to_ix, ner_tag_to_ix)
test_data = CustomDataset(test_path, vocab, pos_tag_to_ix, ner_tag_to_ix)

In [15]:
# Grow the word / NER / POS mappings in place with every symbol that occurs in
# any of the three splits; a new symbol receives the next free index.
for dataset in [train_data, val_data, test_data]:
    for sentence, pos_tags, ner_tags in zip(dataset.sentences, dataset.pos_tags, dataset.ner_tags):
        for word in sentence:
            vocab.setdefault(word, len(vocab))
        for ner_tag in ner_tags:
            ner_tag_to_ix.setdefault(ner_tag, len(ner_tag_to_ix))
        for pos_tag in pos_tags:
            pos_tag_to_ix.setdefault(pos_tag, len(pos_tag_to_ix))

# Build the embedding matrix aligned with `vocab` indices.
embedding_dim = 300
embedding_matrix = np.zeros((len(vocab), embedding_dim))  # every row starts as zeros

for word, idx in vocab.items():
    if word == "<PAD>":
        # Keep the padding row at zero. The previous check compared against ""
        # and therefore never matched the padding token, which ended up with
        # a random vector.
        continue
    if word in fasttext_model:
        embedding_matrix[idx] = fasttext_model[word]
    elif word != "":
        # Words without a pretrained vector get a random initialisation;
        # an empty-string entry (if any) stays at zero as before.
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))

# Convert to torch tensor
fasttext_embeddings = torch.tensor(embedding_matrix, dtype=torch.float32)

# Inverse mappings (index -> tag string) used to turn predicted ids back into
# readable labels.
ix_to_ner_tag = {v: k for k, v in ner_tag_to_ix.items()}
ix_to_pos_tag = {v: k for k, v in pos_tag_to_ix.items()}

In [16]:
# Point every dataset at the final mappings. The datasets were constructed
# with these same dict objects, so this only matters if one of the names has
# been rebound to a new dict in between; all three mappings that
# CustomDataset.__getitem__ reads are assigned so they cannot drift apart.
for dataset in [train_data, val_data, test_data]:
    dataset.word2idx = vocab
    dataset.pos2idx = pos_tag_to_ix
    dataset.tag2idx = ner_tag_to_ix

In [17]:
# Sizes derived from the finished mappings; printed as a quick sanity check.
hidden_dim = 256  # NOTE(review): not referenced by any other visible cell — confirm it is still needed
vocab_size = len(vocab)
num_ner_tags = len(ner_tag_to_ix)
print(vocab_size)
print(num_ner_tags)

19304
27


In [18]:
# Batches of 32 sentences, padded per batch by collate_fn. Only the training
# split is shuffled.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
# The pytorch-crf package is imported through the lowercase module `torchcrf`.
# It provides the CRF(num_tags, batch_first=...), forward(..., reduction=...)
# and decode(...) API that TransformerNER relies on.
from torchcrf import CRF

class TransformerNER(nn.Module):
    """Transformer encoder with two token-level heads: NER (CRF) and POS (linear).

    Args:
        embedding_matrix: 2-D float tensor of pretrained word vectors; its
            second dimension is used as the model width and must be divisible
            by the 6 attention heads.
        num_ner_tags: Number of NER labels (size of the NER head and the CRF).
        num_pos_tags: Number of POS labels (size of the POS head).
        dropout: Dropout applied to the word embeddings and inside the encoder.
        num_layers: Number of stacked encoder layers.
        max_len: Number of learned position embeddings, i.e. the longest
            sequence the model accepts.
    """

    def __init__(self, embedding_matrix, num_ner_tags, num_pos_tags, dropout=0.3, num_layers=2, max_len=512):
        super().__init__()
        dim = embedding_matrix.shape[1]
        # Pretrained vectors are fine-tuned together with the rest of the model.
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.embedding_dropout = nn.Dropout(dropout)

        # Learned absolute position embeddings.
        self.position_embedding = nn.Embedding(max_len, dim)

        encoder_layer = TransformerEncoderLayer(
            d_model=dim,
            nhead=6,
            dim_feedforward=1024,
            dropout=dropout,
            batch_first=True
        )
        self.encoder = TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.ner_classifier = nn.Linear(dim, num_ner_tags)
        self.pos_classifier = nn.Linear(dim, num_pos_tags)
        self.crf_ner = CRF(num_ner_tags, batch_first=True)

    def forward(self, x, mask=None):
        """Compute per-token NER emissions and POS logits.

        Args:
            x: Long tensor of word ids, indexed as (batch, sequence).
            mask: Optional mask that is True (non-zero) at real tokens and
                False at padding. ``None`` means no position is padding.

        Returns:
            ``(ner_logits, pos_logits)``, each with one score vector per token.

        Raises:
            ValueError: If the sequence is longer than the position table.
        """
        seq_len = x.size(1)
        max_positions = self.position_embedding.num_embeddings
        if seq_len > max_positions:
            # Fail with a readable message instead of an out-of-range
            # embedding lookup.
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum supported length {max_positions}"
            )

        x = self.embedding(x)
        x = self.embedding_dropout(x)

        positions = torch.arange(0, seq_len, device=x.device).unsqueeze(0).expand(x.size(0), -1)
        x = x + self.position_embedding(positions)

        # PyTorch expects the padding mask to be True at padded positions,
        # i.e. the inverse of our attention mask. `mask` is optional, so it
        # must not be inverted unconditionally.
        padding_mask = None if mask is None else ~mask.bool()
        x = self.encoder(x, src_key_padding_mask=padding_mask)

        ner_logits = self.ner_classifier(x)
        pos_logits = self.pos_classifier(x)
        return ner_logits, pos_logits

    def loss(self, ner_emissions, ner_tags, pos_logits, pos_tags, mask, alpha=1.0, ner_weights=None, pos_weights=None):
        """Combined training objective ``ner_loss + alpha * pos_loss``.

        The NER term is a weighted token-level cross-entropy when
        ``ner_weights`` is given, otherwise the negated CRF output with
        ``reduction='mean'``. The POS term is always cross-entropy. Both
        cross-entropy terms ignore targets equal to 0 (the padding id).

        Returns:
            ``(total_loss, ner_loss_value, pos_loss_value)`` where the first
            element is a tensor suitable for ``backward()`` and the other two
            are Python floats for logging.
        """
        if ner_weights is not None:
            ner_logits_flat = ner_emissions.reshape(-1, ner_emissions.size(-1))
            ner_targets_flat = ner_tags.reshape(-1)
            token_loss = F.cross_entropy(ner_logits_flat, ner_targets_flat, weight=ner_weights, ignore_index=0)
        else:
            # The CRF returns a log-likelihood, so negate it to obtain a loss.
            token_loss = -self.crf_ner(ner_emissions, ner_tags, mask=mask, reduction='mean')

        # POS loss (always CrossEntropy)
        pos_logits_flat = pos_logits.reshape(-1, pos_logits.size(-1))
        pos_targets_flat = pos_tags.reshape(-1)

        ce_loss = F.cross_entropy(pos_logits_flat, pos_targets_flat, weight=pos_weights, ignore_index=0)

        return token_loss + alpha * ce_loss, token_loss.item(), ce_loss.item()

    def decode(self, ner_emissions, mask):
        """Return the CRF's decoded NER tag ids for each sequence in the batch."""
        return self.crf_ner.decode(ner_emissions, mask=mask)



In [24]:
# Build the tagger on top of the FastText matrix; head sizes come from the tag
# mappings. Dropout and depth use the class defaults.
model = TransformerNER(
    embedding_matrix=fasttext_embeddings,
    num_ner_tags=len(ner_tag_to_ix),
    num_pos_tags=len(pos_tag_to_ix)
).to(device)

# Adam over all model parameters (the embeddings are created with freeze=False).
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

In [28]:
# Joint NER + POS training with a validation pass after every epoch.
num_epochs = 15
alpha = 1.0  # weight of the POS loss in the combined objective (see TransformerNER.loss)

for epoch in range(num_epochs):
    model.train()
    # Running sums of the per-batch losses; divided by the batch count below.
    total_loss, total_ner_loss, total_pos_loss = 0.0, 0.0, 0.0

    for input_ids, pos_ids, ner_ids, attention_mask in train_loader:
        input_ids = input_ids.to(device)
        pos_ids = pos_ids.to(device)
        ner_ids = ner_ids.to(device)
        attention_mask = attention_mask.to(device)

        optimizer.zero_grad()

        ner_logits, pos_logits = model(input_ids, mask=attention_mask)
        # No class weights are passed, so the NER term takes the CRF branch of
        # TransformerNER.loss.
        loss, ner_loss_val, pos_loss_val = model.loss(
            ner_logits,
            ner_ids,
            pos_logits,
            pos_ids,
            mask=attention_mask,
            alpha=alpha
        )

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_ner_loss += ner_loss_val
        total_pos_loss += pos_loss_val

    avg_train_loss = total_loss / len(train_loader)

    # Validation: same objective, evaluated in eval mode without gradient
    # tracking and without optimizer steps.
    model.eval()
    val_loss, val_ner_loss, val_pos_loss = 0.0, 0.0, 0.0
    with torch.no_grad():
        for input_ids, pos_ids, ner_ids, attention_mask in val_loader:
            input_ids = input_ids.to(device)
            pos_ids = pos_ids.to(device)
            ner_ids = ner_ids.to(device)
            attention_mask = attention_mask.to(device)

            ner_logits, pos_logits = model(input_ids, mask=attention_mask)
            loss, ner_loss_val, pos_loss_val = model.loss(
                ner_logits,
                ner_ids,
                pos_logits,
                pos_ids,
                mask=attention_mask,
                alpha=alpha
            )

            val_loss += loss.item()
            val_ner_loss += ner_loss_val
            val_pos_loss += pos_loss_val

    avg_val_loss = val_loss / len(val_loader)

    # Reported numbers are means over batches, not over sentences or tokens.
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"  Train Loss: {avg_train_loss:.4f} | NER: {total_ner_loss / len(train_loader):.4f}, POS: {total_pos_loss / len(train_loader):.4f}")
    print(f"  Val   Loss: {avg_val_loss:.4f} | NER: {val_ner_loss / len(val_loader):.4f}, POS: {val_pos_loss / len(val_loader):.4f}")


  output = torch._nested_tensor_from_mask(


Epoch 1/15
  Train Loss: 7.4835 | NER: 5.9346, POS: 1.5489
  Val   Loss: 4.4137 | NER: 3.2585, POS: 1.1552
Epoch 2/15
  Train Loss: 4.1091 | NER: 3.0800, POS: 1.0291
  Val   Loss: 3.1656 | NER: 2.4054, POS: 0.7602
Epoch 3/15
  Train Loss: 3.0745 | NER: 2.3299, POS: 0.7446
  Val   Loss: 2.6280 | NER: 2.0581, POS: 0.5700
Epoch 4/15
  Train Loss: 2.5071 | NER: 1.9126, POS: 0.5945
  Val   Loss: 2.4234 | NER: 1.9128, POS: 0.5105
Epoch 5/15
  Train Loss: 2.1389 | NER: 1.6343, POS: 0.5046
  Val   Loss: 2.1296 | NER: 1.7108, POS: 0.4188
Epoch 6/15
  Train Loss: 1.8638 | NER: 1.4217, POS: 0.4421
  Val   Loss: 1.9795 | NER: 1.6170, POS: 0.3625
Epoch 7/15
  Train Loss: 1.6410 | NER: 1.2496, POS: 0.3914
  Val   Loss: 1.7871 | NER: 1.4720, POS: 0.3151
Epoch 8/15
  Train Loss: 1.4545 | NER: 1.1042, POS: 0.3503
  Val   Loss: 1.8215 | NER: 1.5194, POS: 0.3022
Epoch 9/15
  Train Loss: 1.3295 | NER: 1.0098, POS: 0.3196
  Val   Loss: 1.6818 | NER: 1.4226, POS: 0.2592
Epoch 10/15
  Train Loss: 1.1727 | NE

In [29]:
from sklearn.metrics import classification_report, f1_score, accuracy_score

def evaluate_ner(model, test_loader, ner_tag_to_ix, device=device):
    """Decode NER tags for every batch and print token-level metrics.

    Args:
        model: Model exposing ``__call__(ids, mask=...)`` returning
            ``(ner_emissions, pos_logits)`` and ``decode(emissions, mask=...)``.
        test_loader: Iterable of ``(input_ids, pos_ids, ner_ids, attention_mask)``
            batches.
        ner_tag_to_ix: Mapping NER tag -> id, inverted here to label the output.
        device: Device the batches are moved to. The default is the value of
            the global ``device`` at the time this function was defined.

    Returns:
        ``(all_labels, all_preds)``: flat lists of gold and predicted tag
        strings covering the non-padding tokens.
    """
    model.eval()
    ix_to_ner = {v: k for k, v in ner_tag_to_ix.items()}

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for sentences, pos_tags, ner_tags, attention_mask in test_loader:
            sentences = sentences.to(device)
            ner_tags = ner_tags.to(device)
            attention_mask = attention_mask.to(device).bool()

            ner_emissions, _ = model(sentences, mask=attention_mask)
            predictions = model.decode(ner_emissions, mask=attention_mask)

            for i in range(len(predictions)):
                # Number of real (non-padding) tokens in sentence i.
                true_len = attention_mask[i].sum().item()
                pred_seq = predictions[i][:true_len]
                true_seq = ner_tags[i][:true_len].cpu().tolist()

                all_preds.extend([ix_to_ner[p] for p in pred_seq])
                all_labels.extend([ix_to_ner[t] for t in true_seq])

    print("\n🧾 NER Classification Report:")
    print(classification_report(all_labels, all_preds, digits=4, zero_division=0))

    # These summary metrics used to be computed and then dropped; report them.
    macro_f1 = f1_score(all_labels, all_preds, average="macro", zero_division=0)
    acc = accuracy_score(all_labels, all_preds)
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"Accuracy: {acc:.4f}")

    return all_labels, all_preds


In [54]:
# Save the model's parameters (state_dict, not the pickled module) together
# with the vocabulary and the POS / NER tag mappings in a single checkpoint.
torch.save({
    "model_state_dict": model.state_dict(),
    "vocab": vocab,
    "pos_tag_to_ix": pos_tag_to_ix,
    "ner_tag_to_ix": ner_tag_to_ix,
}, "TransformerEncoding_model_ver1.pt")


#### Test Evaluation

In [34]:
from sklearn.metrics import classification_report
import torch

def evaluate_model(model, dataloader, ner_tag_to_ix, device):
    """Print a token-level NER classification report for ``dataloader``.

    Args:
        model: Model exposing ``__call__(ids, mask=...)`` returning
            ``(ner_logits, pos_logits)`` and ``decode(logits, mask=...)``.
        dataloader: Iterable of ``(input_ids, pos_ids, ner_ids, attention_mask)``
            batches.
        ner_tag_to_ix: Mapping NER tag -> id, inverted here to label the report.
        device: Device the batch tensors are moved to.

    Returns:
        None. The report is printed.
    """
    model.eval()
    id_to_tag = {index: tag for tag, index in ner_tag_to_ix.items()}

    gold_tags, predicted_tags = [], []

    with torch.no_grad():
        for input_ids, _pos_ids, ner_ids, attention_mask in dataloader:
            input_ids = input_ids.to(device)
            ner_ids = ner_ids.to(device)
            attention_mask = attention_mask.to(device)

            emissions, _ = model(input_ids, mask=attention_mask)
            decoded = model.decode(emissions, mask=attention_mask)

            # Keep only the non-padding prefix of every sentence.
            for row in range(len(input_ids)):
                n_tokens = attention_mask[row].sum().item()
                gold_ids = ner_ids[row][:n_tokens].tolist()
                pred_ids = decoded[row][:n_tokens]

                gold_tags += [id_to_tag[ix] for ix in gold_ids]
                predicted_tags += [id_to_tag[ix] for ix in pred_ids]

    print("NER Evaluation Report:")
    print(classification_report(gold_tags, predicted_tags, digits=4, zero_division=0))


In [35]:
# Print the NER classification report for the test split.
evaluate_model(model, test_loader, ner_tag_to_ix, device)


NER Evaluation Report:
              precision    recall  f1-score   support

      B-DATE     0.8548    0.8030    0.8281        66
       B-LOC     0.9760    0.9645    0.9702      1182
       B-NUM     0.2500    0.2667    0.2581        15
       B-ORG     0.7097    0.4583    0.5570        48
       B-PER     0.6522    0.8824    0.7500        34
      B-TIME     0.4667    0.7778    0.5833         9
      E-DATE     0.8281    0.8030    0.8154        66
       E-LOC     0.9795    0.9679    0.9736      1182
       E-NUM     0.2500    0.2667    0.2581        15
       E-ORG     0.5641    0.4583    0.5057        48
       E-PER     0.6750    0.7941    0.7297        34
      E-TIME     0.5714    0.8889    0.6957         9
      I-DATE     0.7727    0.8947    0.8293        38
       I-LOC     0.9839    0.9702    0.9770       503
       I-ORG     0.4667    0.3590    0.4058        39
      I-TIME     0.0000    0.0000    0.0000         0
           O     0.9873    0.9886    0.9879     21324
    

In [37]:
def predict_single_sentence(model, sentence, word2idx, ner_idx2tag, device, max_len=128):
    """Tag one tokenised sentence with NER labels.

    Args:
        model: Model exposing ``__call__(ids, mask=...)`` returning
            ``(ner_logits, pos_logits)`` and ``decode(logits, mask=...)``.
        sentence: Iterable of token strings.
        word2idx: Mapping word -> id containing ``"<UNK>"`` and ``"<PAD>"``.
        ner_idx2tag: Mapping NER id -> tag string.
        device: Device the input tensors are created on.
        max_len: Length the input is right-padded to. Longer sentences are
            passed through unpadded and untruncated.

    Returns:
        List of ``(token, tag)`` pairs, empty for an empty sentence.
    """
    model.eval()

    # Materialise once so generators are not exhausted before the final zip.
    tokens = list(sentence)
    if not tokens:
        # An all-padding input has no valid position for the encoder or the
        # CRF, so there is nothing to predict.
        return []

    # Token to index
    input_ids = [word2idx.get(word, word2idx["<UNK>"]) for word in tokens]
    length = len(input_ids)

    # Pad
    pad_count = max(max_len - length, 0)
    input_ids += [word2idx["<PAD>"]] * pad_count
    attention_mask = [1] * length + [0] * pad_count

    # To tensor
    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)
    mask_tensor = torch.tensor([attention_mask], dtype=torch.bool).to(device)

    # Predict
    with torch.no_grad():
        ner_logits, _ = model(input_tensor, mask=mask_tensor)
        predictions = model.decode(ner_logits, mask=mask_tensor)[0][:length]  # only first row, no padding

    # Convert to tag names
    tag_names = [ner_idx2tag[tag_id] for tag_id in predictions]

    return list(zip(tokens, tag_names))


In [58]:
# Example sentence, already split into tokens (the model expects a token list).
sentence = ["၁၉၄၈", "ခုနှစ်", "ဇန်နဝါရီ", "လ", "၄", "ရက်နေ့တွင်", "မြန်မာ", "နိုင်ငံ", "လွတ်လပ်ခြင်း", "ရရှိခဲ့သည်။"]

# Run prediction with the trained model and the index -> tag lookup.
results = predict_single_sentence(model, sentence, vocab, ix_to_ner_tag, device)

# Print one "token → tag" line per token.
for word, tag in results:
    print(f"{word:15} → {tag}")


၁၉၄၈            → B-DATE
ခုနှစ်          → I-DATE
ဇန်နဝါရီ        → I-DATE
လ               → I-DATE
၄               → I-DATE
ရက်နေ့တွင်      → O
မြန်မာ          → B-LOC
နိုင်ငံ         → E-LOC
လွတ်လပ်ခြင်း    → O
ရရှိခဲ့သည်။     → O


END