New Upload

In [None]:
# Mount Google Drive at /content/drive; the dataset and embedding paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
import numpy as np
import torch.optim as optim
import pandas as pd
import math

In [None]:
import os
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is presumably a debugging aid (CUDA
# documents it as disabling asynchronous kernel launches, which slows
# training) -- confirm it is still wanted before long training runs.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class CustomDataset(Dataset):
    """Sentence-level dataset read from a tab-separated CoNLL-style file.

    Every non-blank line is expected to hold ``word<TAB>pos<TAB>ner``; a blank
    line terminates the current sentence.

    Args:
        filepath: Path of the file to read (opened as UTF-8).
        word2idx: Mapping word -> integer id; must contain ``"<UNK>"``.
        pos2idx: Mapping POS tag -> integer id; must contain ``"<UNK>"``.
        tag2idx: Mapping NER tag -> integer id; must contain ``"<UNK>"``.

    The mappings are stored by reference and are only consulted in
    ``__getitem__``, so entries added to them after construction are used.
    """

    def __init__(self, filepath, word2idx, pos2idx, tag2idx):
        self.word2idx = word2idx
        self.pos2idx = pos2idx
        self.tag2idx = tag2idx
        self.sentences, self.pos_tags, self.ner_tags = self.load_data(filepath)

    def load_data(self, filepath):
        """Parse ``filepath`` into parallel per-sentence lists.

        Lines with fewer than three tab-separated fields are skipped. Lines
        with more than three fields are accepted and only their first three
        fields (word, POS tag, NER tag) are read.

        Returns:
            ``(sentences, pos_tags, ner_tags)``: three lists with one inner
            list of strings per sentence.
        """
        sentences, pos_tags, ner_tags = [], [], []
        sentence, pos_seq, ner_seq = [], [], []
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    parts = line.split("\t")
                    if len(parts) >= 3:
                        # Slice so that extra columns do not break the unpacking.
                        word, pos, ner = parts[:3]
                        sentence.append(word)
                        pos_seq.append(pos)
                        ner_seq.append(ner)
                elif sentence:
                    # Blank line: close the sentence collected so far.
                    sentences.append(sentence)
                    pos_tags.append(pos_seq)
                    ner_tags.append(ner_seq)
                    sentence, pos_seq, ner_seq = [], [], []
            # The file may end without a trailing blank line.
            if sentence:
                sentences.append(sentence)
                pos_tags.append(pos_seq)
                ner_tags.append(ner_seq)
        return sentences, pos_tags, ner_tags

    def __len__(self):
        """Number of sentences loaded from the file."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(word_ids, pos_ids, tag_ids)`` for sentence ``idx``.

        Items missing from a mapping are replaced by that mapping's
        ``"<UNK>"`` id. The three results are plain Python lists of ints.
        """
        word_ids = [self.word2idx.get(w, self.word2idx["<UNK>"]) for w in self.sentences[idx]]
        pos_ids = [self.pos2idx.get(p, self.pos2idx["<UNK>"]) for p in self.pos_tags[idx]]
        tag_ids = [self.tag2idx.get(t, self.tag2idx["<UNK>"]) for t in self.ner_tags[idx]]
        return word_ids, pos_ids, tag_ids


def collate_fn(batch):
    """Pad a list of ``(word_ids, pos_ids, tag_ids)`` samples into tensors.

    Every sequence is right-padded with id 0 up to the length of the longest
    word sequence in the batch.

    Returns:
        A 4-tuple ``(words, pos, ner, mask)``. The first three are ``long``
        tensors; ``mask`` is a ``bool`` tensor that is True on real tokens and
        False on padding.
    """
    PAD_ID = 0
    word_seqs, pos_seqs, ner_seqs = zip(*batch)
    width = max(map(len, word_seqs))

    def right_pad(seq):
        # Each sequence is padded relative to its own length.
        return seq + [PAD_ID] * (width - len(seq))

    words = torch.tensor(list(map(right_pad, word_seqs)), dtype=torch.long)
    pos = torch.tensor(list(map(right_pad, pos_seqs)), dtype=torch.long)
    ner = torch.tensor(list(map(right_pad, ner_seqs)), dtype=torch.long)
    mask = torch.tensor(
        [[position < len(seq) for position in range(width)] for seq in word_seqs],
        dtype=torch.bool,
    )
    return words, pos, ner, mask



In [None]:
# Train / validation / test splits (.conll files) stored on the mounted Drive.
train_path = "/content/drive/MyDrive/Datasets/train_v5.conll"
val_path = "/content/drive/MyDrive/Datasets/val_v5.conll"
test_path = "/content/drive/MyDrive/Datasets/test_v5.conll"

Load Dataset

In [None]:
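# One-time setup for a fresh Colab runtime: uncomment the lines below to install gensim and pytorch-crf and to force-reinstall a compatible numpy version.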
# !pip install gensim
# !pip install numpy==1.24.3 --force-reinstall
# !pip install pytorch-crf

In [None]:
from gensim.models import KeyedVectors
# Load the pre-trained fastText vectors from their word2vec text (.vec) format.
fasttext_model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Datasets/cc.my.300.vec', binary=False)
# Source: https://fasttext.cc/docs/en/crawl-vectors.html -- choose Burmese, then the text (.vec) download.

In [None]:
# Create vocabulary and tag-to-index mappings.
# Id 0 is reserved for padding and id 1 for unknown items in every mapping.
vocab = {"<PAD>": 0, "<UNK>": 1}
ner_tag_to_ix = {"<PAD>": 0, "<UNK>": 1}
pos_tag_to_ix = {"<PAD>": 0, "<UNK>": 1}

# The datasets keep references to these dicts, which at this point hold only the
# two special entries; they are filled in place afterwards from the loaded sentences.
train_data = CustomDataset(train_path, vocab, pos_tag_to_ix, ner_tag_to_ix)
val_data = CustomDataset(val_path, vocab, pos_tag_to_ix, ner_tag_to_ix)
test_data = CustomDataset(test_path, vocab, pos_tag_to_ix, ner_tag_to_ix)


In [None]:
# Build vocab and tag mappings from datasets (the dicts are extended in place,
# so the datasets that hold references to them see the new entries).
# NOTE(review): validation and test tokens are added to the vocabulary as well,
# so no word of those splits maps to <UNK> at evaluation time -- confirm this
# is intended.
for dataset in [train_data, val_data, test_data]:
    for sentence, pos_tags, ner_tags in zip(dataset.sentences, dataset.pos_tags, dataset.ner_tags):
        for word in sentence:
            vocab.setdefault(word, len(vocab))
        for ner_tag in ner_tags:
            ner_tag_to_ix.setdefault(ner_tag, len(ner_tag_to_ix))
        for pos_tag in pos_tags:
            pos_tag_to_ix.setdefault(pos_tag, len(pos_tag_to_ix))

# Load FastText embeddings
embedding_dim = 300
embedding_matrix = np.zeros((len(vocab), embedding_dim))  # the <PAD> row stays all-zero

# Dedicated, seeded generator: the random vectors given to words missing from
# fastText are then identical on every run instead of depending on the state
# of NumPy's global RNG.
OOV_SEED = 42
oov_rng = np.random.default_rng(OOV_SEED)

for word, idx in vocab.items():
    if word == "<PAD>":
        # Checked first so the padding row is zero even if the token happens
        # to exist in the pre-trained vocabulary.
        continue
    if word in fasttext_model:
        embedding_matrix[idx] = fasttext_model[word]
    else:
        embedding_matrix[idx] = oov_rng.normal(scale=0.6, size=embedding_dim)

# Convert to torch tensor
fasttext_embeddings = torch.tensor(embedding_matrix, dtype=torch.float32)

# Reverse lookup for decoding
ix_to_ner_tag = {v: k for k, v in ner_tag_to_ix.items()}
ix_to_pos_tag = {v: k for k, v in pos_tag_to_ix.items()}


In [None]:
# Re-point every dataset at the final mappings. While the dicts are only
# mutated in place this is a no-op, but it keeps the datasets correct if a
# mapping is ever rebuilt as a new object. The POS mapping is included so all
# three lookups used by CustomDataset.__getitem__ stay in sync.
for dataset in [train_data, val_data, test_data]:
    dataset.word2idx = vocab
    dataset.pos2idx = pos_tag_to_ix
    dataset.tag2idx = ner_tag_to_ix


In [None]:
# NOTE(review): hidden_dim is not referenced by any code visible in this
# notebook -- confirm whether it can be removed.
hidden_dim = 256
vocab_size = len(vocab)
num_ner_tags = len(ner_tag_to_ix)
# Sanity check of the mapping sizes.
print(vocab_size)
print(num_ner_tags)

19304
27


In [None]:
# Batches of 32 sentences, padded per batch by collate_fn; only the training
# split is shuffled.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torchcrf import CRF

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    Even feature columns hold sines and odd columns hold cosines of the
    position scaled by a geometric progression of frequencies. The table is
    registered as the buffer ``pe`` of shape ``(max_len, d_model)``.

    Args:
        max_len: Largest sequence length that can be encoded.
        d_model: Size of the feature (last) dimension; may be even or odd.
    """

    def __init__(self, max_len, d_model):
        super().__init__()
        position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * -(math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so the angle table is trimmed to fit.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, d_model)``.

        Raises:
            RuntimeError: If ``seq_len`` exceeds ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(0):
            raise RuntimeError(
                f"sequence length {seq_len} exceeds the positional-encoding "
                f"max_len {self.pe.size(0)}"
            )
        # unsqueeze(0) adds the batch axis so the table broadcasts over the batch.
        return x + self.pe[:seq_len].unsqueeze(0)

class FeedForward(nn.Module):
    """Two-layer feed-forward network: expand, GELU, project back.

    Args:
        dim: Input and output feature size.
        expension_factor: Multiplier giving the hidden width
            (``dim * expension_factor``).
        dropout: Dropout probability applied after the activation and again
            after the output projection.
    """

    def __init__(self, dim, expension_factor, dropout):
        super().__init__()
        inner_dim = dim * expension_factor
        self.fc1 = nn.Linear(dim, inner_dim)
        self.fc2 = nn.Linear(inner_dim, dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the network to the last dimension of ``x``."""
        hidden = F.gelu(self.fc1(x))
        hidden = self.dropout1(hidden)
        projected = self.fc2(hidden)
        return self.dropout2(projected)

class Fourier(nn.Module):
    """Parameter-free mixing layer built on the discrete Fourier transform.

    The input is transformed along its last dimension and then along
    dimension 1; the real part of the result goes through ReLU and dropout.

    Args:
        dropout: Dropout probability applied to the output.
    """

    def __init__(self, dropout=0.3):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.act = nn.ReLU()

    def forward(self, x):
        """Mix ``x`` and return a float32 tensor of the same shape."""
        # The transforms run in double precision; the result is cast back to
        # float32 before dropout.
        spectrum = torch.fft.fft(torch.fft.fft(x.double(), dim=-1), dim=1)
        mixed = self.act(spectrum.real).float()
        return self.dropout(mixed)

class FNetBlock(nn.Module):
    """One encoder block: Fourier mixing, then a feed-forward network.

    Each of the two sub-layers is wrapped in a residual connection followed
    by LayerNorm.

    Args:
        dim: Feature size of the inputs and outputs.
        expension_factor: Hidden-width multiplier passed to ``FeedForward``.
        dropout: Dropout probability passed to both sub-layers.
    """

    def __init__(self, dim, expension_factor, dropout):
        super().__init__()
        self.fourier = Fourier(dropout)
        self.ffn = FeedForward(dim, expension_factor, dropout)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor of the same shape."""
        mixed = self.norm1(self.fourier(x) + x)
        return self.norm2(self.ffn(mixed) + mixed)

class FNetNER(nn.Module):
    """FNet encoder with joint NER and POS tagging heads.

    Pipeline: trainable pre-trained embeddings -> dropout -> sinusoidal
    position encodings -> ``num_layers`` FNet blocks -> two per-token linear
    heads. A CRF over the NER tags is used by ``decode`` and, when no NER
    class weights are supplied, by ``loss``.

    Args:
        embedding_matrix: Float tensor ``(vocab_size, dim)`` of initial word
            embeddings; they are fine-tuned during training (``freeze=False``).
        num_ner_tags: Number of NER classes (size of the NER head and CRF).
        num_pos_tags: Number of POS classes (size of the POS head).
        expension_factor: Hidden-width multiplier of each feed-forward layer.
        dropout: Dropout probability used throughout the model.
        num_layers: Number of stacked FNet blocks.
        max_len: Longest sequence the positional encoding supports; longer
            inputs make the forward pass fail. Defaults to 256.
    """

    def __init__(self, embedding_matrix, num_ner_tags, num_pos_tags, expension_factor=4, dropout=0.3, num_layers=5,
                 max_len=256):
        super(FNetNER, self).__init__()
        dim = embedding_matrix.shape[1]
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.embedding_dropout = nn.Dropout(dropout)
        self.pos_en = PositionalEncoding(max_len, d_model=dim)

        self.encoder = nn.Sequential(*[
            FNetBlock(dim, expension_factor, dropout) for _ in range(num_layers)
        ])

        self.ner_classifier = nn.Linear(dim, num_ner_tags)
        self.pos_classifier = nn.Linear(dim, num_pos_tags)
        self.crf_ner = CRF(num_ner_tags, batch_first=True)

    def forward(self, x, mask=None):
        """Compute per-token NER and POS scores.

        Args:
            x: Long tensor of word ids, ``(batch_size, seq_len)``.
            mask: Accepted for call-site symmetry but not used here.
                NOTE(review): padded positions therefore take part in the
                Fourier mixing -- confirm that this is acceptable.

        Returns:
            ``(ner_logits, pos_logits)`` with shapes
            ``(batch_size, seq_len, num_ner_tags)`` and
            ``(batch_size, seq_len, num_pos_tags)``.
        """
        x = self.embedding(x)
        x = self.embedding_dropout(x)
        x = self.pos_en(x)
        x = self.encoder(x)

        ner_logits = self.ner_classifier(x)
        pos_logits = self.pos_classifier(x)

        return ner_logits, pos_logits

    def loss(self, ner_emissions, ner_tags, pos_logits, pos_tags, mask, alpha=1.0, ner_weights=None, pos_weights=None):
        """Joint loss ``ner_loss + alpha * pos_loss``.

        NER term: class-weighted token-level cross-entropy when
        ``ner_weights`` is given, otherwise the mean negative CRF
        log-likelihood over the masked sequences. POS term: cross-entropy,
        optionally class-weighted. Both cross-entropies skip targets equal to
        0 (the padding id).

        NOTE(review): on the weighted path the CRF parameters receive no
        gradient, yet ``decode`` always runs the CRF -- confirm that decoding
        with an untrained transition matrix is intended.

        Returns:
            ``(total_loss, ner_loss_value, pos_loss_value)``; the first is a
            tensor suitable for ``backward()``, the other two are floats.
        """
        if ner_weights is not None:
            # NER: weighted token-level loss approximation
            ner_logits_flat = ner_emissions.reshape(-1, ner_emissions.size(-1))
            ner_targets_flat = ner_tags.reshape(-1)
            token_loss = F.cross_entropy(ner_logits_flat, ner_targets_flat, weight=ner_weights, ignore_index=0)
        else:
            # The CRF returns a log-likelihood, hence the sign flip.
            token_loss = -self.crf_ner(ner_emissions, ner_tags, mask=mask, reduction='mean')

        # POS: CrossEntropy with weights
        pos_logits_flat = pos_logits.reshape(-1, pos_logits.size(-1))
        pos_targets_flat = pos_tags.reshape(-1)

        ce_loss = F.cross_entropy(pos_logits_flat, pos_targets_flat, weight=pos_weights, ignore_index=0)

        return token_loss + alpha * ce_loss, token_loss.item(), ce_loss.item()

    def decode(self, ner_emissions, mask):
        """Return the CRF's best NER tag-id sequence for every batch item."""
        return self.crf_ner.decode(ner_emissions, mask=mask)


In [None]:
from collections import Counter
import numpy as np

def compute_class_weights(tag_sequences, tag_to_ix):
    """Inverse-frequency class weights, indexed by tag id.

    ``weight(tag) = total_tags / (len(tag_to_ix) * count(tag))``. The
    ``"<PAD>"`` tag always gets weight 0.0, and a tag that never occurs in
    ``tag_sequences`` is treated as if it occurred once.

    Args:
        tag_sequences: Iterable of tag-string sequences to count.
        tag_to_ix: Mapping tag -> integer id.

    Returns:
        A float32 tensor whose element ``i`` is the weight of the tag mapped
        to id ``i``, independent of the iteration order of ``tag_to_ix``.
    """
    tag_counts = Counter(tag for seq in tag_sequences for tag in seq)
    total_tags = sum(tag_counts.values())
    num_classes = len(tag_to_ix)

    # Place each weight at its tag id instead of relying on dict order.
    size = max(tag_to_ix.values()) + 1 if tag_to_ix else 0
    weights = torch.zeros(size, dtype=torch.float32)
    for tag, idx in tag_to_ix.items():
        if tag == "<PAD>":
            continue  # padding keeps weight 0.0
        weights[idx] = total_tags / (num_classes * tag_counts.get(tag, 1))
    return weights
# Class weights are estimated from the training split only.
ner_weights = compute_class_weights(train_data.ner_tags, ner_tag_to_ix)
pos_weights = compute_class_weights(train_data.pos_tags, pos_tag_to_ix)


In [None]:
# Build the model from the pre-trained embedding matrix and move it to the selected device.
model = FNetNER(embedding_matrix=fasttext_embeddings, num_ner_tags=len(ner_tag_to_ix),num_pos_tags=len(pos_tag_to_ix)).to(device)
# Adam over all model parameters (the embeddings are trainable, so they are included).
optimizer = optim.Adam(model.parameters(), lr =0.001,weight_decay=1e-5)

In [None]:
# Display the module tree as a quick architecture check.
model

FNetNER(
  (embedding): Embedding(19304, 300)
  (embedding_dropout): Dropout(p=0.3, inplace=False)
  (pos_en): PositionalEncoding()
  (encoder): Sequential(
    (0): FNetBlock(
      (fourier): Fourier(
        (dropout): Dropout(p=0.3, inplace=False)
        (act): ReLU()
      )
      (ffn): FeedForward(
        (fc1): Linear(in_features=300, out_features=1200, bias=True)
        (fc2): Linear(in_features=1200, out_features=300, bias=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
      (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
    )
    (1): FNetBlock(
      (fourier): Fourier(
        (dropout): Dropout(p=0.3, inplace=False)
        (act): ReLU()
      )
      (ffn): FeedForward(
        (fc1): Linear(in_features=300, out_features=1200, bias=True)
        (fc2): Linear(in_features=1200, out_features=300, bias=True)
        (drop

In [None]:
def _run_epoch(model, loader, class_weights, alpha, device, optimizer, train):
    """Run one pass over ``loader`` and return the summed batch losses.

    When ``train`` is true every batch is followed by a backward pass and an
    optimizer step; otherwise no parameters are updated. Switching the model
    between train/eval mode and disabling gradients is left to the caller.

    Returns:
        ``(total_loss_sum, ner_loss_sum, pos_loss_sum)`` as floats.
    """
    total_sum, ner_sum, pos_sum = 0.0, 0.0, 0.0
    for sentences, pos_tags, ner_tags, attention_mask in loader:
        sentences = sentences.to(device)
        pos_tags = pos_tags.to(device)
        ner_tags = ner_tags.to(device)
        attention_mask = attention_mask.to(device).bool()

        if train:
            optimizer.zero_grad()

        ner_emissions, pos_logits = model(sentences, mask=attention_mask)
        loss, ner_loss_val, pos_loss_val = model.loss(
            ner_emissions, ner_tags, pos_logits, pos_tags, attention_mask,
            alpha=alpha,
            ner_weights=class_weights["ner"],
            pos_weights=class_weights["pos"]
        )

        if train:
            loss.backward()
            optimizer.step()

        total_sum += loss.item()
        ner_sum += ner_loss_val
        pos_sum += pos_loss_val
    return total_sum, ner_sum, pos_sum


def train_model(model, train_loader, val_loader, optimizer, ner_weights, pos_weights, num_epochs=10, alpha=1.0, device=device):
    """Train ``model`` and print per-epoch train / validation losses.

    Args:
        model: Module exposing ``forward(sentences, mask=...)`` returning
            ``(ner_emissions, pos_logits)`` and a ``loss(...)`` method
            returning ``(loss_tensor, ner_loss_float, pos_loss_float)``.
        train_loader: Loader yielding ``(sentences, pos_tags, ner_tags, mask)``.
        val_loader: Loader of the same form, used for validation only.
        optimizer: Optimizer stepped once per training batch.
        ner_weights: Class-weight tensor forwarded to ``model.loss`` for NER.
        pos_weights: Class-weight tensor forwarded to ``model.loss`` for POS.
        num_epochs: Number of passes over ``train_loader``.
        alpha: Weight of the POS term, forwarded to ``model.loss``.
        device: Device the batches and class weights are moved to.

    Returns:
        None. The same class weights are used for the validation loss, so the
        two reported losses are directly comparable.
    """
    train_weights = {
        "ner": ner_weights.to(device),
        "pos": pos_weights.to(device)
    }

    for epoch in range(num_epochs):
        model.train()
        total_loss, total_ner_loss, total_pos_loss = _run_epoch(
            model, train_loader, train_weights, alpha, device, optimizer, train=True
        )

        # Validation
        model.eval()
        with torch.no_grad():
            val_loss, val_ner_loss, val_pos_loss = _run_epoch(
                model, val_loader, train_weights, alpha, device, optimizer, train=False
            )

        # Averages; max(..., 1) keeps an empty loader from dividing by zero.
        num_train_batches = max(len(train_loader), 1)
        num_val_batches = max(len(val_loader), 1)

        print(f"\nEpoch {epoch+1}/{num_epochs}")
        print(f"  Train Loss     : {total_loss / num_train_batches:.4f} "
              f"(NER: {total_ner_loss / num_train_batches:.4f}, POS: {total_pos_loss / num_train_batches:.4f})")
        print(f"  Validation Loss: {val_loss / num_val_batches:.4f} "
              f"(NER: {val_ner_loss / num_val_batches:.4f}, POS: {val_pos_loss / num_val_batches:.4f})")


In [None]:
# NOTE(review): the saved output of this cell reports "Epoch n/5", so it was
# produced with a different `epochs` value than the 50 set here -- re-run the
# cell (or restore the value) so that code and output agree.
epochs = 50
train_model(
    model,
    train_loader,
    val_loader,
    optimizer,
    ner_weights=ner_weights,
    pos_weights=pos_weights,
    num_epochs=epochs,
    alpha=0.5,
    device=device
)



Epoch 1/5
  Train Loss     : 4.4492 (NER: 3.1398, POS: 2.6188)
  Validation Loss: 4.2976 (NER: 3.0289, POS: 2.5374)

Epoch 2/5
  Train Loss     : 4.1782 (NER: 2.9327, POS: 2.4910)
  Validation Loss: 4.0353 (NER: 2.8149, POS: 2.4407)

Epoch 3/5
  Train Loss     : 3.8803 (NER: 2.6710, POS: 2.4186)
  Validation Loss: 4.1496 (NER: 2.9742, POS: 2.3507)

Epoch 4/5
  Train Loss     : 3.7174 (NER: 2.5302, POS: 2.3744)
  Validation Loss: 3.7570 (NER: 2.6005, POS: 2.3129)

Epoch 5/5
  Train Loss     : 3.5812 (NER: 2.4190, POS: 2.3243)
  Validation Loss: 3.3269 (NER: 2.1679, POS: 2.3181)


In [None]:
# Save only the model's parameters (state_dict), not the module object.
# NOTE(review): the relative path resolves against the current working
# directory rather than the mounted Drive, and the vocab / tag mappings needed
# to rebuild the model are not saved here -- confirm both are handled elsewhere.
torch.save(model.state_dict(), "MyanmarNER_ver_1.pth")

In [None]:
from sklearn.metrics import classification_report, f1_score, accuracy_score

def evaluate_ner(model, test_loader, ner_tag_to_ix, device="cuda"):
    """Evaluate NER predictions token by token and print the metrics.

    Prints scikit-learn's classification report followed by the macro F1 and
    the token accuracy. Padding positions are excluded through the attention
    mask.

    Args:
        model: Model exposing ``forward(sentences, mask=...)`` and
            ``decode(emissions, mask=...)``.
        test_loader: Loader yielding ``(sentences, pos_tags, ner_tags, mask)``.
        ner_tag_to_ix: Mapping NER tag -> integer id, inverted here to turn
            ids back into tag strings.
        device: Device the inputs are moved to.

    Returns:
        ``(all_labels, all_preds)``: flat lists of gold and predicted tag
        strings, aligned token by token.
    """
    model.eval()
    ix_to_ner = {v: k for k, v in ner_tag_to_ix.items()}

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for sentences, _pos_tags, ner_tags, attention_mask in test_loader:
            sentences = sentences.to(device)
            attention_mask = attention_mask.to(device).bool()

            ner_emissions, _ = model(sentences, mask=attention_mask)
            predictions = model.decode(ner_emissions, mask=attention_mask)

            # One host transfer per batch instead of one .item() per sentence;
            # the gold tags are only read as Python lists, so they are not
            # moved to the device.
            lengths = attention_mask.sum(dim=1).tolist()
            gold = ner_tags.tolist()

            for pred_seq, gold_seq, true_len in zip(predictions, gold, lengths):
                all_preds.extend(ix_to_ner[p] for p in pred_seq[:true_len])
                all_labels.extend(ix_to_ner[t] for t in gold_seq[:true_len])

    print("\n🧾 NER Classification Report:")
    print(classification_report(all_labels, all_preds, digits=4, zero_division=0))

    # Summary metrics are reported as well as computed.
    macro_f1 = f1_score(all_labels, all_preds, average="macro", zero_division=0)
    acc = accuracy_score(all_labels, all_preds)
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"Accuracy: {acc:.4f}")

    return all_labels, all_preds


In [None]:
# Keep the returned label lists in variables: leaving the call as the cell's
# last expression makes the notebook echo every single token label.
test_labels, test_preds = evaluate_ner(model, test_loader, ner_tag_to_ix, device)



🧾 NER Classification Report:
              precision    recall  f1-score   support

      B-DATE     0.3019    0.2424    0.2689        66
       B-LOC     0.2748    0.2699    0.2723      1182
       B-NUM     0.0000    0.0000    0.0000        15
       B-ORG     0.0000    0.0000    0.0000        48
       B-PER     0.0000    0.0000    0.0000        34
      B-TIME     0.0000    0.0000    0.0000         9
      E-DATE     0.0407    0.3939    0.0738        66
       E-LOC     0.2653    0.4205    0.3254      1182
       E-NUM     0.0000    0.0000    0.0000        15
       E-ORG     0.0123    0.4167    0.0238        48
       E-PER     0.0000    0.0000    0.0000        34
      E-TIME     0.0040    0.5556    0.0080         9
      I-DATE     0.0000    0.0000    0.0000        38
       I-LOC     0.1419    0.4811    0.2191       503
       I-ORG     0.0000    0.0000    0.0000        39
           O     0.9674    0.4477    0.6121     21324
      S-DATE     0.4658    0.7727    0.5812        

(['O',
  'O',
  'O',
  'O',
  'S-NUM',
  'O',
  'O',
  'O',
  'B-LOC',
  'E-LOC',
  'O',
  'B-LOC',
  'I-LOC',
  'E-LOC',
  'O',
  'B-LOC',
  'E-LOC',
  'O',
  'B-LOC',
  'E-LOC',
  'O',
  'B-LOC',
  'I-LOC',
  'E-LOC',
  'O',
  'O',
  'O',
  'O',
  'S-ORG',
  'O',
  'O',
  'S-PER',
  'S-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'S-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'E-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'E-LOC',
  'O',
  'B-LOC',
  'E-LOC',
  'O',
  'B-LOC',
  'E-LOC',
  'O',
  'B-LOC',
  'E-LOC',
  'O',
  'B-LOC',
  'I-LOC',
  'E-LOC',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'I-LOC',
  'O',
  'E-LOC',
  'O',
  'B-LOC',
  'I-LOC',
  'O',

In [None]:
# Test sentence
test_sentence = ["အောင်ဆန်းစုကြည်", "သည်", "နိုင်ငံရေး", "ခေါင်းဆောင်", "ဖြစ်သည်။"]
word_ids = [vocab.get(word, vocab["<UNK>"]) for word in test_sentence]

# Convert to tensor and move to device
input_tensor = torch.tensor([word_ids], dtype=torch.long).to(device)  # Shape: [1, seq_len]
mask = torch.ones_like(input_tensor, dtype=torch.bool)                # All tokens are valid

model.eval()
with torch.no_grad():
    # forward() returns (ner_logits, pos_logits); only the NER emissions are
    # handed to the CRF decoder.
    emissions, _ = model(input_tensor, mask=mask)   # [1, seq_len, num_tags]
    best_paths = model.decode(emissions, mask)      # one tag-id list per sentence
    predicted_ids = best_paths[0]                   # Extract prediction for the sentence

# Map tag indices back to tag strings
idx2tag = {idx: tag for tag, idx in ner_tag_to_ix.items()}
predicted_tags = [idx2tag[idx] for idx in predicted_ids]

# Print results
for word, tag in zip(test_sentence, predicted_tags):
    print(f"{word}\t{tag}")


အောင်ဆန်းစုကြည်	O
သည်	O
နိုင်ငံရေး	O
ခေါင်းဆောင်	O
ဖြစ်သည်။	O


In [None]:
from collections import Counter
# Frequency of every NER tag in the training split (the saved output below is dominated by 'O').
all_tags = [tag for seq in train_data.ner_tags for tag in seq]
print(Counter(all_tags))


Counter({'O': 167547, 'B-LOC': 9395, 'E-LOC': 9395, 'I-LOC': 4015, 'S-NUM': 3882, 'S-PER': 1911, 'S-LOC': 991, 'S-DATE': 699, 'B-DATE': 599, 'E-DATE': 599, 'I-DATE': 388, 'B-ORG': 308, 'E-ORG': 308, 'B-PER': 281, 'E-PER': 281, 'I-ORG': 208, 'S-ORG': 184, 'B-NUM': 151, 'E-NUM': 151, 'B-TIME': 143, 'E-TIME': 143, 'S-TIME': 118, 'I-TIME': 92, 'I-NUM': 32, 'I-PER': 16})


END