In [1]:
# Mount Google Drive at /content/drive (Colab-only); the dataset paths used
# by this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
import numpy as np
import torch.optim as optim
import pandas as pd
import math

In [3]:
class CustomDataset(Dataset):
    """Sentence-level NER dataset read from a tab-separated CoNLL-style file.

    File format, as consumed by ``load_data``:
      * every non-blank line holds at least three tab-separated columns;
        column 0 is the token and column 2 is its NER tag (column 1 and any
        columns after the third are ignored);
      * a blank line ends the current sentence;
      * non-blank lines with fewer than three columns are skipped.

    Parameters:
    - filepath: Path of the UTF-8 encoded data file
    - word2idx: Mapping token -> id; must contain the key "<UNK>"
    - tag2idx: Mapping tag -> id; must contain the key "<UNK>"
    """

    def __init__(self, filepath, word2idx, tag2idx):
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        self.sentences, self.ner_tags = self.load_data(filepath)

    def load_data(self, filepath):
        """Parse ``filepath`` and return ``(sentences, ner_tags)``.

        Both return values are lists of lists of strings, aligned so that
        ``ner_tags[i][j]`` is the tag of ``sentences[i][j]``.
        """
        sentences, ner_tags = [], []
        sentence, ner_tag = [], []
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    parts = line.split("\t")
                    if len(parts) >= 3:
                        # Index the columns instead of 3-way unpacking: the
                        # guard admits rows with more than three columns, and
                        # unpacking those would raise ValueError.
                        sentence.append(parts[0])
                        ner_tag.append(parts[2])
                elif sentence:
                    # Blank line: close the sentence collected so far.
                    sentences.append(sentence)
                    ner_tags.append(ner_tag)
                    sentence, ner_tag = [], []
            # In case the file does not end with a blank line
            if sentence:
                sentences.append(sentence)
                ner_tags.append(ner_tag)
        return sentences, ner_tags

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(word_ids, tag_ids)`` for sentence ``idx`` as two lists of ints.

        Tokens or tags missing from the mappings fall back to the "<UNK>" id.
        """
        word_ids = [self.word2idx.get(w, self.word2idx["<UNK>"]) for w in self.sentences[idx]]
        tag_ids = [self.tag2idx.get(t, self.tag2idx["<UNK>"]) for t in self.ner_tags[idx]]
        return word_ids, tag_ids

def collate_fn(batch):
    """Right-pad a batch of ``(word_ids, tag_ids)`` pairs and turn it into tensors.

    Every sequence is extended with 0 up to the length of the longest word-id
    sequence in the batch. Returns two ``torch.long`` tensors
    ``(padded_word_ids, padded_tag_ids)``.
    """
    word_seqs, tag_seqs = zip(*batch)
    target_len = max(map(len, word_seqs))
    pad_id = 0  # word2idx["<PAD>"]; the same value is used to pad the tags

    def _right_pad(seq):
        # Append as many pad ids as needed to reach the batch-wide length.
        return seq + [pad_id] * (target_len - len(seq))

    word_rows = [_right_pad(seq) for seq in word_seqs]
    tag_rows = [_right_pad(seq) for seq in tag_seqs]

    word_batch = torch.tensor(word_rows, dtype=torch.long)
    tag_batch = torch.tensor(tag_rows, dtype=torch.long)
    return word_batch, tag_batch


In [4]:
def extract_unique_tags(filepath):
    """Return the sorted list of distinct tags found in column 2 of ``filepath``.

    The file is read as UTF-8, one tab-separated row per line. Blank lines and
    rows with fewer than three columns contribute nothing.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        # Split only the non-blank lines, after trimming surrounding whitespace.
        rows = (raw.strip().split("\t") for raw in handle if raw.strip())
        unique_tags = {cols[2] for cols in rows if len(cols) >= 3}
    return sorted(unique_tags)


In [5]:
# Locations of the train / validation / test splits (CoNLL-style .conll files)
# on the mounted Google Drive. These are absolute Colab paths; adjust them when
# running elsewhere.
train_path = "/content/drive/MyDrive/Datasets/train_v5.conll"
val_path = "/content/drive/MyDrive/Datasets/val_v5.conll"
test_path = "/content/drive/MyDrive/Datasets/test_v5.conll"

Load Dataset

In [7]:
# #Force reinstall compatible versions
# !pip install gensim
# !pip install numpy==1.24.3 --force-reinstall

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [6]:
from gensim.models import KeyedVectors
# Load the pretrained fastText vectors from their text-format (word2vec-style .vec) file.
fasttext_model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Datasets/cc.my.300.vec', binary=False)
# Source: https://fasttext.cc/docs/en/crawl-vectors.html (choose Burmese, then download the text .vec file)

In [7]:
# Step 1: Start with PAD and UNK
# Id 0 is reserved for padding and id 1 for unknown entries in both mappings;
# collate_fn pads words and tags with 0, so "<PAD>" must stay at index 0.
word2idx = {"<PAD>": 0, "<UNK>": 1}
tag2idx = {"<PAD>": 0, "<UNK>": 1}

# Step 2: Load raw tokens from file before creating dataset
def build_vocab_from_file(filepath, word2idx, tag2idx):
    """Collect the tokens of one data file and register its tags in ``tag2idx``.

    Parameters:
    - filepath: UTF-8 file with one tab-separated row per line; column 0 is
      the token and column 2 the tag. Blank lines and rows with fewer than
      three columns are skipped.
    - word2idx: Accepted for signature compatibility; not read or modified here.
    - tag2idx: Mapping tag -> id, updated IN PLACE: each unseen tag receives
      the next free id (``len(tag2idx)``) in order of first appearance.

    Returns:
    - The set of distinct tokens found in the file.
    """
    word_set = set()
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split("\t")
            if len(parts) < 3:
                continue
            # Index the columns rather than 3-way unpacking: rows may carry
            # more than three columns, which unpacking would reject with
            # ValueError even though the length guard lets them through.
            word, tag = parts[0], parts[2]
            word_set.add(word)
            if tag not in tag2idx:
                tag2idx[tag] = len(tag2idx)
    return word_set

# Build vocab from all datasets
# NOTE: validation and test tokens are included on purpose here, so no token of
# those splits is mapped to <UNK> at evaluation time.
vocab = set()
for path in [train_path, val_path, test_path]:
    vocab |= build_vocab_from_file(path, word2idx, tag2idx)

# Step 3: Prepare embedding matrix
# Row i of the matrix is the vector of the word whose id is i.
embedding_dim = 300
embedding_matrix = []
embedding_matrix.append(np.zeros(embedding_dim))  # PAD
embedding_matrix.append(np.random.uniform(-0.25, 0.25, embedding_dim))  # UNK

# Iterate in sorted order: set iteration order for strings changes between
# interpreter runs, which would give every word a different id each time and
# make a saved checkpoint unusable with a freshly rebuilt vocabulary.
for word in sorted(vocab):
    if word in word2idx:
        # A corpus token identical to "<PAD>"/"<UNK>" must not steal the
        # reserved id, otherwise ids and matrix rows would go out of sync.
        continue
    word2idx[word] = len(word2idx)
    if word in fasttext_model:
        embedding_matrix.append(fasttext_model[word])
    else:
        # Out-of-vocabulary for fastText: small random initialisation.
        embedding_matrix.append(np.random.uniform(-0.25, 0.25, embedding_dim))

# Stack into one ndarray first; building a tensor from a Python list of
# ndarrays is extremely slow and triggers a PyTorch UserWarning.
embedding_matrix = torch.tensor(np.stack(embedding_matrix), dtype=torch.float)

# Step 4: With word2idx / tag2idx complete, wrap each split in a dataset
train_data = CustomDataset(filepath=train_path, word2idx=word2idx, tag2idx=tag2idx)
val_data = CustomDataset(filepath=val_path, word2idx=word2idx, tag2idx=tag2idx)
test_data = CustomDataset(filepath=test_path, word2idx=word2idx, tag2idx=tag2idx)

# Batches of 32 sentences, padded per batch by collate_fn.
# Only the training loader shuffles; evaluation order stays fixed.
train_load = DataLoader(dataset=train_data, batch_size=32, collate_fn=collate_fn, shuffle=True)
val_load = DataLoader(dataset=val_data, batch_size=32, collate_fn=collate_fn, shuffle=False)
test_load = DataLoader(dataset=test_data, batch_size=32, collate_fn=collate_fn, shuffle=False)


  embedding_matrix = torch.tensor(embedding_matrix).float()


In [8]:
from collections import Counter

# Flatten the per-sentence tag lists of the training split into a single list of tags
all_tags = [tag for seq in train_data.ner_tags for tag in seq]

# Count the frequency of each tag (string labels, not ids)
tag_counts = Counter(all_tags)

# Display the counts, one "tag: count" line per tag, in order of first appearance
for tag, count in tag_counts.items():
    print(f"{tag}: {count}")


B-LOC: 9395
I-LOC: 4015
E-LOC: 9395
O: 167547
S-NUM: 3882
B-DATE: 599
I-DATE: 388
E-DATE: 599
S-PER: 1911
S-LOC: 991
S-DATE: 699
B-ORG: 308
E-ORG: 308
S-ORG: 184
I-ORG: 208
B-PER: 281
E-PER: 281
B-TIME: 143
E-TIME: 143
B-NUM: 151
E-NUM: 151
S-TIME: 118
I-TIME: 92
I-NUM: 32
I-PER: 16


In [9]:
# Sanity check: pull a single batch from the training loader and show the
# tensor shapes. Both prints show the full shape (the first label says
# "max len", but the whole torch.Size is printed).
for batch in train_load:
    sentences, tags = batch
    print("Batch max len:", sentences.shape)
    print("tags:", tags.shape)
    break  # one batch is enough


Batch max len: torch.Size([32, 34])
tags: torch.Size([32, 34])


In [32]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a batch of embeddings.

    Even feature columns hold ``sin(pos * w_k)`` and odd columns hold
    ``cos(pos * w_k)`` with ``w_k = exp(-2k * ln(10000) / d_model)``.
    The table is stored as the non-trainable buffer ``pe`` of shape
    ``[max_len, d_model]``.

    Parameters:
    - max_len: Largest sequence length the table covers
    - d_model: Feature dimension (even or odd)
    """
    def __init__(self,max_len,d_model):
        super().__init__()
        pe = torch.zeros(max_len,d_model)
        position = torch.arange(max_len, dtype=torch.float).unsqueeze(1) # shape[max_len,1]
        div_term = torch.exp((torch.arange(0, d_model, 2, dtype=torch.float) *-(math.log(10000.0) / d_model)))
        angles = position * div_term  # shape[max_len, ceil(d_model / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # the cosine part must drop the last frequency to match the slice.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)  # Register the positional encoding tensor as a buffer

    def forward(self,x):
        """Add the encoding to ``x`` of shape ``[batch_size, seq_len, d_model]``.

        Raises RuntimeError when ``seq_len`` exceeds the table length.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(0):
            # Fail with a clear message instead of a shape-mismatch error (or
            # a silent wrong broadcast when the table has a single row).
            raise RuntimeError(
                f"Sequence length {seq_len} exceeds positional encoding max_len {self.pe.size(0)}"
            )
        x = x + self.pe[:seq_len] # Broadcast the positional encoding over the batch
        return x

In [42]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FeedForward(nn.Module):
    """
    Position-wise two-layer MLP: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Parameters:
    - dim: Size of the input and output features
    - expension_factor: Multiplier giving the hidden width (dim * expension_factor)
    - dropout: Dropout probability used after each linear stage
    """
    def __init__(self, dim, expension_factor, dropout):
        super().__init__()
        inner_dim = dim * expension_factor
        self.fc1 = nn.Linear(dim, inner_dim)
        self.fc2 = nn.Linear(inner_dim, dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        # Expand, apply the non-linearity, regularise ...
        hidden = self.fc1(x)
        hidden = F.gelu(hidden)
        hidden = self.dropout1(hidden)
        # ... then project back to the input width and regularise again.
        projected = self.fc2(hidden)
        return self.dropout2(projected)

class Fourier(nn.Module):
    """
    Parameter-free mixing layer: a 1-D FFT over the last dimension followed by
    a 1-D FFT over dimension 1, keeping only the real part of the result.

    The transform runs in float64; the output is passed through ReLU, cast
    back to float32 and then through dropout.

    Parameters:
    - dropout: Dropout probability
    """
    def __init__(self, dropout=0.3):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.act = nn.ReLU()

    def forward(self, x):
        # Transform in double precision: first across the last axis, then across axis 1.
        spectrum = torch.fft.fft(x.to(dtype=torch.float64), dim=-1)
        spectrum = torch.fft.fft(spectrum, dim=1)
        # The imaginary component is discarded before the activation.
        mixed = self.act(spectrum.real).to(dtype=torch.float32)
        return self.dropout(mixed)

class FNet(nn.Module):
    """
    Single FNet-style encoder block on top of a pretrained embedding layer.

    Pipeline: embedding -> positional encoding -> Fourier mixing (+ residual,
    LayerNorm) -> FeedForward (+ residual, LayerNorm).

    Parameters:
    - dim: Feature dimension; must equal the embedding width
    - expension_factor: Expansion factor for the FeedForward layer
    - dropout: Dropout probability for regularization
    - embedding_matrix: Float tensor [vocab_size, dim] used to initialise the
      embedding layer; the layer is trainable (fine-tuned)
    - max_len: Longest sequence the positional encoding supports (default 512)
    """
    def __init__(self, dim, expension_factor, dropout,embedding_matrix, max_len=512):

        super(FNet, self).__init__()
        # from_pretrained uses the tensor it is given as the weight itself, so
        # pass a copy: otherwise training on CPU would update the caller's
        # embedding_matrix in place and re-creating the model would start from
        # already fine-tuned vectors.
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix.clone(), freeze=False)  # trainable, not frozen
        self.pos_en = PositionalEncoding(max_len, d_model = dim)
        self.fourier = Fourier(dropout)
        self.ffn = FeedForward(dim, expension_factor, dropout)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

    def forward(self, x):
        """Map token ids ``[batch_size, seq_len]`` to features ``[batch_size, seq_len, dim]``."""
        x = self.embedding(x)
        x = self.pos_en(x)
        # Mixing sub-layer with residual connection and post-normalisation
        residual = x
        x = self.fourier(x)
        x = self.norm1(x + residual)
        # Feed-forward sub-layer with residual connection and post-normalisation
        residual = x
        x = self.ffn(x)
        out = self.norm2(x + residual)
        return out

# Final NER model with classification head
class FNetNER(nn.Module):
    """
    Token classifier: an FNet encoder followed by a per-token linear layer.

    Parameters:
    - embedding_matrix: Float tensor [vocab_size, dim]; its second dimension
      sets the model width
    - num_tags: Number of output classes per token
    - expension_factor: Expansion factor forwarded to the encoder's FeedForward
    - dropout: Dropout probability forwarded to the encoder
    """
    def __init__(self, embedding_matrix, num_tags, expension_factor=4, dropout=0.3):
        super(FNetNER, self).__init__()
        dim = embedding_matrix.shape[1]
        # Build the encoder exactly once (it was previously constructed twice,
        # with the first instance immediately discarded).
        self.fnet = FNet(dim, expension_factor, dropout, embedding_matrix)
        self.classifier = nn.Linear(dim, num_tags)

    def forward(self, x):
        """Return unnormalised tag scores for token ids ``x`` of shape [batch_size, seq_len]."""
        x = self.fnet(x)  # [batch_size, seq_len, dim]
        logits = self.classifier(x)  # [batch_size, seq_len, num_tags]
        return logits


In [43]:
# Sanity check: the number of embedding rows must equal the vocabulary size,
# and the tag count gives the classifier's output width.
print(embedding_matrix.shape)
print(len(word2idx))
print((len(tag2idx)))

torch.Size([19304, 300])
19304
27


In [44]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output class per entry of tag2idx (this includes the <PAD> and <UNK> ids).
model = FNetNER(embedding_matrix, num_tags=len(tag2idx), expension_factor=2, dropout=0.3).to(device)
# Adam over all model parameters, with L2 weight decay.
optimizer = optim.Adam(model.parameters(), lr =3e-4,weight_decay=1e-5)


In [45]:
from sklearn.utils.class_weight import compute_class_weight
# Flatten every training tag sequence into one list of tag strings.
labels = []
for tag_seq in train_data.ner_tags:
    labels.extend(tag_seq)


# Convert tags to ids and compute 'balanced' weights for the classes that
# actually occur in the training labels.
label_ids = [tag2idx[tag] for tag in labels]
present_classes = np.unique(label_ids)
present_weights = compute_class_weight(class_weight='balanced', classes=present_classes, y=label_ids)

# Classes that never occur in the training labels keep the neutral weight 1.0.
full_weights = np.ones(len(tag2idx))
for i, cls in enumerate(present_classes):
    full_weights[cls] = present_weights[i]

weights = torch.tensor(full_weights, dtype=torch.float).to(device)

# Define loss: class-weighted cross entropy; positions labelled <PAD> are ignored
criterion = nn.CrossEntropyLoss(ignore_index=tag2idx["<PAD>"], weight=weights)


In [46]:
# Training loop: one pass over train_load followed by one pass over val_load
# per epoch; the mean per-batch loss of each pass is printed.
num_epochs = 50

for epoch in range(num_epochs):
    total_loss = 0
    model.train()  # enable dropout
    for sentences, tags in train_load:
        sentences, tags = sentences.to(device), tags.to(device)

        optimizer.zero_grad()
        outputs = model(sentences)

        # Flatten batch and sequence dimensions so the loss sees one row per token.
        outputs = outputs.view(-1, outputs.shape[-1])  # [batch_size * seq_len, num_tags]
        tags = tags.view(-1)                           # [batch_size * seq_len]

        loss = criterion(outputs, tags)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_load)  # mean over batches

    val_loss = 0
    model.eval()  # disable dropout for validation
    with torch.no_grad():
        for sentences, tags in val_load:
            sentences, tags = sentences.to(device), tags.to(device)
            outputs = model(sentences)

            outputs = outputs.view(-1, outputs.shape[-1])
            tags = tags.view(-1)
            loss = criterion(outputs, tags)
            val_loss += loss.item()
    avg_val_loss = val_loss / len(val_load)  # mean over batches
    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}")


Epoch 1/10 - Train Loss: 2.9270 - Val Loss: 2.5941
Epoch 2/10 - Train Loss: 2.6011 - Val Loss: 2.6141
Epoch 3/10 - Train Loss: 2.3805 - Val Loss: 2.5009
Epoch 4/10 - Train Loss: 2.2723 - Val Loss: 2.3990
Epoch 5/10 - Train Loss: 2.1743 - Val Loss: 2.4142
Epoch 6/10 - Train Loss: 2.0801 - Val Loss: 2.3069
Epoch 7/10 - Train Loss: 1.9978 - Val Loss: 2.3925
Epoch 8/10 - Train Loss: 1.8933 - Val Loss: 2.2898
Epoch 9/10 - Train Loss: 1.8610 - Val Loss: 2.3239
Epoch 10/10 - Train Loss: 1.7547 - Val Loss: 2.2332


In [49]:
# Test case: tag one hand-tokenised sentence with the trained model
test_sentence = ["ကြက်ခြေနီ","မှ", "ပြော","ရေး","ဆို","ခွင့်","ရှိ","သူ", "MattCochrane","သည်။"]
# Tokens that are missing from word2idx fall back to the <UNK> id.
word_ids = [word2idx.get(word, word2idx["<UNK>"]) for word in test_sentence]
input_tensor = torch.tensor([word_ids], dtype=torch.long).to(device)  # Shape: [1, seq_len]
model.eval()  # disable dropout for inference
with torch.no_grad():
    output = model(input_tensor)  # [1, seq_len, num_tags]
    predicted_ids = torch.argmax(output, dim=-1).squeeze(0).tolist()  # Remove batch dim
# Invert the tag2idx dictionary (id -> tag string)
idx2tag = {idx: tag for tag, idx in tag2idx.items()}
predicted_tags = [idx2tag[idx] for idx in predicted_ids]
# Print one "token<TAB>predicted tag" line per input token
for word, tag in zip(test_sentence, predicted_tags):
    print(f"{word}\t{tag}")


ကြက်ခြေနီ	B-DATE
မှ	E-DATE
ပြော	S-PER
ရေး	B-DATE
ဆို	E-DATE
ခွင့်	E-DATE
ရှိ	E-DATE
သူ	B-ORG
MattCochrane	E-DATE
သည်။	O


In [39]:
# Save only the model's parameters (its state_dict), not the module object itself.
# The relative path means the file is written to the current working directory.
torch.save(model.state_dict(), "ner_model.pth")

In [48]:
from sklearn.metrics import classification_report

def evaluate_model(model, dataloader, idx2tag, pad_idx, device):
    """Print a per-tag classification report for ``model`` over ``dataloader``.

    Parameters:
    - model: Network mapping token ids [batch, seq_len] to logits [batch, seq_len, num_tags]
    - dataloader: Iterable of (inputs, labels) id tensors
    - idx2tag: Mapping id -> tag string, applied to both labels and predictions
    - pad_idx: Input id marking padding; those positions are excluded
    - device: Device the batches are moved to

    Nothing is returned; the report is printed with four decimal digits.
    """
    model.eval()
    gold_tags, predicted_tags = [], []

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Positions holding a real (non-padding) token
            keep = inputs.ne(pad_idx)
            preds = model(inputs).argmax(dim=-1)

            # Boolean indexing walks the batch row by row, so gold and
            # predicted tags stay aligned and in sentence order.
            gold_tags += [idx2tag[t] for t in labels[keep].cpu().tolist()]
            predicted_tags += [idx2tag[p] for p in preds[keep].cpu().tolist()]

    print(classification_report(gold_tags, predicted_tags, digits=4))


# Evaluate on the held-out test split. idx2tag must already exist: it is
# created in the single-sentence test cell by inverting tag2idx.
pad_idx = word2idx["<PAD>"]
evaluate_model(model, test_load, idx2tag, pad_idx, device)


              precision    recall  f1-score   support

      B-DATE     0.0301    0.2273    0.0532        66
       B-LOC     0.4880    0.4628    0.4750      1182
       B-NUM     0.0080    0.1333    0.0152        15
       B-ORG     0.0177    0.1458    0.0316        48
       B-PER     0.0097    0.0882    0.0174        34
      B-TIME     0.0110    0.3333    0.0214         9
      E-DATE     0.0295    0.3182    0.0539        66
       E-LOC     0.2705    0.5118    0.3539      1182
       E-NUM     0.0000    0.0000    0.0000        15
       E-ORG     0.0101    0.1042    0.0185        48
       E-PER     0.0158    0.3824    0.0304        34
      E-TIME     0.0039    0.6667    0.0078         9
      I-DATE     0.0000    0.0000    0.0000        38
       I-LOC     0.2410    0.8131    0.3718       503
       I-NUM     0.0000    0.0000    0.0000         0
       I-ORG     0.0019    0.0256    0.0036        39
       I-PER     0.0000    0.0000    0.0000         0
      I-TIME     0.0000    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


END