In [5]:
# Mount Google Drive into the Colab filesystem; the dataset and embedding
# files referenced later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
import numpy as np

In [7]:
class CustomDataset(Dataset):
    """
    Sentence-level NER dataset read from a tab-separated CoNLL-style file.

    Every non-blank line is expected to carry at least three tab-separated
    columns: the first is taken as the word and the third as the NER tag.
    Blank lines separate sentences.

    Parameters:
    - filepath: Path of the file to read (opened as UTF-8)
    - word2idx: Mapping word -> id; must contain the key "<UNK>"
    - tag2idx: Mapping tag -> id; must contain the key "<UNK>"

    Both mappings are stored by reference and looked up on every
    __getitem__ call, so entries added to them after construction are
    visible to the dataset.
    """
    def __init__(self, filepath, word2idx, tag2idx):
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        self.sentences, self.ner_tags = self.load_data(filepath)

    def load_data(self, filepath):
        """
        Parse `filepath` into parallel lists of sentences and tag sequences.

        Returns:
        - sentences: list of sentences, each a list of word strings
        - ner_tags: list of tag sequences aligned with `sentences`
        """
        sentences, ner_tags = [], []
        sentence, ner_tag = [], []
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    parts = line.split("\t")
                    # Lines with fewer than three columns are skipped.
                    if len(parts) >= 3:
                        # Index instead of unpacking: the guard above admits
                        # lines with more than three columns, which a
                        # three-name unpack would reject with ValueError.
                        sentence.append(parts[0])
                        ner_tag.append(parts[2])
                else:
                    # Blank line: close the sentence collected so far.
                    if sentence:
                        sentences.append(sentence)
                        ner_tags.append(ner_tag)
                        sentence, ner_tag = [], []
            # Flush the last sentence when the file does not end with a blank line
            if sentence:
                sentences.append(sentence)
                ner_tags.append(ner_tag)
        return sentences, ner_tags

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """
        Return (word_ids, tag_ids) for sentence `idx` as two lists of ints.

        Words and tags missing from the mappings fall back to the "<UNK>" id.
        """
        word_ids = [self.word2idx.get(w, self.word2idx["<UNK>"]) for w in self.sentences[idx]]
        tag_ids = [self.tag2idx.get(t, self.tag2idx["<UNK>"]) for t in self.ner_tags[idx]]
        return word_ids, tag_ids

def collate_fn(batch):
    """
    Pad a batch of (word_ids, tag_ids) pairs to a common length.

    Every sequence is right-padded with 0 up to the length of the longest
    word-id sequence in the batch.

    Returns:
    - Two torch.long tensors (padded word ids, padded tag ids), one row per
      batch item.
    """
    pad_token = 0  # word2idx["<PAD>"]
    word_seqs, tag_seqs = zip(*batch)
    target_len = max(map(len, word_seqs))

    def _pad(seq):
        # Right-pad one sequence up to the batch-wide target length.
        return seq + [pad_token] * (target_len - len(seq))

    word_tensor = torch.tensor([_pad(seq) for seq in word_seqs], dtype=torch.long)
    tag_tensor = torch.tensor([_pad(seq) for seq in tag_seqs], dtype=torch.long)
    return word_tensor, tag_tensor


Load Dataset

In [8]:
train_path = "/content/drive/MyDrive/Datasets/train_v5.conll"
val_path = "/content/drive/MyDrive/Datasets/val_v5.conll"
test_path = "/content/drive/MyDrive/Datasets/test_v5.conll"

# Both mappings start with the special tokens only. The datasets keep a
# reference to these exact dict objects and look ids up lazily in
# __getitem__, so entries added later are picked up without rebuilding them.
# word2idx is extended further down, when the embedding matrix is built.
word2idx = {"<PAD>": 0, "<UNK>": 1}
tag2idx = {"<PAD>": 0, "<UNK>": 1}
train_data = CustomDataset(train_path,word2idx, tag2idx)
val_data = CustomDataset(val_path,word2idx, tag2idx)
test_data = CustomDataset(test_path,word2idx, tag2idx)

# Register every NER tag seen in the training split. Without this step every
# label would fall back to the "<UNK>" id. Sorting keeps the tag -> id
# assignment identical from run to run.
for tag in sorted({t for sentence_tags in train_data.ner_tags for t in sentence_tags}):
    tag2idx.setdefault(tag, len(tag2idx))

train_load = DataLoader(train_data, batch_size=32, collate_fn=collate_fn)


In [9]:
# Pull only the first batch and show the padded tensor shapes as a sanity check.
batch = next(iter(train_load), None)
if batch is not None:
    sentences, tags = batch
    print("Batch max len:", sentences.shape)
    print("tags:", tags.shape)


Batch max len: torch.Size([32, 41])
tags: torch.Size([32, 41])


In [10]:
# #Force reinstall compatible versions
# !pip install gensim
# !pip install numpy==1.24.3 --force-reinstall

In [11]:
from gensim.models import KeyedVectors
# Load the pretrained fastText word vectors from Drive; binary=False because
# the .vec file is in the plain-text word2vec format.
fasttext_model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Datasets/cc.my.300.vec', binary=False)
# Source: https://fasttext.cc/docs/en/crawl-vectors.html (select Burmese and download the text .vec file)

In [12]:
def build_vocab(dataset):
    """
    Collect the distinct words of a dataset.

    Parameters:
    - dataset: Object exposing `sentences`, an iterable of word lists

    Returns:
    - A set containing every word that occurs in any sentence
    """
    return {token for tokens in dataset.sentences for token in tokens}

vocab = build_vocab(train_data)
vocab_size = len(vocab)  # distinct training words; <PAD>/<UNK> are not counted
embedding_dim = 300

# Dedicated seeded generator so the randomly initialised rows are the same on
# every run.
rng = np.random.default_rng(0)

# Rows 0 and 1 match the ids already reserved in word2idx.
embedding_rows = [
    np.zeros(embedding_dim, dtype=np.float32),  # <PAD>
    rng.uniform(-0.25, 0.25, embedding_dim).astype(np.float32),  # <UNK>
]

# Iterate in sorted order: set iteration order for strings changes between
# interpreter sessions, which would otherwise reshuffle the word ids.
for word in sorted(vocab):
    # The id is the row about to be appended. Deriving it from the row list
    # (not from len(word2idx)) keeps this cell safe to re-run.
    word2idx[word] = len(embedding_rows)
    if word in fasttext_model:
        embedding_rows.append(np.asarray(fasttext_model[word], dtype=np.float32))
    else:
        # No pretrained vector available: small random initialisation.
        embedding_rows.append(rng.uniform(-0.25, 0.25, embedding_dim).astype(np.float32))

# Stack into a single ndarray first (fast path) and keep float32 so the matrix
# matches the default dtype of the model's layers.
embedding_matrix = torch.tensor(np.stack(embedding_rows), dtype=torch.float32)

  embedding_matrix = torch.tensor(embedding_matrix)


In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FeedForward(nn.Module):
    """
    Two-layer position-wise MLP: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Parameters:
    - dim: Size of the input and output feature dimension
    - expension_factor: Multiplier giving the hidden width (dim * expension_factor)
    - dropout: Dropout probability used after each linear layer
    """
    def __init__(self, dim, expension_factor, dropout):
        super().__init__()
        inner_dim = dim * expension_factor
        self.fc1 = nn.Linear(dim, inner_dim)
        self.fc2 = nn.Linear(inner_dim, dim)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        # Expand, apply the non-linearity, then regularise.
        hidden = self.fc1(x)
        hidden = F.gelu(hidden)
        hidden = self.dropout1(hidden)
        # Project back to `dim` features and regularise once more.
        projected = self.fc2(hidden)
        return self.dropout2(projected)

class Fourier(nn.Module):
    """
    Token-mixing layer built from two successive 1-D FFTs.

    The input is transformed along its last dimension and then along
    dimension 1; the real part of the result goes through ReLU and dropout.

    Parameters:
    - dropout: Dropout probability
    """
    def __init__(self, dropout=0.3):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.act = nn.ReLU()

    def forward(self, x):
        # FFT over the last dimension first, then over dimension 1.
        spectrum = torch.fft.fft(torch.fft.fft(x, dim=-1), dim=1)
        # The imaginary component is discarded; only the real part is kept.
        return self.dropout(self.act(spectrum.real))

class FNet(nn.Module):
    """
    Single FNet-style encoder block on top of a pretrained embedding table.

    Token ids are embedded, mixed by the Fourier layer, and passed through a
    FeedForward layer; each sub-layer is wrapped in a residual connection
    followed by LayerNorm.

    Parameters:
    - dim: Feature dimension of the block. It must equal the width of
      `embedding_matrix` (its second dimension), because LayerNorm and the
      FeedForward layers are applied directly to the embedded tokens.
    - expension_factor: Expansion factor for FeedForward layers
    - dropout: Dropout probability for regularization
    - embedding_matrix: 2-D array/tensor of pretrained embeddings with one
      row per token id; it is converted to float32
    """
    def __init__(self, dim, expension_factor, dropout,embedding_matrix):

        super(FNet, self).__init__()
        # Coerce to float32 so the embeddings share a dtype with the LayerNorm
        # and Linear parameters; a float64 matrix would otherwise fail in
        # forward() with a dtype mismatch. No copy is made when the input is
        # already a float32 tensor.
        pretrained = torch.as_tensor(embedding_matrix, dtype=torch.float32)
        # freeze=False: the pretrained embeddings are fine-tuned during training.
        self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=False)
        self.fourier = Fourier(dropout)
        self.ffn = FeedForward(dim, expension_factor, dropout)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)

    def forward(self, x):
        """
        Encode a batch of token ids.

        Parameters:
        - x: Integer tensor of token ids (assumed shape (batch, seq_len), as
          produced by collate_fn)

        Returns:
        - Tensor with one `dim`-sized feature vector per input token
        """
        x = self.embedding(x)
        residual = x
        x = self.fourier(x)  # Apply Fourier transformation
        x = self.norm1(x + residual)  # Apply LayerNorm with residual
        residual = x
        x = self.ffn(x)  # Apply FeedForward network
        out = self.norm2(x + residual)  # Apply LayerNorm with residual
        return out


In [17]:
# Distinct words found in the training sentences (len(vocab)); the separately
# added <PAD>/<UNK> embedding rows are not included in this count.
vocab_size

16734

In [19]:
# `dim` is the per-token feature size, i.e. the embedding width (the second
# dimension of embedding_matrix), not the vocabulary size: LayerNorm and the
# feed-forward layers operate on the embedded vectors.
model = FNet(dim=embedding_matrix.shape[1],expension_factor=4,dropout=0.3,embedding_matrix=embedding_matrix)

In [None]:
# model summary