In [None]:
import torch
from torch.utils.data import DataLoader
import numpy as np
import pickle

from models.blstm.blstm import BLSTM
from utils.data_loader import DiacritizationDataset

In [3]:
!pip install gdown --upgrade
!gdown https://drive.google.com/uc?id=1He1Zv5SkAE2BBrld50zdQO8Ap8VjluVi
!gdown https://drive.google.com/uc?id=1ALgBlWg4sE2nwIyawAMHMyLd68ddyi_5
!gdown https://drive.google.com/uc?id=1QKtfvQ-isJrqJWLFSQCN2XvN9jB5EM8-
!gdown https://drive.google.com/uc?id=1rbRye8jkI7myGMqXlUWAaxz-KP1SbKHF
!gdown https://drive.google.com/uc?id=1L4efPQpPUnZDlJxNYRpcBGEmBf4osnzg
!gdown https://drive.google.com/uc?id=1w6aSOc5FaHjPcrTqlx8gkYtxkr-InvLI

Downloading...
From (original): https://drive.google.com/uc?id=1He1Zv5SkAE2BBrld50zdQO8Ap8VjluVi
From (redirected): https://drive.google.com/uc?id=1He1Zv5SkAE2BBrld50zdQO8Ap8VjluVi&confirm=t&uuid=24509dc4-d0c3-4b53-aa2a-86f18526e174
To: /content/X_train.npy
100% 915M/915M [00:07<00:00, 119MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1ALgBlWg4sE2nwIyawAMHMyLd68ddyi_5
To: /content/X_val.npy
100% 44.5M/44.5M [00:00<00:00, 52.9MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1QKtfvQ-isJrqJWLFSQCN2XvN9jB5EM8-
From (redirected): https://drive.google.com/uc?id=1QKtfvQ-isJrqJWLFSQCN2XvN9jB5EM8-&confirm=t&uuid=421d6f48-96de-46e3-bec0-97f004dc9147
To: /content/y_train.npy
100% 915M/915M [00:07<00:00, 116MB/s]
Downloading...
From: https://drive.google.com/uc?id=1rbRye8jkI7myGMqXlUWAaxz-KP1SbKHF
To: /content/y_val.npy
100% 44.5M/44.5M [00:00<00:00, 63.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1L4efPQpPUnZDlJxNYRpcBGEmBf4osnzg
To: /content/letter2i

In [4]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [23]:
# Read the four saved arrays from the working directory, in the same order
# as before: train inputs, train labels, validation inputs, validation labels.
X_train, y_train, X_val, y_val = (
    np.load(npy_path)
    for npy_path in ('X_train.npy', 'y_train.npy', 'X_val.npy', 'y_val.npy')
)

In [24]:
# Sanity check: report the shape of every loaded array, one per line.
print(
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    f"X_val shape: {X_val.shape}",
    f"y_val shape: {y_val.shape}",
    sep="\n",
)

X_train shape: (185074, 1236)
y_train shape: (185074, 1236)
X_val shape: (9000, 1236)
y_val shape: (9000, 1236)


In [25]:
# Derive the model's input and output sizes from the saved lookup tables.
# Each file is opened in a `with` block so its handle is closed right after
# reading (the bare `open(...)` calls used before leaked both handles).
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open('letter2idx.pickle', 'rb') as letter_file:
    vocab_size = len(pickle.load(letter_file))      # number of input symbols
with open('diacritic2id.pickle', 'rb') as diacritic_file:
    num_classes = len(pickle.load(diacritic_file))  # number of output labels
print("Vocab size:", vocab_size)
print("Num classes:", num_classes)

Vocab size: 38
Num classes: 16


In [29]:
import pickle

# Read the saved letter -> index table.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open("letter2idx.pickle", "rb") as file:
    letter2idx = pickle.load(file)

# Flip the table so an index can be turned back into its letter.
idx2letter = dict(zip(letter2idx.values(), letter2idx.keys()))

# Show both directions of the mapping.
print(letter2idx)
print(idx2letter)

{'ظ': 0, 'ي': 1, 'غ': 2, 'ن': 3, 'ق': 4, 'ذ': 5, 'د': 6, 'خ': 7, 'ر': 8, 'ط': 9, 'ى': 10, 'م': 11, 'ل': 12, '<PAD>': 13, 'ت': 14, 'ج': 15, 'آ': 16, 'ا': 17, 'س': 18, 'ئ': 19, 'ع': 20, 'ف': 21, 'ص': 22, 'ه': 23, 'ز': 24, 'ك': 25, 'ش': 26, 'أ': 27, 'و': 28, 'ب': 29, 'ؤ': 30, 'ض': 31, 'ة': 32, 'ث': 33, 'ء': 34, 'ح': 35, 'إ': 36, ' ': 37}
{0: 'ظ', 1: 'ي', 2: 'غ', 3: 'ن', 4: 'ق', 5: 'ذ', 6: 'د', 7: 'خ', 8: 'ر', 9: 'ط', 10: 'ى', 11: 'م', 12: 'ل', 13: '<PAD>', 14: 'ت', 15: 'ج', 16: 'آ', 17: 'ا', 18: 'س', 19: 'ئ', 20: 'ع', 21: 'ف', 22: 'ص', 23: 'ه', 24: 'ز', 25: 'ك', 26: 'ش', 27: 'أ', 28: 'و', 29: 'ب', 30: 'ؤ', 31: 'ض', 32: 'ة', 33: 'ث', 34: 'ء', 35: 'ح', 36: 'إ', 37: ' '}


In [30]:
# Wrap each (inputs, labels) pair in a DiacritizationDataset; both splits
# share the same index -> letter table. Train is built first, then validation.
train_dataset, val_dataset = (
    DiacritizationDataset(inputs, labels, idx2letter)
    for inputs, labels in ((X_train, y_train), (X_val, y_val))
)

In [31]:
# Number of sequences per batch, shared by both loaders.
batch_size = 16

# Training batches are reshuffled each epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)

# Validation batches keep the dataset order.
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

In [32]:
# Build the network from the vocabulary and label counts, then place it on
# the selected device.
model = BLSTM(vocab_size, num_classes)
model = model.to(device)

In [33]:
# Restore the trained weights, mapping the stored tensors onto the active device.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(
    torch.load("blstm_model.pth", map_location=device)
)
# Keep the model on the device and switch to inference mode; as the last
# expression of the cell, this also displays the module summary.
model.to(device).eval()

BLSTM(
  (embedding): Embedding(38, 128, padding_idx=13)
  (bilstm): LSTM(128, 256, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=16, bias=True)
)

In [34]:
@torch.no_grad()
def evaluate_full_sequence(model, data_loader, pad_idx=15, device=None):
    """Return the accuracy over every label position that is not padding.

    Args:
        model: Module whose call ``model(batch_X)`` returns per-position
            scores; the prediction is the argmax over the last dimension.
        data_loader: Iterable yielding ``(batch_X, batch_y, batch_mask)``
            triples. The third element is not used by this metric.
        pad_idx: Label value to exclude from scoring. Defaults to 15, the
            value that was previously hard-coded here as the padding label.
        device: Device the batches are moved to before the forward pass.
            When ``None``, the device of the model's first parameter is used
            (CPU if the model has no parameters), so the function no longer
            depends on a notebook-level ``device`` variable.

    Returns:
        Correct predictions divided by scored positions, or 0 when no
        position was scored (for example, an empty loader).
    """
    model.eval()

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    total_correct = 0
    total_tokens = 0

    for batch_X, batch_y, _ in data_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        preds = model(batch_X).argmax(dim=-1)

        # Score only the positions whose label is not the padding value.
        mask = batch_y != pad_idx
        total_correct += (preds[mask] == batch_y[mask]).sum().item()
        total_tokens += mask.sum().item()

    return total_correct / total_tokens if total_tokens > 0 else 0


@torch.no_grad()
def evaluate_last_char_accuracy(model, data_loader, device=None):
    """Return the accuracy over the positions flagged by each batch mask.

    Args:
        model: Module whose call ``model(batch_X)`` returns per-position
            scores; the prediction is the argmax over the last dimension.
        data_loader: Iterable yielding ``(batch_X, batch_y, batch_mask)``
            triples. Positions where ``batch_mask == 1`` are scored; per the
            original comment these mark the last character of each word.
        device: Device the batches are moved to before the forward pass.
            When ``None``, the device of the model's first parameter is used
            (CPU if the model has no parameters), so the function no longer
            depends on a notebook-level ``device`` variable.

    Returns:
        Correct predictions divided by flagged positions, or 0 when no
        position was flagged (for example, an empty loader).
    """
    model.eval()

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    total_correct = 0
    total_important = 0

    for batch_X, batch_y, batch_mask in data_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        batch_mask = batch_mask.to(device)

        preds = model(batch_X).argmax(dim=-1)

        # Keep only the flagged positions.
        mask = batch_mask == 1
        total_correct += (preds[mask] == batch_y[mask]).sum().item()
        total_important += mask.sum().item()

    return total_correct / total_important if total_important > 0 else 0

In [35]:
# Score the validation loader with both metrics (full-sequence first, then
# last-character), then report each to four decimal places.
full_acc, last_char_acc = (
    evaluate_full_sequence(model, val_loader),
    evaluate_last_char_accuracy(model, val_loader),
)

print(
    f"Full sequence accuracy: {full_acc:.4f}",
    f"Last char accuracy: {last_char_acc:.4f}",
    sep="\n",
)

Full sequence accuracy: 0.9584
Last char accuracy: 0.9346
