In [1]:
import sys
from pathlib import Path

# Make the repository root importable. The root is taken to be two directory
# levels above the current working directory.
project_root = Path.cwd().parent.parent
if all(entry != str(project_root) for entry in sys.path):
    # Put it first so project packages win over same-named installed ones.
    sys.path.insert(0, str(project_root))

print(f"Project root added to path: {project_root}")
print(f"Current working directory: {Path.cwd()}")

Project root added to path: /home/mohamed-ashraf/Desktop/projects/Arabic-Diacritization
Current working directory: /home/mohamed-ashraf/Desktop/projects/Arabic-Diacritization/models/blstm


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

import pickle
import matplotlib.pyplot as plt

from tqdm import tqdm
from utils.utils import create_data_pipeline
from models.blstm.blstm import BLSTM

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Load the letter and diacritic lookup tables stored under <project_root>/utils.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with (project_root / "utils/letter2idx.pickle").open("rb") as file:
    letter2idx = pickle.load(file)

with (project_root / "utils/diacritic2id.pickle").open("rb") as file:
    diacritic2id = pickle.load(file)

# Reverse lookups (index -> symbol), built by swapping keys and values.
idx2letter = dict(zip(letter2idx.values(), letter2idx.keys()))
idx2diacritic = dict(zip(diacritic2id.values(), diacritic2id.keys()))

# One mapping per output line, in the same order as they were built.
print(letter2idx, idx2letter, diacritic2id, idx2diacritic, sep="\n")

{'ظ': 0, 'ي': 1, 'غ': 2, 'ن': 3, 'ق': 4, 'ذ': 5, 'د': 6, 'خ': 7, 'ر': 8, 'ط': 9, 'ى': 10, 'م': 11, 'ل': 12, '<PAD>': 13, 'ت': 14, 'ج': 15, 'آ': 16, 'ا': 17, 'س': 18, 'ئ': 19, 'ع': 20, 'ف': 21, 'ص': 22, 'ه': 23, 'ز': 24, 'ك': 25, 'ش': 26, 'أ': 27, 'و': 28, 'ب': 29, 'ؤ': 30, 'ض': 31, 'ة': 32, 'ث': 33, 'ء': 34, 'ح': 35, 'إ': 36, ' ': 37}
{0: 'ظ', 1: 'ي', 2: 'غ', 3: 'ن', 4: 'ق', 5: 'ذ', 6: 'د', 7: 'خ', 8: 'ر', 9: 'ط', 10: 'ى', 11: 'م', 12: 'ل', 13: '<PAD>', 14: 'ت', 15: 'ج', 16: 'آ', 17: 'ا', 18: 'س', 19: 'ئ', 20: 'ع', 21: 'ف', 22: 'ص', 23: 'ه', 24: 'ز', 25: 'ك', 26: 'ش', 27: 'أ', 28: 'و', 29: 'ب', 30: 'ؤ', 31: 'ض', 32: 'ة', 33: 'ث', 34: 'ء', 35: 'ح', 36: 'إ', 37: ' '}
{'َ': 0, 'ً': 1, 'ُ': 2, 'ٌ': 3, 'ِ': 4, 'ٍ': 5, 'ْ': 6, 'ّ': 7, 'َّ': 8, 'ًّ': 9, 'ُّ': 10, 'ٌّ': 11, 'ِّ': 12, 'ٍّ': 13, '': 14, '<PAD>': 15}
{0: 'َ', 1: 'ً', 2: 'ُ', 3: 'ٌ', 4: 'ِ', 5: 'ٍ', 6: 'ْ', 7: 'ّ', 8: 'َّ', 9: 'ًّ', 10: 'ُّ', 11: 'ٌّ', 12: 'ِّ', 13: 'ٍّ', 14: '', 15: '<PAD>'}


In [6]:
# Sizes of the two lookup tables (each count includes every key in the table).
vocab_size, num_classes = len(letter2idx), len(diacritic2id)
print("Vocab size:", vocab_size)
print("Num classes:", num_classes)

Vocab size: 38
Num classes: 16


In [None]:
def _run_epoch(model, loader, criterion, pad_idx, desc, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean batch loss, token accuracy)``.

    When ``optimizer`` is given the model is put in train mode and updated after
    every batch (gradient norm clipped to 1.0). Otherwise the model is put in
    eval mode and gradient tracking is disabled.

    Args:
        model: Callable as ``model(batch_X, lengths)`` returning ``(B, T, C)`` logits.
        loader: Iterable of ``(batch_X, batch_y, _, lengths)`` batches; the third
            element is ignored here.
        criterion: Loss taking ``(B*T, C)`` logits and ``(B*T,)`` targets.
        pad_idx: Target id excluded from the accuracy computation.
        desc: Progress-bar label.
        optimizer: Optimizer to step, or ``None`` for evaluation only.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    correct_sum = 0
    token_sum = 0
    batch_count = 0

    with torch.set_grad_enabled(is_training):
        with tqdm(loader, desc=desc) as pbar:
            for batch_X, batch_y, _, lengths in pbar:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                if is_training:
                    optimizer.zero_grad()

                outputs = model(batch_X, lengths)

                # Flatten logits and targets so the loss sees one row per position.
                B, T, C = outputs.shape
                loss = criterion(outputs.reshape(B * T, C), batch_y.reshape(B * T))

                if is_training:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                    optimizer.step()

                # Accuracy is measured on non-padding positions only.
                preds = outputs.argmax(dim=-1)
                mask = (batch_y != pad_idx)
                correct = (preds[mask] == batch_y[mask]).sum().item()
                total = mask.sum().item()

                batch_loss = loss.item()
                loss_sum += batch_loss
                correct_sum += correct
                token_sum += total
                batch_count += 1

                acc = correct / total if total > 0 else 0.0
                pbar.set_postfix({
                    'Loss': f'{batch_loss:.4f}',
                    'Acc': f'{acc:.4f}'
                })

    if batch_count == 0:
        raise ValueError(f"{desc}: data loader produced no batches")

    mean_loss = loss_sum / batch_count
    accuracy = correct_sum / token_sum if token_sum > 0 else 0.0
    return mean_loss, accuracy


def train_model(model, train_loader, val_loader, epochs=10, learning_rate=0.0001,
                best_model_path='best_lstm_model.pth'):
    """Train ``model`` and checkpoint the weights with the lowest validation loss.

    Uses AdamW with cross-entropy that ignores ``diacritic2id['<PAD>']`` targets,
    and a ``ReduceLROnPlateau`` scheduler driven by the epoch's validation loss.
    Reads the notebook-level ``device`` and ``diacritic2id`` globals.

    Args:
        model: Network called as ``model(batch_X, lengths)``.
        train_loader: Iterable of ``(batch_X, batch_y, _, lengths)`` training batches.
        val_loader: Iterable of validation batches with the same layout.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Initial AdamW learning rate.
        best_model_path: Where the best checkpoint is written.

    Returns:
        Dict with per-epoch lists under ``'train_loss'``, ``'train_accuracy'``,
        ``'val_loss'`` and ``'val_accuracy'``.

    Raises:
        ValueError: If either loader yields no batches.
    """
    pad_idx = diacritic2id['<PAD>']
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []

    best_val_loss = float('inf')

    for epoch in range(epochs):
        avg_train_loss, avg_train_acc = _run_epoch(
            model, train_loader, criterion, pad_idx,
            f'Epoch {epoch+1}/{epochs} [Train]', optimizer=optimizer,
        )
        avg_val_loss, avg_val_acc = _run_epoch(
            model, val_loader, criterion, pad_idx,
            f'Epoch {epoch+1}/{epochs} [Val]',
        )

        # ReduceLROnPlateau must be given the monitored metric and is meant to be
        # stepped once per epoch, after validation -- not once per batch.
        scheduler.step(avg_val_loss)

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': best_val_loss,
                'val_accuracy': avg_val_acc,
                'train_loss': avg_train_loss,
                'train_accuracy': avg_train_acc
            }, best_model_path)
            print(f"  ↳ Best model saved! (val_loss: {best_val_loss:.4f})")

        print(f'Epoch {epoch+1}/{epochs}:')
        print(f'  Train Loss: {avg_train_loss:.4f}, Train Acc: {avg_train_acc:.4f}')
        print(f'  Val Loss: {avg_val_loss:.4f}, Val Acc: {avg_val_acc:.4f}')
        print(f'  LR: {optimizer.param_groups[0]["lr"]:.6f}')

        train_losses.append(avg_train_loss)
        train_accuracies.append(avg_train_acc)
        val_losses.append(avg_val_loss)
        val_accuracies.append(avg_val_acc)

    return {
        'train_loss': train_losses,
        'train_accuracy': train_accuracies,
        'val_loss': val_losses,
        'val_accuracy': val_accuracies
    }

In [8]:
def pad_collate_fn(batch):
    """Collate variable-length samples into right-padded, batch-first tensors.

    Each sample in ``batch`` is an ``(x, y, mask)`` triple of 1-D sequences.
    Inputs are padded with ``letter2idx['<PAD>']``, targets with
    ``diacritic2id['<PAD>']`` and masks with 0.

    Returns:
        ``(x_padded, y_padded, mask_padded, lengths)`` where ``lengths`` is a
        ``torch.long`` tensor holding the unpadded length of every input.
    """
    inputs, targets, masks = zip(*batch)
    pad = torch.nn.utils.rnn.pad_sequence

    padded_inputs = pad(inputs, batch_first=True, padding_value=letter2idx['<PAD>'])
    padded_targets = pad(targets, batch_first=True, padding_value=diacritic2id['<PAD>'])
    padded_masks = pad(masks, batch_first=True, padding_value=0)

    # Original (pre-padding) lengths of the input sequences.
    lengths = torch.tensor(list(map(len, inputs)), dtype=torch.long)
    return padded_inputs, padded_targets, padded_masks, lengths

In [9]:
# Keyword arguments shared by the training and validation pipelines.
_pipeline_kwargs = dict(
    letter2idx=letter2idx,
    diacritic2idx=diacritic2id,
    collate_fn=pad_collate_fn,
    batch_size=32,
)

# Training split: `train` is left at create_data_pipeline's default.
train_dataset, train_loader = create_data_pipeline(
    corpus_path=str(project_root / 'data/train.txt'),
    **_pipeline_kwargs,
)

# Validation split: explicitly built with train=False.
val_dataset, val_loader = create_data_pipeline(
    corpus_path=str(project_root / 'data/val.txt'),
    train=False,
    **_pipeline_kwargs,
)

In [10]:
# Build the network, then move its parameters to the selected device.
model = BLSTM(vocab_size=vocab_size, num_classes=num_classes)
model = model.to(device)

print("Model architecture:")
print(model)

# Count every weight, and separately only those that receive gradients.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(
    p.numel() for p in filter(lambda p: p.requires_grad, model.parameters())
)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

Model architecture:
BLSTM(
  (embedding): Embedding(38, 128, padding_idx=13)
  (bilstm): LSTM(128, 256, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=16, bias=True)
)

Total parameters: 803,600
Trainable parameters: 803,600


In [None]:
print("Model Summary:")
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")

# Train for 10 epochs; `history` holds one value per epoch for each metric.
history = train_model(model, train_loader, val_loader, epochs=10)

# Plot results: accuracy on the left panel, loss on the right.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

acc_ax.plot(history['train_accuracy'], label='Training Accuracy')
acc_ax.plot(history['val_accuracy'], label='Validation Accuracy')
acc_ax.set_title('Model Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

loss_ax.plot(history['train_loss'], label='Training Loss')
loss_ax.plot(history['val_loss'], label='Validation Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

fig.tight_layout()
plt.show()

# Save model: the weights as they stand after the final epoch, plus the
# constructor sizes and the training curves.
final_checkpoint = {
    'model_state_dict': model.state_dict(),
    'vocab_size': vocab_size,
    'num_classes': num_classes,
    'history': history
}
torch.save(final_checkpoint, 'blstm_model.pth')

print("Model saved successfully as 'blstm_model.pth'!")

\nModel Summary:
Total parameters: 803,600
\nStarting training with dynamic sequence lengths...


Epoch 1/10 [Train]: 100%|██████████| 5784/5784 [10:47<00:00,  8.94it/s, Loss=0.1161, Acc=0.9652]
Epoch 1/10 [Val]: 100%|██████████| 282/282 [00:14<00:00, 19.87it/s, Loss=0.0943, Acc=0.9717]


  ↳ Best model saved! (val_loss: 0.1055)
Epoch 1/10:
  Train Loss: 0.0933, Train Acc: 0.9706
  Val Loss: 0.1055, Val Acc: 0.9673
  LR: 0.000100


Epoch 2/10 [Train]: 100%|██████████| 5784/5784 [10:46<00:00,  8.94it/s, Loss=0.0681, Acc=0.9766]
Epoch 2/10 [Val]: 100%|██████████| 282/282 [00:14<00:00, 19.96it/s, Loss=0.0964, Acc=0.9717]


  ↳ Best model saved! (val_loss: 0.1045)
Epoch 2/10:
  Train Loss: 0.0887, Train Acc: 0.9721
  Val Loss: 0.1045, Val Acc: 0.9677
  LR: 0.000100


Epoch 3/10 [Train]: 100%|██████████| 5784/5784 [10:46<00:00,  8.95it/s, Loss=0.0756, Acc=0.9652]
Epoch 3/10 [Val]: 100%|██████████| 282/282 [00:14<00:00, 19.92it/s, Loss=0.0888, Acc=0.9717]


  ↳ Best model saved! (val_loss: 0.1042)
Epoch 3/10:
  Train Loss: 0.0868, Train Acc: 0.9727
  Val Loss: 0.1042, Val Acc: 0.9679
  LR: 0.000100


Epoch 4/10 [Train]: 100%|██████████| 5784/5784 [10:52<00:00,  8.86it/s, Loss=0.1235, Acc=0.9679]
Epoch 4/10 [Val]: 100%|██████████| 282/282 [00:14<00:00, 19.51it/s, Loss=0.0875, Acc=0.9717]


  ↳ Best model saved! (val_loss: 0.1036)
Epoch 4/10:
  Train Loss: 0.0853, Train Acc: 0.9732
  Val Loss: 0.1036, Val Acc: 0.9682
  LR: 0.000100


Epoch 5/10 [Train]: 100%|██████████| 5784/5784 [10:54<00:00,  8.84it/s, Loss=0.0543, Acc=0.9827]
Epoch 5/10 [Val]: 100%|██████████| 282/282 [00:14<00:00, 19.52it/s, Loss=0.0866, Acc=0.9717]


  ↳ Best model saved! (val_loss: 0.1034)
Epoch 5/10:
  Train Loss: 0.0840, Train Acc: 0.9736
  Val Loss: 0.1034, Val Acc: 0.9682
  LR: 0.000100


Epoch 6/10 [Train]: 100%|██████████| 5784/5784 [10:54<00:00,  8.84it/s, Loss=0.1283, Acc=0.9594]
Epoch 6/10 [Val]: 100%|██████████| 282/282 [00:14<00:00, 19.53it/s, Loss=0.0812, Acc=0.9753]


  ↳ Best model saved! (val_loss: 0.1031)
Epoch 6/10:
  Train Loss: 0.0827, Train Acc: 0.9740
  Val Loss: 0.1031, Val Acc: 0.9683
  LR: 0.000100


Epoch 7/10 [Train]:  56%|█████▌    | 3213/5784 [06:04<04:47,  8.93it/s, Loss=0.0828, Acc=0.9718]