In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pickle
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
# Pick the compute device once: CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
class DiacritizationDataset(Dataset):
    """Map-style dataset pairing input index sequences with label sequences.

    Both ``X`` and ``y`` are converted to ``torch.long`` tensors at construction
    time (``torch.tensor`` copies the data), and items are returned as
    ``(X[idx], y[idx])`` tuples.

    Args:
        X: array-like of input indices; the first dimension indexes samples.
        y: array-like of label indices with the same number of samples as ``X``.

    Raises:
        ValueError: if ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

        # Fail fast: __len__ follows X, so a shorter y would otherwise raise an
        # IndexError in the middle of an epoch and a longer y would be silently
        # truncated.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(inputs, labels)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

In [4]:
class BLSTMModel(nn.Module):
    """Stacked bidirectional LSTM that emits per-timestep class logits.

    Pipeline: embedding -> ``num_layers`` single-layer bidirectional LSTMs, each
    followed by dropout -> a two-layer MLP applied at every timestep.

    Args:
        vocab_size: number of rows in the embedding table.
        num_classes: size of the output logit vector at each timestep.
        embedding_dim: embedding width.
        hidden_size: hidden width of each LSTM direction (outputs are
            ``2 * hidden_size`` wide because the LSTMs are bidirectional).
        num_layers: number of stacked bidirectional LSTM modules.
        padding_idx: input index whose embedding is kept at zero.
            NOTE(review): the default 13 presumably matches the pad entry of the
            letter vocabulary; confirm against letter2idx.
        dropout: probability used by the dropout after every LSTM and inside the
            classification head.
    """

    def __init__(self, vocab_size, num_classes, embedding_dim=256, hidden_size=128, num_layers=2,
                 padding_idx=13, dropout=0.3):
        super(BLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)

        self.lstm_layers = nn.ModuleList()
        for i in range(num_layers):
            # The first LSTM consumes embeddings; later ones consume the
            # concatenated forward/backward states of the previous LSTM.
            input_size = embedding_dim if i == 0 else hidden_size * 2
            # No `dropout=` argument here: nn.LSTM only applies it between its
            # own internal layers, so on a single-layer LSTM it is a no-op that
            # merely triggers a UserWarning. Inter-layer dropout is handled by
            # self.dropout in forward().
            lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
            self.lstm_layers.append(lstm)

        self.dropout = nn.Dropout(dropout)
        self.time_distributed = nn.Sequential(
            nn.Linear(hidden_size * 2, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        """Compute logits for a batch of index sequences.

        Args:
            x: integer tensor of shape ``(batch, seq_len)`` (the LSTMs are
                built with ``batch_first=True``).

        Returns:
            Tensor of shape ``(batch, seq_len, num_classes)`` with unnormalised
            class scores.
        """
        x = self.embedding(x)

        for lstm in self.lstm_layers:
            x, _ = lstm(x)  # discard the (h_n, c_n) state tuple
            x = self.dropout(x)

        x = self.time_distributed(x)
        return x

In [5]:
def calculate_accuracy(outputs, targets, ignore_index=15):
    """Return the fraction of non-ignored positions that are predicted correctly.

    Args:
        outputs: tensor of class scores whose last dimension indexes classes;
            the prediction is the argmax over that dimension.
        targets: integer tensor of labels with the same shape as
            ``outputs.argmax(dim=-1)``.
        ignore_index: label value excluded from both numerator and denominator.

    Returns:
        Accuracy as a Python float. When every target equals ``ignore_index``
        there is nothing to score and 0.0 is returned instead of the NaN that
        a 0/0 division would produce.
    """
    predictions = outputs.argmax(dim=-1)

    mask = targets != ignore_index
    num_valid = mask.sum()

    # Guard against 0/0: an all-ignored batch would otherwise yield NaN and
    # contaminate any running average built from this value.
    if num_valid.item() == 0:
        return 0.0

    num_correct = ((predictions == targets) & mask).sum()
    accuracy = num_correct.float() / num_valid.float()

    return accuracy.item()

In [6]:
def _run_epoch(model, loader, loss_fn, desc, optimizer=None):
    """Run one pass over ``loader``: train if ``optimizer`` is given, else evaluate.

    Returns a ``(mean batch loss, mean batch accuracy)`` tuple, where both means
    are unweighted averages over batches.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is equivalent to eval()

    loss_sum = 0
    acc_sum = 0
    num_batches = 0

    # Gradients are only tracked while training.
    with torch.set_grad_enabled(training):
        with tqdm(loader, desc=desc) as pbar:
            for batch_X, batch_y in pbar:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                if training:
                    optimizer.zero_grad()

                outputs = model(batch_X)

                # CrossEntropyLoss expects (N, C) scores and (N,) targets, so
                # flatten every leading dimension into one.
                outputs_reshaped = outputs.reshape(-1, outputs.shape[-1])
                targets_reshaped = batch_y.reshape(-1)

                loss = loss_fn(outputs_reshaped, targets_reshaped)
                acc = calculate_accuracy(outputs, batch_y)

                if training:
                    loss.backward()
                    # Clip the global gradient norm before the update.
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                    optimizer.step()

                loss_sum += loss.item()
                acc_sum += acc
                num_batches += 1

                pbar.set_postfix({
                    'Loss': f'{loss.item():.4f}',
                    'Acc': f'{acc:.4f}'
                })

    return loss_sum / num_batches, acc_sum / num_batches


def train_model(model, train_loader, val_loader, epochs=10, learning_rate=0.001):
    """Train ``model``, checkpointing the epoch with the best validation accuracy.

    Uses Adam (weight decay 1e-5), a ReduceLROnPlateau scheduler driven by the
    validation loss, and cross-entropy that ignores label 15. Whenever the mean
    validation accuracy improves, a checkpoint is written to ``best_model.pth``
    in the working directory. After the last epoch the best checkpoint written
    during this call is loaded back into ``model``.

    Relies on the module-level ``device``, ``calculate_accuracy`` and ``tqdm``.

    Args:
        model: the network to optimise; modified in place.
        train_loader: iterable of ``(inputs, targets)`` training batches.
        val_loader: iterable of ``(inputs, targets)`` validation batches.
        epochs: number of passes over ``train_loader``.
        learning_rate: initial Adam learning rate.

    Returns:
        dict with per-epoch lists under ``'train_loss'``, ``'train_accuracy'``,
        ``'val_loss'``, ``'val_accuracy'`` and the scalar ``'best_val_accuracy'``.
    """
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)
    loss_fn = nn.CrossEntropyLoss(ignore_index=15)  # 15 is padding index

    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []

    best_val_acc = 0.0
    best_model_path = "best_model.pth"
    # Tracks whether THIS call wrote best_model_path, so that a missing file or
    # a stale checkpoint left behind by an earlier run is never loaded below.
    checkpoint_saved = False

    for epoch in range(epochs):
        avg_train_loss, avg_train_acc = _run_epoch(
            model, train_loader, loss_fn,
            desc=f'Epoch {epoch+1}/{epochs} [Train]',
            optimizer=optimizer,
        )
        avg_val_loss, avg_val_acc = _run_epoch(
            model, val_loader, loss_fn,
            desc=f'Epoch {epoch+1}/{epochs} [Val]',
        )

        train_losses.append(avg_train_loss)
        train_accuracies.append(avg_train_acc)
        val_losses.append(avg_val_loss)
        val_accuracies.append(avg_val_acc)

        if avg_val_acc > best_val_acc:
            best_val_acc = avg_val_acc
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_accuracy': avg_val_acc,
                'val_loss': avg_val_loss,
                'train_accuracy': avg_train_acc,
                'train_loss': avg_train_loss,
            }, best_model_path)
            checkpoint_saved = True
            print(f"New best model saved! Validation Accuracy: {avg_val_acc:.4f}")

        # The scheduler halves the learning rate when validation loss plateaus.
        scheduler.step(avg_val_loss)

        print(f'Epoch {epoch+1}/{epochs}:')
        print(f'  Train Loss: {avg_train_loss:.4f}, Train Acc: {avg_train_acc:.4f}')
        print(f'  Val Loss: {avg_val_loss:.4f}, Val Acc: {avg_val_acc:.4f}')
        print(f'  Best Val Acc: {best_val_acc:.4f}')
        print(f'  Learning Rate: {optimizer.param_groups[0]["lr"]:.6f}')

    if checkpoint_saved:
        # Restore the weights of the best epoch seen during this run.
        checkpoint = torch.load(best_model_path)
        model.load_state_dict(checkpoint['model_state_dict'])
        print(f"\nTraining completed! Best validation accuracy: {checkpoint['val_accuracy']:.4f}")
    else:
        # Nothing was saved (e.g. epochs == 0 or accuracy never rose above 0):
        # keep the model's current weights rather than loading a stale file.
        print("\nTraining completed without saving a checkpoint; keeping current model weights.")

    return {
        'train_loss': train_losses,
        'train_accuracy': train_accuracies,
        'val_loss': val_losses,
        'val_accuracy': val_accuracies,
        'best_val_accuracy': best_val_acc
    }

In [7]:
# Load the saved NumPy arrays for the training split, then the validation split.
X_train, y_train = np.load('../data/X_train.npy'), np.load('../data/y_train.npy')
X_val, y_val = np.load('../data/X_val.npy'), np.load('../data/y_val.npy')

In [8]:
# Sanity check: inputs and labels of each split should report matching shapes.
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")

X_train shape: (185074, 1236)
y_train shape: (185074, 1236)
X_val shape: (9000, 1236)
y_val shape: (9000, 1236)


In [9]:
# Derive the model's input/output sizes from the pickled lookup tables.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# `with` guarantees each file handle is closed once its table has been read.
with open('../utils/letter2idx.pickle', 'rb') as letter_file:
    vocab_size = len(pickle.load(letter_file))
with open('../utils/diacritic2id.pickle', 'rb') as diacritic_file:
    num_classes = len(pickle.load(diacritic_file))
print("Vocab size:", vocab_size)
print("Num classes:", num_classes)

Vocab size: 38
Num classes: 16


In [10]:
# Wrap each (inputs, labels) split in a DiacritizationDataset, training first.
train_dataset, val_dataset = (
    DiacritizationDataset(inputs, labels)
    for inputs, labels in ((X_train, y_train), (X_val, y_val))
)

In [11]:
# Batch size shared by both loaders.
batch_size = 16

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)

# Validation batches keep a fixed order.
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

In [12]:
# Build the network with its default hyperparameters, then move it to the
# selected device.
model = BLSTMModel(vocab_size, num_classes)
model = model.to(device)



In [None]:
# Report the model size as a total parameter count
print("\nModel Summary:")
print(f"Total parameters: {sum(weight.numel() for weight in model.parameters()):,}")

# Run the training loop and keep the per-epoch metrics it returns
print("\nStarting training...")
history = train_model(model, train_loader, val_loader, epochs=10)

# Draw accuracy (left panel) and loss (right panel) curves side by side
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

acc_ax.plot(history['train_accuracy'], label='Training Accuracy')
acc_ax.plot(history['val_accuracy'], label='Validation Accuracy')
acc_ax.set_title('Model Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

loss_ax.plot(history['train_loss'], label='Training Loss')
loss_ax.plot(history['val_loss'], label='Validation Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

fig.tight_layout()
plt.show()

# Persist the weights together with the sizes needed to rebuild the model
# and the recorded training history
torch.save(
    {
        'model_state_dict': model.state_dict(),
        'vocab_size': vocab_size,
        'num_classes': num_classes,
        'history': history,
    },
    'blstm_model.pth',
)

print("Model saved successfully!")


Model Summary:
Total parameters: 870,160

Starting training...
