# The Seq2Seq Georgian Verb Conjugator Model

In this notebook we will: 

- Set up the `Encoder`, `Decoder` and `Seq2Seq` classes
- Process the prepared data
- Train the model and evaluate it on the validation and test sets

In [42]:
# import necessary libraries
import ast
import random
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

# import stored data for input and output dimensions in the model
%store -r feature_vocab_dict
%store -r target_vocab_dict

In [43]:
# Function to load and properly convert the sequences
def load_sequence_json(path):
    df = pd.read_json(path)
    return [ast.literal_eval(str(row[0])) for row in df.values]

# Load the sequences from the JSON files
X_train = load_sequence_json(r"C:\Users\Home\Desktop\Bachelor Project\data\train-test-eval\X_train.json")
y_train = load_sequence_json(r"C:\Users\Home\Desktop\Bachelor Project\data\train-test-eval\y_train.json")
X_val = load_sequence_json(r"C:\Users\Home\Desktop\Bachelor Project\data\train-test-eval\X_val.json")
y_val = load_sequence_json(r"C:\Users\Home\Desktop\Bachelor Project\data\train-test-eval\y_val.json")
X_test = load_sequence_json(r"C:\Users\Home\Desktop\Bachelor Project\data\train-test-eval\X_test.json")
y_test = load_sequence_json(r"C:\Users\Home\Desktop\Bachelor Project\data\train-test-eval\y_test.json")

# Check the loaded sequences
print("X_train:", X_train[:2])  # Print first two examples for verification
print("y_train:", y_train[:2])  # Print first two examples for verification

X_train: [['<bos>', 'ა', 'შ', 'ე', 'ნ', 'ე', 'ბ', 'ს', 'ARGNO2S', 'ARGAC1S', 'IND', 'PRS', '<eos>'], ['<bos>', 'წ', 'ო', 'ნ', 'ი', 'ს', 'ARGNO2P', 'ARGAC3P', 'IND', 'PST', 'PRF', '<eos>']]
y_train: [['<bos>', 'მ', 'ა', 'შ', 'ე', 'ნ', 'ე', 'ბ', '<eos>'], ['<bos>', 'ა', 'გ', 'ე', 'წ', 'ო', 'ნ', 'ა', 'თ', '<eos>']]


In [44]:
# Encode the datasets into their index representations
X_train_idx = [[feature_vocab_dict[token] for token in sequence] for sequence in X_train]
X_val_idx = [[feature_vocab_dict[token] for token in sequence] for sequence in X_val]
X_test_idx = [[feature_vocab_dict[token] for token in sequence] for sequence in X_test]

print("Encoded X_train_idx:", X_train_idx[:2])  # Print first two examples for verification
print("Encoded X_val_idx:", X_val_idx[:2])    # Print first two examples for verification
print("Encoded X_test_idx:", X_test_idx[:2])  # Print first two examples for verification

# Encode the target datasets into their index representations
y_train_idx = [[target_vocab_dict[token] for token in sequence] for sequence in y_train]
y_val_idx = [[target_vocab_dict[token] for token in sequence] for sequence in y_val]
y_test_idx = [[target_vocab_dict[token] for token in sequence] for sequence in y_test]

print("Encoded y_train_idx:", y_train_idx[:2])  # Print first two examples for verification
print("Encoded y_val_idx:", y_val_idx[:2])    # Print first two examples for verification
print("Encoded y_test_idx:", y_test_idx[:2])  # Print first two examples for verification

Encoded X_train_idx: [[1, 3, 27, 7, 15, 7, 4, 20, 45, 37, 51, 56, 2], [1, 31, 16, 15, 11, 20, 44, 40, 51, 57, 55, 2]]
Encoded X_val_idx: [[1, 7, 33, 14, 3, 19, 7, 4, 3, 43, 39, 51, 49, 2], [1, 3, 33, 19, 28, 16, 4, 20, 45, 41, 51, 55, 2]]
Encoded X_test_idx: [[1, 22, 20, 14, 7, 15, 20, 47, 37, 48, 2], [1, 3, 10, 7, 10, 19, 7, 4, 20, 47, 39, 51, 57, 55, 2]]
Encoded y_train_idx: [[1, 14, 3, 27, 7, 15, 7, 4, 2], [1, 3, 5, 7, 31, 16, 15, 3, 10, 2]]
Encoded y_val_idx: [[1, 6, 3, 5, 7, 33, 14, 3, 19, 7, 4, 11, 2], [1, 6, 3, 5, 11, 33, 19, 28, 8, 11, 3, 2]]
Encoded y_test_idx: [[1, 14, 16, 14, 11, 20, 14, 7, 15, 6, 3, 2], [1, 5, 3, 7, 10, 7, 10, 19, 7, 4, 11, 15, 7, 2]]


In [45]:
# implement the dataset class
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx]), torch.tensor(self.y[idx])

In [46]:
# implement encoder
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        embedded = self.embedding(x)
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

In [47]:
# implement decoder
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, hidden, cell):
        embedded = self.embedding(x).unsqueeze(0)
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

In [48]:
# implement seq2seq model
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        trg_len = trg.shape[1]
        batch_size = trg.shape[0]
        output_dim = self.decoder.fc_out.out_features

        outputs = torch.zeros(batch_size, trg_len, output_dim).to(self.device)

        hidden, cell = self.encoder(src)

        x = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[:, t] = output
            top1 = output.argmax(1) 
            x = trg[:, t] if random.random() < teacher_forcing_ratio else top1

        return outputs

In [None]:
### Training the model
# model hyperparameters
num_epochs = 40
batch_size = 64
# optimizer hyperparameters
learning_rate = 0.001

# training hyperparameters
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_dim_encoder = len(feature_vocab_dict)
output_dim_decoder = len(target_vocab_dict)
encoder_emb_dim = 100 # refer to guriel et al. 
decoder_emb_dim = 100 # refer to guriel et al.
hidden_dim = 128
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# tensorboard logging
writer = SummaryWriter('runs/seq2seq_experiment_1')
steps = 0

# setup data loaders with collate function
def collate_fn(batch):
    X_batch, y_batch = zip(*batch)
    X_batch = nn.utils.rnn.pad_sequence(X_batch, batch_first=True, padding_value=0)
    y_batch = nn.utils.rnn.pad_sequence(y_batch, batch_first=True, padding_value=0)
    return X_batch, y_batch
# create datasets and dataloaders
train_dataset = CustomDataset(X_train_idx, y_train_idx)
val_dataset = CustomDataset(X_val_idx, y_val_idx)
test_dataset = CustomDataset(X_test_idx, y_test_idx)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [61]:
# initialize encoder, decoder, and seq2seq model
encoder = Encoder(input_dim_encoder, encoder_emb_dim, hidden_dim).to(device)
decoder = Decoder(output_dim_decoder, decoder_emb_dim, hidden_dim).to(device)
model = Seq2Seq(encoder, decoder, device).to(device)

# define optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index=feature_vocab_dict['<pad>']) # ignore padding index in loss calculation

# training loop
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    for batch_idx, (src, trg) in enumerate(train_loader):
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()
        output = model(src, trg)

        # reshape output and target for loss calculation
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()

        # gradient clipping (optional)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()
        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # log to tensorboard
    writer.add_scalar('Training Loss', avg_loss, global_step=epoch)

Epoch [1/40], Loss: 2.2850
Epoch [2/40], Loss: 1.3750
Epoch [3/40], Loss: 0.7782
Epoch [4/40], Loss: 0.4775
Epoch [5/40], Loss: 0.2996
Epoch [6/40], Loss: 0.1964
Epoch [7/40], Loss: 0.1273
Epoch [8/40], Loss: 0.0895
Epoch [9/40], Loss: 0.0647
Epoch [10/40], Loss: 0.0468
Epoch [11/40], Loss: 0.0366
Epoch [12/40], Loss: 0.0302
Epoch [13/40], Loss: 0.0219
Epoch [14/40], Loss: 0.0220
Epoch [15/40], Loss: 0.0136
Epoch [16/40], Loss: 0.0159
Epoch [17/40], Loss: 0.0246
Epoch [18/40], Loss: 0.0104
Epoch [19/40], Loss: 0.0097
Epoch [20/40], Loss: 0.0064
Epoch [21/40], Loss: 0.0167
Epoch [22/40], Loss: 0.0185
Epoch [23/40], Loss: 0.0058
Epoch [24/40], Loss: 0.0038
Epoch [25/40], Loss: 0.0017
Epoch [26/40], Loss: 0.0016
Epoch [27/40], Loss: 0.0012
Epoch [28/40], Loss: 0.0530
Epoch [29/40], Loss: 0.0130
Epoch [30/40], Loss: 0.0030
Epoch [31/40], Loss: 0.0015
Epoch [32/40], Loss: 0.0011
Epoch [33/40], Loss: 0.0009
Epoch [34/40], Loss: 0.0008
Epoch [35/40], Loss: 0.0008
Epoch [36/40], Loss: 0.0008
E

In [62]:
PAD_IDX = feature_vocab_dict['<pad>']  # dynamically set padding index
target_idx_to_token = {idx: tok for tok, idx in target_vocab_dict.items()}

model.eval()
val_loss = 0
val_predictions = []
val_targets = []

with torch.no_grad():
    for src, trg in val_loader:
        src, trg = src.to(device), trg.to(device)

        output = model(src, trg, teacher_forcing_ratio=0)  # no teacher forcing

        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        val_loss += loss.item() * src.size(0)

        pred = output.argmax(1)
        mask = trg != PAD_IDX

        # Filter out pad tokens for accuracy
        filtered_pred = pred[mask].cpu().numpy()
        filtered_trg = trg[mask].cpu().numpy()

        val_predictions.extend(filtered_pred)
        val_targets.extend(filtered_trg)

        # Print 5 example sequences
        batch_size = src.size(0)
        seq_len = trg.view(batch_size, -1).size(1)
        pred_seq = pred.view(batch_size, -1)
        trg_seq = trg.view(batch_size, -1)

        for i in range(min(5, batch_size)):
            pred_tokens = [target_idx_to_token[idx.item()] for idx in pred_seq[i] if idx.item() != PAD_IDX]
            target_tokens = [target_idx_to_token[idx.item()] for idx in trg_seq[i] if idx.item() != PAD_IDX]

            print(f"Target   : {' '.join(target_tokens)}")
            print(f"Predicted: {' '.join(pred_tokens)}")
            print('-' * 40)

    avg_val_loss = val_loss / len(val_dataset)
    val_accuracy = accuracy_score(val_targets, val_predictions)

    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    writer.add_scalar('Validation Loss', avg_val_loss, global_step=epoch)
    writer.add_scalar('Validation Accuracy', val_accuracy, global_step=epoch)

Target   : დ ა გ ე ხ მ ა რ ე ბ ი <eos>
Predicted: დ ა გ ე ხ მ ა რ ე ბ ი <eos> <eos> <eos> <eos> <eos>
----------------------------------------
Target   : დ ა გ ი ხ რ ჩ ვ ი ა <eos>
Predicted: დ ა გ ი ხ რ ჩ ვ ი ა <eos> <eos> <eos> <eos> <eos> <eos>
----------------------------------------
Target   : გ ა ე ც ა ნ ი თ <eos>
Predicted: გ ა ე ც ა ნ ი თ <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos>
----------------------------------------
Target   : შ ე ჭ მ უ ლ ა ნ <eos>
Predicted: შ ე ჭ მ უ ლ ი ა ნ <eos> <eos> ნ <eos> <eos> <eos> <eos>
----------------------------------------
Target   : ვ ი ც ი ნ ე ბ დ ე თ <eos>
Predicted: ვ ი ც ი ნ ე ბ დ ე თ <eos> <eos> <eos> <eos> <eos> <eos>
----------------------------------------
Target   : დ ა ა ნ თ ე <eos>
Predicted: დ ა ა ნ თ ე <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos> <eos>
----------------------------------------
Target   : დ ა გ ი ნ გ რ ე ვ ი ა <eos>
Predicted: დ ა გ ი ნ გ რ ე ვ ი ა <eos> <eos> <eos> <eos> <eos>
----------------------

In [63]:
# meaure the model performance on the test set
model.eval()
test_loss = 0
test_predictions = []
test_targets = []
with torch.no_grad():
    for src, trg in test_loader:
        src, trg = src.to(device), trg.to(device)

        output = model(src, trg, teacher_forcing_ratio=0)  # no teacher forcing

        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        test_loss += loss.item() * src.size(0)

        pred = output.argmax(1)
        mask = trg != PAD_IDX

        # Filter out pad tokens for accuracy
        filtered_pred = pred[mask].cpu().numpy()
        filtered_trg = trg[mask].cpu().numpy()

        test_predictions.extend(filtered_pred)
        test_targets.extend(filtered_trg)

    avg_test_loss = test_loss / len(test_dataset)
    test_accuracy = accuracy_score(test_targets, test_predictions)

    print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

Test Loss: 0.0255, Test Accuracy: 0.9967
