In [15]:
import csv

def load_tsv_data(filepath):
    """
    Reads a TSV file and returns a list of (root_form, inflectional_info, inflected_form) tuples.
    """
    data = []
    with open(filepath, "r", encoding="utf-8") as file:
        reader = csv.reader(file, delimiter="\t")  # TSV format
        for row in reader:
            if len(row) == 3:  # Ensure correct format
                root, inflected, info = row
                data.append((root, info, inflected))  # Maintain (root, info, inflected) order
    return data

def load_test_data(filepath):
    """
    Reads a TSV file and returns a list of (root_form, inflected_form) tuples.
    """
    data = []
    with open(filepath, "r", encoding="utf-8") as file:
        reader = csv.reader(file, delimiter="\t")  # TSV format
        for row in reader:
            if len(row) == 2:  # Ensure correct format
                root, info = row
                data.append((root, info))  # Maintain (root, inflected) order
    return data

In [16]:
from collections import defaultdict
import torch

class Vocabulary:
    def __init__(self):
        self.char2idx = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2}  # Special tokens
        self.idx2char = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>"}
        self.feature2idx = {}  # Inflectional features
        self.idx2feature = {}

    def add_word(self, word):
        for char in word:
            if char not in self.char2idx:
                idx = len(self.char2idx)
                self.char2idx[char] = idx
                self.idx2char[idx] = char

    def add_feature(self, feature):
        features = feature.split(";")
        for feat in features:
            if feat not in self.feature2idx:
                idx = len(self.feature2idx)
                self.feature2idx[feat] = idx
                self.idx2feature[idx] = feat
        # if feature not in self.feature2idx:  
        #     idx = len(self.feature2idx)
        #     self.feature2idx[feature] = idx
        #     self.idx2feature[idx] = feature

    def encode_word(self, word):
        return [self.char2idx[char] for char in word] + [self.char2idx["<EOS>"]]

    def encode_feature(self, feature):
        features = feature.split(";")
        return [self.feature2idx[feat] if feat in self.feature2idx else 0 for feat in features]
        # return self.feature2idx[feature] if feature in self.feature2idx else 0  # Default to 0 if unknown


In [17]:
from torch.utils.data import Dataset
import torch

class InflectionDataset(Dataset):
    def __init__(self, filepath, vocab):
        """
        filepath: Path to the TSV file
        vocab: Instance of Vocabulary class
        """
        self.data = load_tsv_data(filepath)  # list of (root_form, inflectional_info, inflected_form) tuples
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        root_form, inflectional_info, inflected_form = self.data[idx]
        
        root_encoded = self.vocab.encode_word(root_form)
        inflected_encoded = [self.vocab.char2idx["<SOS>"]] + self.vocab.encode_word(inflected_form)
        feature_encoded = self.vocab.encode_feature(inflectional_info)

        return torch.tensor(root_encoded), torch.tensor(feature_encoded), torch.tensor(inflected_encoded)
    
class TestDataset(Dataset):
    def __init__(self, filepath, vocab):
        """
        root_data: List of root sequences.
        feature_data: List of features (inflectional information) corresponding to the roots.
        """
        self.data = load_test_data(filepath)  # list of (root_form, inflectional_info, inflected_form) tuples
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        root_form, inflectional_info = self.data[idx]
        
        root_encoded = self.vocab.encode_word(root_form)
        feature_encoded = self.vocab.encode_feature(inflectional_info)

        return torch.tensor(root_encoded), torch.tensor(feature_encoded)
    


In [18]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """
    batch: List of (root_encoded, feature_encoded, inflected_encoded) tuples
    Returns: Padded tensors for root forms, features, and inflected forms
    """
    roots, features, inflecteds = zip(*batch)

    # Pad sequences to the longest in the batch
    roots_padded = pad_sequence(roots, batch_first=True, padding_value=0)  
    inflecteds_padded = pad_sequence(inflecteds, batch_first=True, padding_value=0)
    # features = torch.tensor(features)  # Already numerical, no need for padding
    features = pad_sequence(features, batch_first=True, padding_value=0)

    return roots_padded, features, inflecteds_padded

def test_collate_fn(batch):
    """
    batch: List of (root_encoded, feature_encoded) tuples
    Returns: Padded tensors for root forms and features
    """
    roots, features = zip(*batch)

    # Pad sequences to the longest in the batch
    roots_padded = pad_sequence(roots, batch_first=True, padding_value=0)  
    # features = torch.tensor(features)  # Already numerical, no need for padding
    features = pad_sequence(features, batch_first=True, padding_value=0)

    return roots_padded, features

In [19]:
from torch.utils.data import DataLoader

def prepare_data(language):
    train_data = 'dataset/' + language + '.train.tsv'
    dev_data = 'dataset/' + language + '.dev.tsv'
    test_data = 'dataset/' + language + '.test.tsv'
    data = load_tsv_data(train_data)

    vocab = Vocabulary()
    for root, feat, inflected in data:
        vocab.add_word(root)
        vocab.add_word(inflected)
        vocab.add_feature(feat)
    
    return vocab, train_data, dev_data, test_data

def prepare_dataset(language, batch=16):
    vocab, train_data, dev_data, test_data = prepare_data(language)
    train_dataset = InflectionDataset(train_data, vocab)
    dev_dataset = InflectionDataset(dev_data, vocab)
    test_dataset = TestDataset(test_data, vocab)

    train_dataloader = DataLoader(train_dataset, batch_size=batch, shuffle=True, collate_fn=collate_fn)
    dev_dataloader = DataLoader(dev_dataset, batch_size=batch, shuffle=False, collate_fn=collate_fn)
    test_dataloader = DataLoader(test_dataset, batch_size=batch, shuffle=False, collate_fn=test_collate_fn)

    return vocab, train_dataloader, dev_dataloader, test_dataloader

In [20]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, feature_dim, padding_idx = 0):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=padding_idx)
        self.feature_embedding = nn.Embedding(feature_dim, emb_dim, padding_idx=padding_idx)

        self.lstm = nn.LSTM(emb_dim * 2, hidden_dim, batch_first=True)  # Concatenate both embeddings

        # self.lstm = nn.LSTM(emb_dim + hidden_dim, hidden_dim, batch_first=True)  # LSTM input size doubled

        # self.feature_proj = nn.Linear(emb_dim, hidden_dim)

    def forward(self, root_seq, feature):
        """
        root_seq: (batch_size, root_seq_len)
        feature: (batch_size, feature_seq_len)
        """
        root_embedded = self.embedding(root_seq)
        feature_embedded = self.feature_embedding(feature)
        # print(root_seq.size(), feature.size())
        # print(root_embedded.size(), feature_embedded.size())
        combined_input = torch.cat((root_embedded, feature_embedded), dim=1)
        lstm_out, hidden = self.lstm(combined_input)  # Pass through LSTM
        return lstm_out, hidden
        # root_embedded = self.embedding(root_seq)
        # feature_embedded = self.feature_proj(self.feature_embedding(feature))  # Ensure hidden_dim match

        # # feature_embedded = self.feature_embedding(feature).unsqueeze(1) 
        # feature_expanded = feature_embedded.unsqueeze(1)  # (batch_size, 1, hidden_dim)
        # feature_expanded = feature_expanded.expand(-1, root_embedded.size(1), -1)  # (batch_size, seq_len, emb_dim)
        # lstm_input = torch.cat((root_embedded, feature_expanded), dim=-1)

        # # lstm_out, hidden = self.lstm(lstm_input)  # Pass through LSTM
        # h_0 = feature_embedded.unsqueeze(0)  # (num_layers, batch_size, hidden_dim)
        # c_0 = torch.zeros_like(h_0)  # Initialize cell state to zeros

        # lstm_out, hidden = self.lstm(lstm_input, (h_0, c_0))  # Pass through LSTM
    
        # return lstm_out, hidden  

In [21]:
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim, hidden_dim)  # Project encoder outputs
        self.hidden_proj = nn.Linear(hidden_dim, hidden_dim)  # Project decoder hidden state
        self.energy = nn.Linear(hidden_dim, 1, bias=False)  # Energy computation

    def forward(self, hidden, encoder_outputs):
        """
        hidden: (num_layers, batch_size, hidden_dim)
        encoder_outputs: (batch_size, seq_len, hidden_dim)
        """
        hidden = self.hidden_proj(hidden[-1]).squeeze(0).unsqueeze(1)  # Ensure (batch_size, 1, hidden_dim)

        attn_applied = self.attn(encoder_outputs)  # Transform encoder outputs

        scores = self.energy(torch.tanh(attn_applied + hidden))  # Compute alignment scores

        attn_weights = torch.softmax(scores.squeeze(2), dim=1)  # (batch_size, seq_len)

        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)  # (batch_size, 1, hidden_dim)

        return context.squeeze(1), attn_weights


In [22]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, sos_token, eos_token, device='cpu'):
        super().__init__()
        self.device = device
        self.sos_token = sos_token
        self.eos_token = eos_token
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim + hidden_dim, hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim * 2, output_dim)
        self.attention = Attention(hidden_dim)

    def forward(self, input_char, hidden, encoder_outputs):
        """
        input_char: (batch_size,)
        hidden: (num_layers, batch_size, hidden_dim)
        encoder_outputs: (batch_size, seq_len, hidden_dim)
        """

        embedded = self.embedding(input_char).unsqueeze(1)  # (batch_size, 1, emb_dim)

        # Compute attention
        context, attn_weights = self.attention(hidden, encoder_outputs)  # (batch_size, hidden_dim)

        # Concatenate context with input
        lstm_input = torch.cat((embedded, context.unsqueeze(1)), dim=2)  # (batch_size, 1, emb_dim + hidden_dim)
        output, hidden = self.lstm(lstm_input, hidden)

        # Final output
        output = self.fc_out(torch.cat((output.squeeze(1), context), dim=1))  # (batch_size, output_dim)

        return output, hidden

    def predict(self, encoder_outputs, hidden, max_len=20):
        """ Decoder step during inference without teacher forcing """
        input_char = torch.tensor([self.sos_token]).to(self.device)  # Starting with <SOS> token
        
        predictions = []
        
        for _ in range(max_len):
            # Get the embedding of the input token
            embedded = self.embedding(input_char).unsqueeze(0)  # (1, 1, emb_dim)
            
            # Compute the context vector from the attention mechanism
            context, attn_weights = self.attention(hidden[0], encoder_outputs)  # (batch_size, hidden_dim)
            
            # Concatenate the context with the embedded input
            lstm_input = torch.cat((embedded, context.unsqueeze(1)), dim=2)  # (1, 1, emb_dim + hidden_dim)
            
            # Perform the LSTM forward pass
            output, hidden = self.lstm(lstm_input, hidden)
            
            # Predict the next token from the output
            output = self.fc_out(output.squeeze(1))  # (batch_size, vocab_size)
            pred_token = output.argmax(dim=1).item()  # Get the predicted token (most likely one)
            
            # Append the predicted token to the predictions list
            predictions.append(pred_token)
            
            # If the EOS token is predicted, stop the generation
            if pred_token == self.eos_token:
                break
            
            # The predicted token becomes the input for the next timestep
            input_char = torch.tensor([pred_token]).to(self.device)
        
        return predictions


In [23]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, sos_token, eos_token, max_len=20):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.sos_token = sos_token
        self.eos_token = eos_token
        self.max_len = max_len

    def forward(self, root_seq, feature, target_seq, teacher_forcing_ratio=0.5):
        """ Training mode (uses teacher forcing) """
        encoder_outputs, hidden = self.encoder(root_seq, feature)
        outputs = []

        input_char = target_seq[:, 0]  # First input is the <SOS> token
        for t in range(1, target_seq.shape[1]):
            output, hidden = self.decoder(input_char, hidden, encoder_outputs)
            outputs.append(output.unsqueeze(1))

            # Decide whether to use teacher forcing
            use_teacher_forcing = torch.rand(1).item() < teacher_forcing_ratio
            input_char = target_seq[:, t] if use_teacher_forcing else output.argmax(1)

        return torch.cat(outputs, dim=1)

    def predict(self, root_seq, feature, max_len=None):
        """ Inference mode (no teacher forcing) """
        self.eval()  # Set the model to evaluation mode
        encoder_outputs, hidden = self.encoder(root_seq, feature)
        
        # Initialize the input to the decoder with the <SOS> token
        input_char = torch.full((root_seq.size(0), 1), self.sos_token, dtype=torch.long).to(self.device)
        predicted_sequence = []

        # Use the decoder to predict the next token step-by-step
        for t in range(self.max_len):
            output, hidden = self.decoder.predict(input_char, hidden, encoder_outputs)
            
            # Get the predicted token (argmax gives the most likely token)
            predicted_token = output.argmax(2)  # Assuming output is (batch_size, 1, vocab_size)
            predicted_sequence.append(predicted_token)

            # Stop predicting if the <EOS> token is reached
            if predicted_token.item() == self.eos_token:
                break

            # Update input for the next time step (use predicted token)
            input_char = predicted_token

        # Concatenate the predicted tokens
        predicted_sequence = torch.cat(predicted_sequence, dim=1)
        return predicted_sequence

In [24]:
import torch

def train_model(dataloader, model, optimizer, criterion, valid_loader, num_epochs=20, save_path="best_model.pth"):
    model.train()  # Ensure the model is in training mode
    best_val_loss = float('inf')  # Initialize the best validation loss to a large number
    patience_counter = 0  # To implement early stopping (optional)

    for epoch in range(num_epochs):
        # Training loop
        for root_seq, feature, target_seq in dataloader:
            optimizer.zero_grad()
            
            output = model(root_seq, feature, target_seq)
            loss = criterion(output.view(-1, output.shape[-1]), target_seq[:, 1:].contiguous().view(-1))
            
            loss.backward()
            optimizer.step()

        # Validation loop
        val_loss = validate(model, valid_loader, criterion)
        print(f"Epoch {epoch+1}, Training Loss: {loss.item():.4f}, Validation Loss: {val_loss:.4f}")
        
        # Save the model if validation loss improves
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0  # Reset patience counter if validation improves
            torch.save(model.state_dict(), save_path)  # Save the model at the best validation step
            print(f"Model saved at epoch {epoch+1}")
        else:
            patience_counter += 1
        
        # Early stopping (optional)
        if patience_counter >= 5:
            print("Early stopping triggered.")
            break

# Helper validation function
def validate(model, valid_loader, criterion):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    with torch.no_grad():  # No gradients needed for validation
        for root_seq, feature, target_seq in valid_loader:
            output = model(root_seq, feature, target_seq)
            loss = criterion(output.view(-1, output.shape[-1]), target_seq[:, 1:].contiguous().view(-1))
            total_loss += loss.item()
    avg_val_loss = total_loss / len(valid_loader)
    model.train()  # Return the model to training mode after validation
    return avg_val_loss


In [25]:
def predict_and_save(seq2seq_model, dataloader, vocab, output_file, max_len=10):
    """
    Generates the predicted inflected forms for data in the dataloader and saves to a file.
    
    Parameters:
    - seq2seq_model: The trained Seq2Seq model.
    - dataloader: DataLoader containing test data with root sequences and features.
    - vocab: Vocabulary mapping for converting token indices to characters.
    - max_len: Maximum length of the generated sequence.
    - output_file: File path to save the predicted inflected forms.
    """
    seq2seq_model.eval()
    device = seq2seq_model.device
    
    with open(output_file, "w") as f:
        for root_seq, feature in dataloader:
            root_seq, feature = root_seq.to(device), feature.to(device)
            batch_size = root_seq.shape[0]
            
            # Encode the input
            with torch.no_grad():
                encoder_outputs, hidden = seq2seq_model.encoder(root_seq, feature)
            
            # Initialize decoder input with <SOS> token
            input_char = torch.full((batch_size,), seq2seq_model.sos_token, dtype=torch.long, device=device)
            
            predicted_seq = [[] for _ in range(batch_size)]
            
            for t in range(max_len):
                with torch.no_grad():
                    output, hidden = seq2seq_model.decoder(input_char, hidden, encoder_outputs)
                
                predicted_char = output.argmax(1)  # Get the most likely character index
                
                for i in range(batch_size):
                    if predicted_char[i].item() == seq2seq_model.eos_token:
                        continue  # Stop predicting for this sequence if <EOS> is reached
                    predicted_seq[i].append(predicted_char[i].item())
                
                input_char = predicted_char  # Next input is the predicted character
            
            # Convert predicted indices to characters and save to file
            for seq in predicted_seq:
                predicted_form = ''.join([vocab.idx2char[idx] for idx in seq])
                f.write(predicted_form + "\n")


In [26]:
languages = ['xty', 'kbd', 'swc']

for language in languages:
    vocab, train_dataloader, dev_dataloader, test_dataloader = prepare_dataset(language, batch=128)

    # Initialize model, optimizer, and loss function
    encoder = Encoder(input_dim=len(vocab.char2idx), emb_dim=256, hidden_dim=256, feature_dim=len(vocab.feature2idx))
    decoder = Decoder(output_dim=len(vocab.char2idx), emb_dim=256, hidden_dim=256, sos_token=vocab.char2idx["<SOS>"], eos_token=vocab.char2idx["<EOS>"])
    model = Seq2Seq(encoder, decoder, device="cpu", sos_token=vocab.char2idx["<SOS>"], eos_token=vocab.char2idx["<EOS>"])

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    train_model(train_dataloader, model, optimizer, criterion, dev_dataloader)

    # Load the best model and predict on the test set
    model.load_state_dict(torch.load("best_model.pth", map_location="cpu"))
    predict_and_save(model, test_dataloader, vocab, output_file=language+'.txt')

Epoch 1, Training Loss: 2.1590, Validation Loss: 2.1171
Model saved at epoch 1
Epoch 2, Training Loss: 1.2239, Validation Loss: 1.4436
Model saved at epoch 2
Epoch 3, Training Loss: 0.8431, Validation Loss: 0.8481
Model saved at epoch 3
Epoch 4, Training Loss: 0.3240, Validation Loss: 0.6240
Model saved at epoch 4
Epoch 5, Training Loss: 0.1805, Validation Loss: 0.5514
Model saved at epoch 5
Epoch 6, Training Loss: 0.2519, Validation Loss: 0.4855
Model saved at epoch 6
Epoch 7, Training Loss: 0.1717, Validation Loss: 0.4704
Model saved at epoch 7
Epoch 8, Training Loss: 0.1322, Validation Loss: 0.4242
Model saved at epoch 8
Epoch 9, Training Loss: 0.0776, Validation Loss: 0.3752
Model saved at epoch 9
Epoch 10, Training Loss: 0.0950, Validation Loss: 0.3722
Model saved at epoch 10
Epoch 11, Training Loss: 0.0661, Validation Loss: 0.4193
Epoch 12, Training Loss: 0.0742, Validation Loss: 0.3745
Epoch 13, Training Loss: 0.0993, Validation Loss: 0.3974
Epoch 14, Training Loss: 0.1955, Vali