#<h1><center>**Natural Language Processing - XPOS MULTEXT East POS Project**</center></h1>
#<h1><center>**2023./2024.**</center></h1>
#<h2><center>*Grgur Živković, Mia Mužinić*</center></h2>


---


#<h1><center>**Model training**</center></h1>

## 1. Loading a preprocessed dataset



In [None]:
# Importing required libraries

import ast

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OneHotEncoder
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Mount Google Drive at /content/drive so the CSV files referenced by the
# notebook can be read (this import is only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Locations of the prepared train / validation / test CSV files on the mounted Drive
file_path_train = '/content/drive/MyDrive/NLP2024/hr500k-train.csv'
file_path_val = '/content/drive/MyDrive/NLP2024/hr500k-dev.csv'
file_path_test = '/content/drive/MyDrive/NLP2024/hr500k-test.csv'

# Read each split into its own DataFrame (same order as the paths above)
data_train, data_val, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (file_path_train, file_path_val, file_path_test)
)

# Display the first few rows as a quick check of the loaded data
data_train.head()

Unnamed: 0,tokens,xpos
0,"['Kazna', 'medijskom', 'mogulu', 'obnovila', '...","['Ncfsn', 'Agpmsdy', 'Ncmsd', 'Vmp-sf', 'Ncfsa..."
1,"['Neki', 'tvrde', 'da', 'je', 'presuda', 'Veli...","['Pi-mpn', 'Vmr3p', 'Cs', 'Var3s', 'Ncfsn', 'N..."
2,"['Medijski', 'mogul', 'Velija', 'Ramkovski', '...","['Agpmsny', 'Ncmsn', 'Npmsn', 'Npmsn', 'Appmsn..."
3,"['Kaznena', 'presuda', 'i', 'zatvorska', 'kazn...","['Agpfsny', 'Ncfsn', 'Cc', 'Agpfsny', 'Ncfsn',..."
4,"['Ramkovski', ',', 'bivši', 'vlasnik', 'televi...","['Npmsn', 'Z', 'Agpmsny', 'Ncmsn', 'Agpfsgy', ..."


In [None]:
def _parse_list_column(frame, column):
    """Convert a column of stringified Python lists into a list of real lists."""
    return list(map(ast.literal_eval, frame[column].tolist()))


# Token sequences (one list of words per sentence) for each split
X_train = _parse_list_column(data_train, 'tokens')
X_valid = _parse_list_column(data_val, 'tokens')
X_test = _parse_list_column(data_test, 'tokens')

# XPOS tag sequences (one list of tags per sentence) for each split
Y_train = _parse_list_column(data_train, 'xpos')
Y_valid = _parse_list_column(data_val, 'xpos')
Y_test = _parse_list_column(data_test, 'xpos')

## 2. Dataset class creation

In [None]:
def _build_index(splits):
    """
    Assign consecutive integer ids (starting at 0) to every distinct item,
    in the order in which the items are first encountered.

    Args:
    - splits (list): List of splits; each split is a list of sequences.

    Returns:
    - index (dict): Mapping from item to its integer id.
    """
    index = {}
    for split in splits:
        for sequence in split:
            for item in sequence:
                # len(index) is evaluated before insertion, so a new item
                # receives the next unused id; known items are left untouched
                index.setdefault(item, len(index))
    return index


# Word-to-index mapping built over the training, validation and test sentences
# NOTE(review): the mapping covers all three splits and reserves no entry for
# padding or unknown words - confirm this is intended.
word_to_idx = _build_index([X_train, X_valid, X_test])

# XPOS-tag-to-index mapping built over the tags of the same three splits
xpos_to_idx = _build_index([Y_train, Y_valid, Y_test])


In [None]:
# Number of unique words (size of the word-to-index mapping)
len(word_to_idx)

73456

In [None]:
# Number of unique XPOS tags (size of the tag-to-index mapping)
len(xpos_to_idx)

756

In [None]:
# Sanity check: show the first 11 word -> index pairs in insertion order
dict(list(word_to_idx.items())[:11])

{'Kazna': 0,
 'medijskom': 1,
 'mogulu': 2,
 'obnovila': 3,
 'raspravu': 4,
 'u': 5,
 'Makedoniji': 6,
 'Neki': 7,
 'tvrde': 8,
 'da': 9,
 'je': 10}

In [None]:
# Sanity check: show the first 11 XPOS tag -> index pairs in insertion order
# (the mapping is named xpos_to_idx; pos_to_idx is not defined in this notebook)
dict(list(xpos_to_idx.items())[:11])

{'Ncfsn': 0,
 'Agpmsdy': 1,
 'Ncmsd': 2,
 'Vmp-sf': 3,
 'Ncfsa': 4,
 'Sl': 5,
 'Npfsl': 6,
 'Pi-mpn': 7,
 'Vmr3p': 8,
 'Cs': 9,
 'Var3s': 10}

In [None]:
# Number of training sentences
len(X_train)

19791

In [None]:
# Number of validation sentences
len(X_valid)

2486

In [None]:
# Number of Train + Validation data points
len((X_train+X_valid))

22277

In [None]:
# Define a PyTorch Dataset

class CustomDataset(Dataset):
    """Sentence-level dataset that serves words and XPOS tags as integer indices."""

    def __init__(self, sentences, xpos_tags, word_to_idx, xpos_to_idx):
        """
        Keep references to the raw sequences and to the lookup tables.

        Args:
        - sentences (list): One list of words per sentence.
        - xpos_tags (list): One list of XPOS tags per sentence.
        - word_to_idx (dict): Lookup table from word to integer index.
        - xpos_to_idx (dict): Lookup table from XPOS tag to integer index.
        """
        self.sentences, self.xpos_tags = sentences, xpos_tags
        self.word_to_idx, self.xpos_to_idx = word_to_idx, xpos_to_idx

    def __len__(self):
        """Number of sentences held by the dataset."""
        return len(self.sentences)

    @staticmethod
    def _encode(sequence, mapping):
        """Look every item of `sequence` up in `mapping` (KeyError if missing)."""
        encoded = []
        for item in sequence:
            encoded.append(mapping[item])
        return encoded

    def __getitem__(self, idx):
        """
        Fetch one sentence and its tags, both converted to integer indices.

        Args:
        - idx (int): Position of the sentence in the dataset.

        Returns:
        - tuple: (word_indices, xpos_indices), two lists of ints.
        """
        # Words are encoded first, then the tags of the same sentence
        word_indices = self._encode(self.sentences[idx], self.word_to_idx)
        xpos_indices = self._encode(self.xpos_tags[idx], self.xpos_to_idx)
        return word_indices, xpos_indices

In [None]:
# Create the training dataset: training sentences and tags, encoded with the shared lookup tables
dataset_train = CustomDataset(X_train, Y_train, word_to_idx, xpos_to_idx)

In [None]:
# Create the validation dataset with the same lookup tables as the training dataset
dataset_val = CustomDataset(X_valid, Y_valid, word_to_idx, xpos_to_idx)

## 3. Embedding

In [None]:
def collate_fn(batch, word_pad_value=0, tag_pad_value=-100):
    """
    Merge a list of (word_indices, xpos_indices) samples into two padded tensors.

    Sequences are right-padded to the length of the longest sentence in the batch.
    Tag padding uses -100 by default, which is the default `ignore_index` of
    nn.CrossEntropyLoss, so padded positions contribute nothing to that loss.
    (Padding tags with 0 would make them indistinguishable from the real tag
    that has index 0.) Any metric computed on the returned tags should skip
    positions equal to `tag_pad_value` as well.

    Args:
    - batch (list): List of (word_indices, xpos_indices) tuples of int lists.
    - word_pad_value (int): Fill value for padded word positions.
    - tag_pad_value (int): Fill value for padded tag positions.

    Returns:
    - padded_word_indices (Tensor): LongTensor of shape (batch, max_len).
    - padded_xpos_indices (Tensor): LongTensor of shape (batch, max_len).
    """
    # Separate word indices and XPOS tag indices
    word_indices, xpos_indices = zip(*batch)

    # Explicit dtype: torch.tensor([]) would otherwise default to float32
    word_tensors = [torch.tensor(seq, dtype=torch.long) for seq in word_indices]
    xpos_tensors = [torch.tensor(seq, dtype=torch.long) for seq in xpos_indices]

    # Pad sequences to the same length within the batch
    padded_word_indices = pad_sequence(word_tensors, batch_first=True, padding_value=word_pad_value)
    padded_xpos_indices = pad_sequence(xpos_tensors, batch_first=True, padding_value=tag_pad_value)

    # Return padded sequences of word indices and XPOS tag indices
    return padded_word_indices, padded_xpos_indices

In [None]:
# Create DataLoader

# Define the batch size
batch_size = 32

# Training batches are reshuffled every epoch
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Validation is evaluation only: shuffling brings no benefit there, so the
# batches are served in a fixed order
dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

## 4. Model building

In [None]:
# Define the RNN model
class RNNClassifier(nn.Module):
    """Embedding -> single-layer RNN -> linear layer, producing one score vector per token."""

    def __init__(self, input_size, hidden_size, output_size, embedding_dim=100):
        """
        Initializes the RNNClassifier module.

        Args:
        - input_size (int): Size of the input vocabulary.
        - hidden_size (int): Size of the hidden state of the RNN.
        - output_size (int): Size of the output (number of classes).
        - embedding_dim (int): Size of the word embedding vectors. Taken as an
          argument (default 100) so the class does not depend on a global
          variable that has to be defined before instantiation.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """
        Forward pass of the RNNClassifier module.

        Args:
        - x (Tensor): Integer tensor of word indices, shape (batch, seq_len).

        Returns:
        - output (Tensor): Unnormalised class scores, shape (batch, seq_len, output_size).
        """
        # Embed the input sequences
        embedded = self.embedding(x)
        # Run the RNN; only the per-step outputs are needed, not the final hidden state
        output, _ = self.rnn(embedded)
        # Project every time step onto the tag classes
        return self.fc(output)

In [None]:
# Sizes derived from the data
vocab_size = len(word_to_idx)
num_pos_tags = len(xpos_to_idx)

# Tunable sizes: width of the word embeddings and of the RNN hidden state
embedding_dim = 100
hidden_size = 128

# Constructor arguments: one input id per vocabulary word, one output class per XPOS tag
input_size = vocab_size
output_size = num_pos_tags

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Build the RNN model, then move its parameters to the selected device
rnn_model = RNNClassifier(input_size, hidden_size, output_size)
rnn_model = rnn_model.to(device)

## 5. Model training

In [None]:
# Step size used by the optimizer
learning_rate = 0.001

# Cross-entropy over the tag classes. ignore_index keeps its default (-100):
# target positions holding that value contribute nothing to the loss.
criterion = nn.CrossEntropyLoss()

# Adam updates every parameter of the RNN model
optimizer = optim.Adam(params=rnn_model.parameters(), lr=learning_rate)

In [None]:
# Define the number of epochs
num_epochs = 10

# Target value that the loss skips (-100 unless the criterion says otherwise).
# The same positions are left out of the accuracy so both metrics describe the
# same set of tokens; if no target carries this value, nothing is masked.
ignore_index = getattr(criterion, 'ignore_index', -100)


def _count_correct(logits, targets, ignore_index):
    """
    Count correct predictions over the target positions that are not ignored.

    Args:
    - logits (Tensor): Class scores of shape (batch, seq_len, num_classes).
    - targets (Tensor): Gold class indices of shape (batch, seq_len).
    - ignore_index (int): Target value marking positions to leave out.

    Returns:
    - tuple: (number of correct predictions, number of counted positions).
    """
    counted = targets != ignore_index
    predicted = logits.argmax(dim=2)
    correct = ((predicted == targets) & counted).sum().item()
    return correct, counted.sum().item()


# Training loop
for epoch in range(num_epochs):
    # Set the model to train mode
    rnn_model.train()

    # Running totals for this epoch
    loss_sum = 0.0
    correct_train = 0
    total_train = 0

    # Iterate over the training dataset
    for inputs, targets in dataloader_train:
        # Move inputs and targets to the device
        inputs, targets = inputs.to(device), targets.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = rnn_model(inputs)

        # Flatten (batch, seq_len, classes) -> (batch * seq_len, classes) for the loss
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))

        # Backward pass
        loss.backward()

        # Update the parameters
        optimizer.step()

        # Accumulate accuracy counts over the positions that are not ignored
        batch_correct, batch_total = _count_correct(outputs.detach(), targets, ignore_index)
        correct_train += batch_correct
        total_train += batch_total

        # The criterion returns a per-token mean; weight it by the token count
        # so the epoch figure is a mean over tokens, not over batches
        loss_sum += loss.item() * batch_total

    # Epoch-level training metrics (guard against an empty loader)
    train_loss = loss_sum / max(total_train, 1)
    train_accuracy = correct_train / max(total_train, 1)

    # Print the mean training loss and accuracy for the current epoch
    # (not just the loss of the last batch)
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

    # Validation loop
    rnn_model.eval()  # Set the model to evaluation mode
    correct_val = 0
    total_val = 0

    # Disable gradient calculation to save memory and computation
    with torch.no_grad():
        for val_inputs, val_targets in dataloader_val:
            val_inputs, val_targets = val_inputs.to(device), val_targets.to(device)
            val_outputs = rnn_model(val_inputs)
            batch_correct, batch_total = _count_correct(val_outputs, val_targets, ignore_index)
            correct_val += batch_correct
            total_val += batch_total

    # Calculate validation accuracy
    val_accuracy = correct_val / max(total_val, 1)

    # Print validation accuracy for the current epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Validation Accuracy: {val_accuracy:.4f}")


Epoch 1/10, Training Loss: 0.1908, Training Accuracy: 0.9628
Epoch 1/10, Validation Accuracy: 0.9103
Epoch 2/10, Training Loss: 0.1497, Training Accuracy: 0.9688
Epoch 2/10, Validation Accuracy: 0.9126
Epoch 3/10, Training Loss: 0.1346, Training Accuracy: 0.9740
Epoch 3/10, Validation Accuracy: 0.9142
Epoch 4/10, Training Loss: 0.1065, Training Accuracy: 0.9786
Epoch 4/10, Validation Accuracy: 0.9123
Epoch 5/10, Training Loss: 0.0966, Training Accuracy: 0.9823
Epoch 5/10, Validation Accuracy: 0.9155
Epoch 6/10, Training Loss: 0.0670, Training Accuracy: 0.9853
Epoch 6/10, Validation Accuracy: 0.9138
Epoch 7/10, Training Loss: 0.0280, Training Accuracy: 0.9881
Epoch 7/10, Validation Accuracy: 0.9147
Epoch 8/10, Training Loss: 0.0248, Training Accuracy: 0.9902
Epoch 8/10, Validation Accuracy: 0.9146
Epoch 9/10, Training Loss: 0.0333, Training Accuracy: 0.9918
Epoch 9/10, Validation Accuracy: 0.9126
Epoch 10/10, Training Loss: 0.0242, Training Accuracy: 0.9933
Epoch 10/10, Validation Accur

In [None]:
# Destination of the saved model on the mounted Drive
model_save_path = '/content/drive/MyDrive/NLP2024/entire_rnn_model.h5'

# Save the whole model object (architecture and weights together).
# NOTE(review): torch.save pickles the object, so the file is not an HDF5
# file despite the .h5 extension, and loading it back needs the
# RNNClassifier class to be available.
torch.save(rnn_model, model_save_path)