### Data Loading and Tokenization

In [1]:
# Install requirement libraries, packages
!pip install datasets
!pip install conllu
!pip install torchviz

!pip install wandb
!pip install ufal.chu-liu-edmonds


Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [None]:
import os  # For file and directory operations
from typing import Tuple, List  # For type annotations
import sys

import torch  # PyTorch for tensors and GPU support
from datasets import load_dataset, Dataset  # Hugging Face datasets library
from torch.utils.data import DataLoader  # For batching data
from transformers import XLMRobertaTokenizer, XLMRobertaTokenizerFast, XLMRobertaModel  # Hugging Face models and tokenizers
from transformers import AutoTokenizer

import conllu  # Handling CoNLL-U formatted data
from tqdm import tqdm  # For progress bar
import wandb
import numpy as np
from ufal.chu_liu_edmonds import chu_liu_edmonds

import torch.nn.functional as F

from models import InitialModel, ExtendedModelWithPfeiffer, ExtendedModelWithHoulsby

# Set device to GPU if available, otherwise CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:

# Load the fast XLM-Roberta tokenizer
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

# Dataset variables
DATASET_PATH = "universal-dependencies/universal_dependencies"
# DATASET_NAME = "eu_bdt"
DATASET_NAME = "en_ewt"

# Define project-specific paths and names
PROJECT_NAME = "TokenDependency"  # Name of the wandb project
EXPERIMENT_NAME = "DataDragon2"  # Name of the experiment

ALL_DEPRELS = [
                # these are the default UD dependency relations according to https://universaldependencies.org/u/dep/
                "acl", "acl:relcl", "advcl", "advcl:relcl", "advmod", "advmod:emph", "advmod:lmod", "amod", "appos",
               "aux", "aux:pass", "case", "cc", "cc:preconj", "ccomp", "clf", "compound", "compound:lvc",
               "compound:prt", "compound:redup", "compound:svc", "conj", "cop", "csubj", "csubj:outer",
               "csubj:pass", "dep", "det", "det:numgov", "det:nummod", "det:poss", "discourse", "dislocated",
               "expl", "expl:impers", "expl:pass", "expl:pv", "fixed", "flat", "flat:foreign", "flat:name",
               "goeswith", "iobj", "list", "mark", "nmod", "nmod:poss", "nmod:tmod", "nsubj", "nsubj:outer",
               "nsubj:pass", "nummod", "nummod:gov", "obj", "obl", "obl:agent", "obl:arg", "obl:lmod",
               "obl:tmod", "orphan", "parataxis", "punct", "reparandum", "root", "vocative", "xcomp",

                # we need some more for en_ewt
               "det:predet", "obl:npmod", "nmod:npmod"
               ]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [4]:
# Loading dataset from Huggingface
dataset = load_dataset(path=DATASET_PATH, name=DATASET_NAME, trust_remote_code=True)

# A map from dependency to id (id is literally the index) {key (deprels) : value(indexes)}
deprel_to_id = {deprel: idx for idx, deprel in enumerate(ALL_DEPRELS)}

# A map from id to dependency (id is literally the index) {value(indexes) : key (deprels)}
id_to_deprel = {idx: deprel for idx, deprel in enumerate(ALL_DEPRELS)}

README.md:   0%|          | 0.00/191k [00:00<?, ?B/s]

universal_dependencies.py:   0%|          | 0.00/87.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.94M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.63M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5396 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1798 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1799 [00:00<?, ? examples/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 5396
    })
    validation: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 1798
    })
    test: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 1799
    })
})

In [6]:
def strip_none_heads(examples, i):
    """
    Filters out tokens with "None" as their head annotation for a specific sentence.

    Args:
        examples (dict): Dataset containing 'tokens', 'head', and 'deprel' fields.
        i (int): Index of the sentence to process.

    Returns:
        zip: Iterator containing filtered tokens, heads, and dependency relations.
    """
    tokens = examples["tokens"][i]  # Extract tokens for the sentence.
    heads = examples["head"][i]  # Extract head annotations.
    deprels = examples["deprel"][i]  # Extract dependency relations.

    # Keep only entries where the head is not "None".
    non_none = [(t, h, d) for t, h, d in zip(tokens, heads, deprels) if h != "None"]
    return zip(*non_none)


def map_first_occurrence(nums):
    """
    Maps each unique number in a list to the index of its first occurrence.

    Args:
        nums (list[int]): List of numbers (can include `None`).

    Returns:
        dict: Mapping from unique numbers to their first occurrence indices.
    """
    seen = set()  # To track seen numbers.
    return {num: i for i, num in enumerate(nums) if num is not None and num not in seen and not seen.add(num)}


def pad_to_same_size(lists, padding_symbol):
    """
    Pads a collection of lists to the same length using a specified padding symbol.

    Args:
        lists (list[list]): Collection of lists to be padded.
        padding_symbol (any): Symbol used for padding.

    Returns:
        list[list]: Padded lists with uniform length.
    """
    maxlen = max([len(l) for l in lists])  # Find the maximum length of lists.
    return [l + (padding_symbol,) * (maxlen - len(l)) for l in lists]


def tokenize_and_align_labels(examples, skip_index=-100):
    """
    Tokenizes sentences and aligns dependency parsing labels with tokenized outputs.

    Args:
        examples (dict): Dataset containing 'tokens', 'head', and 'deprel' fields.
        skip_index (int): Value for ignoring tokens in loss calculations.

    Returns:
        dict: Tokenized inputs with aligned dependency parsing labels and additional metadata.
    """
    # Initialize lists for filtered tokens, heads, and dependency relations.
    examples_tokens, examples_heads, examples_deprels = [], [], []

    # Remove "None" heads and their annotations for each sentence.
    for sentence_id in range(len(examples["tokens"])):
        tt, hh, dd = strip_none_heads(examples, sentence_id)
        examples_tokens.append(tt)
        examples_heads.append(hh)
        examples_deprels.append(dd)

    # Tokenize the inputs using a pre-defined tokenizer.
    tokenized_inputs = tokenizer(
        examples_tokens, truncation=True, is_split_into_words=True, padding=True
    )

    remapped_heads, deprel_ids, tokens_representing_words, num_words = [], [], [], []
    maxlen_t2w = 0  # Track the max length for token-to-word mappings.

    for sentence_id, annotated_heads in enumerate(examples_heads):
        deprels = examples_deprels[sentence_id]
        word_ids = tokenized_inputs.word_ids(batch_index=sentence_id)

        # Map word positions to the first token of each word.
        word_pos_to_token_pos = map_first_occurrence(word_ids)

        previous_word_idx = None
        heads_here, deprel_ids_here, tokens_representing_word_here = [], [], [0]

        # Iterate over tokens to align labels.
        for sentence_position, word_idx in enumerate(word_ids):
            if word_idx is None:  # Special tokens (e.g., BOS, EOS).
                heads_here.append(skip_index)
                deprel_ids_here.append(skip_index)
            elif word_idx != previous_word_idx:  # First token of a new word.
                if annotated_heads[word_idx] == "None":
                    print("A 'None' head survived!")
                    sys.exit(0)

                # Map head word position to its first token.
                head_word_pos = int(annotated_heads[word_idx])
                head_token_pos = 0 if head_word_pos == 0 else word_pos_to_token_pos[head_word_pos - 1]

                heads_here.append(head_token_pos)
                deprel_ids_here.append(deprel_to_id[deprels[word_idx]])
                tokens_representing_word_here.append(sentence_position)
            else:  # Other tokens in the same word.
                heads_here.append(skip_index)
                deprel_ids_here.append(skip_index)

            previous_word_idx = word_idx

        remapped_heads.append(heads_here)
        deprel_ids.append(deprel_ids_here)
        tokens_representing_words.append(tokens_representing_word_here)
        num_words.append(len(tokens_representing_word_here))
        maxlen_t2w = max(maxlen_t2w, len(tokens_representing_word_here))

    # Pad token-to-word mappings to the same length.
    for t2w in tokens_representing_words:
        t2w += [-1] * (maxlen_t2w - len(t2w))

    # Add aligned labels and metadata to tokenized inputs.
    tokenized_inputs["head"] = remapped_heads
    tokenized_inputs["deprel_ids"] = deprel_ids
    tokenized_inputs["tokens_representing_words"] = tokens_representing_words
    tokenized_inputs["num_words"] = num_words
    tokenized_inputs["tokenid_to_wordid"] = [
        tokenized_inputs.word_ids(batch_index=i) for i in range(len(examples_heads))
    ]

    return tokenized_inputs


In [7]:
def explore_some_data(dataset, tokenized_inputs):
    """
    Explores tokenized inputs and prints RoBERTa tokens along with their heads,
    human-readable dependency relations (deprels), and mapping between words and tokens.

    Args:
        dataset (list[dict]): The dataset containing the original sentence tokens.
        tokenized_inputs (dict): Tokenized inputs with aligned labels and mappings.

    Returns:
        None: Prints information for each token in the first ten sentences.
    """

    # Extract input IDs for tokenized sentences
    input_ids = tokenized_inputs["input_ids"]

    # Convert token IDs to their corresponding RoBERTa tokens
    sentences_tokens_from_input_id = [tokenizer.convert_ids_to_tokens(input_ids[i]) for i in range(len(input_ids))]

    # Retrieve deprel IDs and map them to human-readable labels
    deprel_ids = tokenized_inputs["deprel_ids"]
    deprel_value_by_id = []
    for ids in deprel_ids:
        deprel_value_by_id.append([id_to_deprel.get(k, "None") for k in ids])

    # Extract head information and token-to-word mappings
    heads = tokenized_inputs["head"]
    tokenid_to_wordid = tokenized_inputs["tokenid_to_wordid"]

    # Map heads to their corresponding word IDs
    heads_words_list_ids = []
    for i in range(len(heads)):
        head = heads[i]
        token_inputs = tokenid_to_wordid[i]
        head_word_ids = [
            token_inputs[h] if h >= 0 else None  # Convert token head position to word head position
            for h in head
        ]
        heads_words_list_ids.append(head_word_ids)

    # Map head word IDs to their corresponding word tokens
    heads_words = []
    for i in range(len(heads_words_list_ids)):
        word_id_list = heads_words_list_ids[i]
        sentence_tokens = dataset[i]["tokens"]
        heads_words.append(
            [sentence_tokens[idx] if idx else None for idx in word_id_list]
        )

    # Map tokens to words for better understanding of token-to-word alignment
    token_mapping_to_words = []
    for i in range(len(sentences_tokens_from_input_id)):
        raw_sentence = dataset[i]["tokens"]
        tokenized_sentence = sentences_tokens_from_input_id[i]
        token_to_word_ids = tokenid_to_wordid[i]

        temp_token_to_word = {}
        for token_idx, token in enumerate(tokenized_sentence):
            word_id = token_to_word_ids[token_idx]
            if word_id:
                temp_token_to_word[token] = raw_sentence[word_id]
                continue
            temp_token_to_word[token] = None
        token_mapping_to_words.append(temp_token_to_word)

    # Extract all tokenized tokens and their corresponding words
    all_tokenized_tokens = [list(temp.keys()) for temp in token_mapping_to_words]
    all_raw_tokens = [list(temp.values()) for temp in token_mapping_to_words]

    # Print RoBERTa tokens along with head, deprel, and word mappings for each sentence
    for i in range(len(tokenized_inputs)):
        tokenized_tokens = all_tokenized_tokens[i]
        head = heads_words[i]
        deprel = deprel_value_by_id[i]
        words = all_raw_tokens[i]
        for j in range(len(words)):
            print(f"Token : {tokenized_tokens[j]:<10} -> Head: {str(head[j]) if head[j] is not None else 'N/A':<10} -> Deprel: {str(deprel[j]) if deprel[j] is not None else 'N/A':<10} -> Word: {str(words[j]) if words[j] is not None else 'N/A'}")
        print("----------------------------------------")


In [8]:
sample_tokenized_inputs = tokenize_and_align_labels(dataset["train"][:10])
explore_some_data(dataset["train"], sample_tokenized_inputs)

Token : <s>        -> Head: N/A        -> Deprel: None       -> Word: N/A
Token : ▁Gero      -> Head: jarri      -> Deprel: advmod     -> Word: N/A
Token : ▁          -> Head: masa       -> Deprel: punct      -> Word: .
Token : ,          -> Head: N/A        -> Deprel: None       -> Word: ,
Token : ▁lor       -> Head: masa       -> Deprel: advcl      -> Word: lortutako
Token : tutako     -> Head: N/A        -> Deprel: None       -> Word: lortutako
Token : ▁masa      -> Head: jarri      -> Deprel: obj        -> Word: masa
Token : ▁molde     -> Head: jarri      -> Deprel: obl        -> Word: molde
Token : ▁batean    -> Head: molde      -> Deprel: nummod     -> Word: batean
Token : ▁jarri     -> Head: N/A        -> Deprel: root       -> Word: jarri
Token : .          -> Head: jarri      -> Deprel: punct      -> Word: .
Token : </s>       -> Head: N/A        -> Deprel: None       -> Word: N/A
Token : <pad>      -> Head: N/A        -> Deprel: None       -> Word: N/A
------------------------

In [9]:
def dataset_reading_and_encoding(dataset):
    """
    This function processes a dataset of sentences and their dependency annotations,
    tokenizes them using the RoBERTa tokenizer, and formats the data for PyTorch DataLoaders.

    Args:
        dataset (dict): Dictionary containing different splits of the dataset (e.g., 'train', 'validation', 'test').

    Returns:
        dict: A dictionary where keys are dataset types ('train', 'validation', 'test')
              and values are DataLoaders containing tokenized and formatted data.
    """
    encoded_dataset_loader = {}

    for dataset_type, data in dataset.items():
        # Determine the number of sentences to use for each split
        if dataset_type == "test":
            index = 2000  # Use the first 2000 sentences for the test set
        if dataset_type == "train":
            index = 12500  # Use the first 12,500 sentences for the training set
        if dataset_type == "validation":
            index = 2000  # Use the first 2000 sentences for the validation set

        # Tokenize and align labels for the selected subset of sentences
        encoded_data = tokenize_and_align_labels(data[:index])

        # Prepare the tokenized data for PyTorch
        formatted_data = []
        for i in range(len(encoded_data["input_ids"])):
            # Convert each field in the tokenized data to PyTorch tensors
            formatted_data.append({
                "input_ids": torch.tensor(encoded_data["input_ids"][i], dtype=torch.long),  # Token IDs
                "attention_mask": torch.tensor(encoded_data["attention_mask"][i], dtype=torch.long),  # Attention mask
                "head": torch.tensor(encoded_data["head"][i], dtype=torch.long),  # Dependency heads
                "deprel_ids": torch.tensor(encoded_data["deprel_ids"][i], dtype=torch.long),  # Dependency relations
                "tokens_representing_words": torch.tensor(encoded_data["tokens_representing_words"][i], dtype=torch.long),
                "num_words": torch.tensor(encoded_data["num_words"][i], dtype=torch.long)

            })

        # Create a DataLoader for the tokenized data
        dataloader = DataLoader(formatted_data, batch_size=32, shuffle=True)
        # Add the DataLoader to the encoded dataset loader for the current split
        encoded_dataset_loader[dataset_type] = dataloader

    # Return the dictionary containing DataLoaders for all dataset splits
    return encoded_dataset_loader


Here we initialize the dataset and create the dataloaders, then print the first batch of trainset

In [10]:
def print_first_batch(dataloader):
    for batch in dataloader:
        print("First Batch:")
        print("input_ids shape:", batch["input_ids"].shape)
        print("attention_mask shape:", batch["attention_mask"].shape)
        print("head shape:", batch["head"].shape)
        print("deprel_ids shape:", batch["deprel_ids"].shape)
        break

# Initialize dataset and encoding
data = dataset_reading_and_encoding(dataset)
print_first_batch(data["train"])


First Batch:
input_ids shape: torch.Size([32, 117])
attention_mask shape: torch.Size([32, 117])
head shape: torch.Size([32, 117])
deprel_ids shape: torch.Size([32, 117])


### Neural Network Model

### Initialize model

In [21]:
# Model parameters
HIDDEN_DIM = 768
OUTPUT_DIM = 256  # Dimensionality of head and dependent representations
RELATION_NUM = len(ALL_DEPRELS)
SKIP_INDEX = -100
EPOCHS = 3
lr = 1e-4
BATCH_SIZE = 32
BASE_MODEL_NAME = "xlm-roberta-base"


In [12]:
class MyModel(torch.nn.Module):
    def __init__(self, HIDDEN_DIM, OUTPUT_DIM) -> None:

        super().__init__()

        # Load pretrained XLM-RoBERTa model
        self.xlm_roberta = XLMRobertaModel.from_pretrained(BASE_MODEL_NAME)

        # Define linear layers for head and dependent token representations
        self.h_head = torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM)  # Linear layer for head tokens
        self.relu1 = torch.nn.ReLU()  # ReLU activation for head tokens
        self.h_dep = torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM)  # Linear layer for dependent tokens
        self.relu2 = torch.nn.ReLU()  # ReLU activation for dependent tokens

        # Define parameters for the bilinear scoring function
        self.U1 = torch.nn.Parameter(torch.empty(OUTPUT_DIM, OUTPUT_DIM))  # Weight matrix
        self.u2 = torch.nn.Parameter(torch.empty(OUTPUT_DIM))  # Bias term

        # Initialize parameters using Xavier uniform and zero initialization
        torch.nn.init.xavier_uniform_(self.U1)
        torch.nn.init.zeros_(self.u2)

        # Freeze all layers of XLM-RoBERTa by default
        for param in self.xlm_roberta.parameters():
            param.requires_grad = False

        # Unfreeze the last N layers of XLM-RoBERTa for fine-tuning
        N = 4  # Number of layers to unfreeze
        total_layers = len(self.xlm_roberta.encoder.layer)
        for i in range(total_layers - N, total_layers):  # Unfreezing the last N layers of the encoder
            for name, param in self.xlm_roberta.encoder.layer[i].named_parameters():
                param.requires_grad = True  # Enable gradient updates for these layers

    def forward(self, input_ids, attention_mask):
        # Forward pass through XLM-RoBERTa
        roberta_output = self.xlm_roberta(input_ids=input_ids, attention_mask=attention_mask)

        # Extract the hidden states from the last layer of the encoder
        hidden_layer = roberta_output.last_hidden_state  # Shape: (batch_size, seq_len, hidden_dim)

        # Compute the head and dependent token representations
        H_head = self.h_head(hidden_layer)  # Shape: (batch_size, seq_len, output_dim)
        head_relu = self.relu1(H_head)  # Apply ReLU activation to head representations

        H_dep = self.h_dep(hidden_layer)  # Shape: (batch_size, seq_len, output_dim)
        dep_relu = self.relu2(H_dep)  # Apply ReLU activation to dependent representations

        # Compute the bilinear scores for token pair dependencies
        score = torch.matmul(
            torch.matmul(head_relu, self.U1), dep_relu.transpose(1, 2)  # Shape: (batch_size, seq_len, seq_len)
        )
        # Add a linear term for the scores
        score += torch.matmul(head_relu, self.u2).unsqueeze(-1)  # Shape: (batch_size, seq_len, seq_len)

        # Return the raw scores for dependencies
        return score


In [13]:

# Initialize the custom model
model = InitialModel(HIDDEN_DIM, OUTPUT_DIM, RELATION_NUM)
model.to(device)


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

InitialModel(
  (xlm_roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
   

### Count number of parameteres

In [14]:
### Count Parameters
def count_parameters(model):
    """
    Counts the total and trainable parameters in a given model.

    Args:
        model (torch.nn.Module): The PyTorch model whose parameters are to be counted.

    Returns:
        tuple: A tuple containing:
            - total_params (int): Total number of parameters in the model.
            - trainable_params (int): Number of trainable parameters in the model.
    """
    # Calculate the total number of parameters
    total_params = sum(p.numel() for p in model.parameters())

    # Calculate the number of trainable parameters (requires_grad=True)
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    # Print the counts in a readable format
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")

    return total_params, trainable_params



In [15]:
count_parameters(model)

Total parameters: 279,346,968
Trainable parameters: 29,654,808


(279346968, 29654808)

In [16]:
def evaluate(model, dataloader):
    """
    Evaluate the model on a dataset, computing UAS and LAS.

    Args:
        model (torch.nn.Module): The dependency parsing model.
        dataloader (torch.utils.data.DataLoader): DataLoader for validation/test data.

    Returns:
        dict: Dictionary containing 'UAS' (unlabeled attachment score) and 'LAS' (labeled attachment score).
    """
    model.eval()
    total_tokens = 0
    correct_heads = 0
    correct_heads_and_rels = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            head_labels = batch["head"].to(device)      # True head indices
            deprel_ids = batch["deprel_ids"].to(device) # True relation labels

            # Forward pass
            outputs = model(input_ids, attention_mask)
            arc_scores = outputs['arc_scores']  # Shape: (batch_size, seq_len, seq_len)
            rel_scores = outputs['rel_scores']  # Shape: (batch_size, seq_len, seq_len, num_relations)

            # Predict heads
            predicted_heads = torch.argmax(arc_scores, dim=2)  # Shape: (batch_size, seq_len)

            # Predict relations using predicted heads
            batch_size, seq_len = predicted_heads.shape
            batch_idx = torch.arange(batch_size)[:, None].expand(-1, seq_len).to(device)
            dep_idx = torch.arange(seq_len)[None, :].expand(batch_size, -1).to(device)
            rel_scores_for_predicted_heads = rel_scores[batch_idx, dep_idx, predicted_heads, :]  # Shape: (batch_size, seq_len, num_relations)
            predicted_rels = torch.argmax(rel_scores_for_predicted_heads, dim=2)  # Shape: (batch_size, seq_len)

            # Mask for valid tokens
            mask = head_labels != -100

            # Compute UAS
            correct_heads += torch.sum((predicted_heads == head_labels) & mask).item()

            # Compute LAS
            correct_heads_and_rels += torch.sum(
                (predicted_heads == head_labels) & (predicted_rels == deprel_ids) & mask
            ).item()

            # Total valid tokens
            total_tokens += torch.sum(mask).item()

    uas = correct_heads / total_tokens if total_tokens > 0 else 0.0
    las = correct_heads_and_rels / total_tokens if total_tokens > 0 else 0.0
    return {'UAS': uas, 'LAS': las}

The explanation is done in comments on start_wandb

In [17]:
def start_wandb(wandb_config=None):
    if not wandb_config:
        wandb_config = {
            "learning_rate": lr,
            "epochs": EPOCHS,
            "batch_size": BATCH_SIZE,
            "model": BASE_MODEL_NAME,
            "output_dim": OUTPUT_DIM,
            "num_relations": RELATION_NUM  # Added
        }

    wandb.init(
        project=PROJECT_NAME,
        name=EXPERIMENT_NAME,
        config=wandb_config
    )
    wandb.watch(model, log="all")

In [18]:
def mst_parsing(model, dataloader, device):
    """
    Perform MST parsing using the Chu-Liu-Edmonds algorithm on the model's predictions.

    Args:
        model (torch.nn.Module): The trained dependency parsing model.
        dataloader (DataLoader): DataLoader containing the dataset to evaluate.
        device (str): Device to perform computations on ('cuda' or 'cpu').

    Returns:
        float: Unlabeled Attachment Score (UAS) for the dataset.
    """
    model.eval()  # Set the model to evaluation mode
    total_tokens = 0  # Total number of valid tokens
    correct_tokens = 0  # Total number of correctly predicted heads

    with torch.no_grad():  # Disable gradient computation
        for batch in dataloader:
            # Move input tensors to the appropriate device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            head_labels = batch["head"].to(device)  # True head indices (batch_size, seq_len)
            tokens_representing_words = batch["tokens_representing_words"].to(device)  # Indices of first word tokens

            # Forward pass: compute scores (batch_size, seq_len, seq_len)
            scores = model(input_ids, attention_mask)

            # Apply log_softmax to convert scores to log probabilities
            scores_dict = model(input_ids, attention_mask)
            scores = scores_dict["arc_scores"]  # shape: (batch_size, seq_len, seq_len)

            log_probs = torch.log_softmax(scores, dim=2).cpu().numpy()

            # Iterate over each sentence in the batch
            for i in range(input_ids.shape[0]):
                # Get the indices of the first tokens for each word
                word_token_indices = tokens_representing_words[i].cpu().numpy()
                word_token_indices = word_token_indices[word_token_indices != -1]  # Remove padding

                # Filter the log probabilities to include only the first tokens of words
                filtered_log_probs = log_probs[i][word_token_indices][:, word_token_indices]

                # Convert the filtered log probabilities to double (float64)
                filtered_log_probs = filtered_log_probs.astype(np.float64)

                # Run the Chu-Liu-Edmonds algorithm to find the MST
                heads, temp = chu_liu_edmonds(filtered_log_probs)

                # Map the predicted heads back to the original token indices
                predicted_heads = word_token_indices[heads]

                # Compare the predicted heads with the gold-standard heads
                gold_heads = head_labels[i].cpu().numpy()
                gold_heads = gold_heads[word_token_indices]  # Filter gold heads to match word tokens

                # Count correct predictions
                correct_tokens += np.sum(predicted_heads == gold_heads)
                total_tokens += len(gold_heads)

    # Compute the Unlabeled Attachment Score (UAS)
    uas = correct_tokens / total_tokens if total_tokens > 0 else 0.0
    return uas

  # Evaluate the model on the validation set using MST parsing


In [23]:

# Initialize Weights & Biases (wandb) for experiment tracking
start_wandb()

# Lists to store training loss and validation accuracy for each epoch
val_accuracies = []
train_losses = []

# Fetch the dataloaders for training and validation sets
train_dataloader = data["train"]
val_loader = data["validation"]

# Define the loss function for multi-class classification
# Define loss functions for multi-class classification
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=SKIP_INDEX)  # For arc prediction
loss_fn_rel = torch.nn.CrossEntropyLoss(ignore_index=SKIP_INDEX)  # For relation prediction

# Define the optimizer with all relevant model parameters
optimizer = torch.optim.Adam([
    {'params': model.h_head.parameters()},        # Arc head linear layer
    {'params': model.h_dep.parameters()},         # Arc dependent linear layer
    {'params': model.U1},                         # Arc scoring matrix
    {'params': model.u2},                         # Arc scoring vector
    {'params': model.h_head_rel.parameters()},    # Relation head linear layer
    {'params': model.h_dep_rel.parameters()},     # Relation dependent linear layer
    {'params': model.U_rel},                      # Relation scoring tensor
    {'params': model.xlm_roberta.encoder.layer[-4:].parameters()}  # Last 4 encoder layers
], lr=1e-4)  # Learning rate

# Main training loop
for epoch in range(EPOCHS):
    model.train()
    epoch_loss_tensor = torch.zeros(1, device=device)

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{EPOCHS}"):
        # Move batch data to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        head_labels = batch['head'].to(device)        # Ground-truth head indices
        deprel_ids = batch['deprel_ids'].to(device)   # Ground-truth relation labels

        # Forward pass
        outputs = model(input_ids, attention_mask)
        arc_scores = outputs['arc_scores']  # Shape: (batch_size, seq_len, seq_len)
        rel_scores = outputs['rel_scores']  # Shape: (batch_size, seq_len, seq_len, num_relations)

        # Compute arc loss
        loss_arc = loss_fn(arc_scores.view(-1, arc_scores.shape[-1]), head_labels.view(-1))

        # Compute relation loss (using scores for true heads)
        batch_size, seq_len = head_labels.shape
        batch_idx = torch.arange(batch_size)[:, None].expand(-1, seq_len).to(device)
        dep_idx = torch.arange(seq_len)[None, :].expand(batch_size, -1).to(device)
        rel_scores_for_true_heads = rel_scores[batch_idx, dep_idx, head_labels, :]  # Shape: (batch_size, seq_len, num_relations)
        loss_rel = loss_fn_rel(rel_scores_for_true_heads.view(-1, RELATION_NUM), deprel_ids.view(-1))

        # Total loss
        loss = loss_arc + loss_rel
        epoch_loss_tensor += loss.detach()

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Compute average training loss
    epoch_loss = epoch_loss_tensor.item()
    avg_loss = epoch_loss / len(train_dataloader)
    train_losses.append(avg_loss)
    print(f"Epoch {epoch + 1}, Training Loss: {avg_loss:.4f}")

    # Validation
    end = epoch == (EPOCHS - 1)
    metrics = evaluate(model, val_loader)  # Updated evaluate function returns a dict
    val_uas = metrics['UAS']
    val_las = metrics['LAS']
    val_accuracies.append(val_uas)  # For backward compatibility, store UAS
    print(f"Epoch {epoch + 1}, Validation UAS: {val_uas:.4f}, LAS: {val_las:.4f}")

    uas_mst = mst_parsing(model, val_loader, device)  # Still computes UAS only
    print(f"Unlabeled Attachment Score (UAS) with MST: {uas_mst:.4f}")

    # Save model checkpoint every 10 epochs
    try:
        if (epoch + 1) % 10 == 0:
            checkpoint_path = f"model_epoch_{epoch + 1}.pt"
            torch.save(model.state_dict(), checkpoint_path)
    except Exception as e:
        print(f"Error saving model: {e}")

    # Log metrics to wandb
    wandb.log({
        "epoch": epoch + 1,
        "training_loss": avg_loss,
        "validation_UAS": val_uas,
        "validation_LAS": val_las,
        "UAS_MST": uas_mst
    })

# Finalize wandb run
wandb.finish()


Epoch 1/3: 100%|██████████| 169/169 [00:56<00:00,  3.01it/s]


Epoch 1, Training Loss: 0.7692
Epoch 1, Validation UAS: 0.8363, LAS: 0.7920
Unlabeled Attachment Score (UAS) with MST: 0.7820


Epoch 2/3: 100%|██████████| 169/169 [01:03<00:00,  2.68it/s]


Epoch 2, Training Loss: 0.6735
Epoch 2, Validation UAS: 0.8385, LAS: 0.7961
Unlabeled Attachment Score (UAS) with MST: 0.7820


Epoch 3/3: 100%|██████████| 169/169 [01:02<00:00,  2.69it/s]


Epoch 3, Training Loss: 0.5917
Epoch 3, Validation UAS: 0.8408, LAS: 0.7992
Unlabeled Attachment Score (UAS) with MST: 0.7843


[34m[1mwandb[0m: [32m[41mERROR[0m Control-C detected -- Run data was not synced


KeyboardInterrupt: 

### Testing

In [24]:
import os

def test_model(model, test_loader, device, checkpoint_path=None):
    """
    Evaluates the model on the test dataset, optionally loads a checkpoint, and logs the accuracy.

    Args:
        model (torch.nn.Module): The trained PyTorch model.
        test_loader (DataLoader): DataLoader for the test dataset.
        device (str): Device to perform computations on ('cuda' or 'cpu').
        checkpoint_path (str, optional): Path to the saved model checkpoint. If None, runs on the initialized model.

    Returns:
        float: Test accuracy of the model on the test dataset.
    """
    # Check if a checkpoint exists and load it
    if checkpoint_path and os.path.exists(checkpoint_path):
        print(f"Loading model checkpoint from {checkpoint_path}...")
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    # Move the model to the appropriate device
    model.to(device)
    model.eval()  # Set the model to evaluation mode

    # Evaluate the model on the test set
    test_accuracy = evaluate(model, test_loader)  # Assume the `evaluate` function is defined elsewhere
    test_uas = metrics['UAS']
    test_las = metrics['LAS']
    print(f"Test UAS: {val_uas:.4f}, LAS: {val_las:.4f}")

    uas = mst_parsing(model, test_loader, device)
    print(f"Unlabeled Attachment Score (UAS): {uas:.4f}")


    return test_accuracy


In [25]:
# Testing model
test_loader = data["test"]
test_acc = test_model(model, test_loader, device)

Test UAS: 0.8408, LAS: 0.7992
Unlabeled Attachment Score (UAS): 0.7942


TEST : 91.75 percent

UAS : percent


VALIDATION :

# NOTE
Comments with LLM

# BIG NOTE

It's like 10.10 and ***** Colab stopped :(((((
I'll send loss and accuracy figures in email.
but you can see results here.
Oh DAMNNNNNNNNNNNNNNNNNNNNNNNNN
