### Data Loading and Tokenization

In [1]:
# Bootstrap uv; the next cell uses it (`uv pip install ...`) to install the notebook's dependencies.
!pip install uv



In [2]:
# Install requirement libraries, packages
!uv pip install datasets
!uv pip install conllu
!uv pip install torchviz

!uv pip install wandb
!uv pip install ufal.chu-liu-edmonds


[2mUsing Python 3.11.11 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 123ms[0m[0m
[2mUsing Python 3.11.11 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 322ms[0m[0m
[2mUsing Python 3.11.11 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 471ms[0m[0m
[2mUsing Python 3.11.11 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 186ms[0m[0m
[2mUsing Python 3.11.11 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 257ms[0m[0m


In [3]:
# Notebook entry point: bring in the project modules and pick the compute device.
import torch
from datasets import load_dataset

from config import (
    DATASET_PATH,
    BASQUE_DATASET_NAME,
    ENGLISH_DATASET_NAME,
    EXPERIMENT_NAME,
    RELATION_NUM,
    HIDDEN_DIM,
    OUTPUT_DIM,
    SAVED_MODEL_NAME,
)
from data import (
    dataset_reading_and_encoding,
    print_first_batch,
    tokenize_and_align_labels,
    explore_some_data,
)
from models import model_initializing
from utils import count_parameters
from train import train, train_extended_models

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Fetch the English dataset, run it through the project's reading/encoding step,
# and show the tensor shapes of the first training batch as a sanity check.
dataset = load_dataset(
    path=DATASET_PATH,
    name=ENGLISH_DATASET_NAME,
    trust_remote_code=True,  # the dataset is served through a loading script
)
data = dataset_reading_and_encoding(dataset)
print_first_batch(data["train"])

README.md: 0.00B [00:00, ?B/s]

universal_dependencies.py: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12543 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2002 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2077 [00:00<?, ? examples/s]

First Batch:
input_ids shape: torch.Size([32, 200])
attention_mask shape: torch.Size([32, 200])
head shape: torch.Size([32, 200])
deprel_ids shape: torch.Size([32, 200])


In [5]:
# Build the base model, report its parameter counts, then train it on the English data.
base_model = model_initializing(
    "base",
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    relation_num=RELATION_NUM,
)
count_parameters(base_model)
# save_model=True is forwarded to train(); presumably it persists the trained weights —
# confirm in train.py.
base_model = train(base_model, data, EXPERIMENT_NAME, save_model=True)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Total parameters: 279,346,968
Trainable parameters: 29,654,808


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mghta00001[0m ([33mghta00001-university-of-saarland[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/1: 100%|██████████| 392/392 [03:38<00:00,  1.79it/s]


Epoch 1, Training Loss: 1.7614
Epoch 1, Validation UAS: 0.8842, LAS: 0.8471
Unlabeled Attachment Score (UAS) with MST: 0.8241


0,1
UAS_MST,▁
epoch,▁
training_loss,▁
validation_LAS,▁
validation_UAS,▁

0,1
UAS_MST,0.82406
epoch,1.0
training_loss,1.76138
validation_LAS,0.84711
validation_UAS,0.88425


In [5]:
# Load the Basque dataset through the same pipeline used for English and
# show the tensor shapes of its first training batch.
basque_dataset = load_dataset(
    path=DATASET_PATH,
    name=BASQUE_DATASET_NAME,
    trust_remote_code=True,
)
basque_data = dataset_reading_and_encoding(basque_dataset)
print_first_batch(basque_data["train"])

First Batch:
input_ids shape: torch.Size([32, 117])
attention_mask shape: torch.Size([32, 117])
head shape: torch.Size([32, 117])
deprel_ids shape: torch.Size([32, 117])


In [6]:
# Train the "pfeiffer" extended (adapter) model variant on the Basque data.
pfeiffer_model = train_extended_models(
    extended_model_name="pfeiffer",
    experiment_name="Basque_Adapter_Experiment",
    dataset=basque_data,
)

Total parameters: 280,536,600
Trainable parameters: 1,189,632


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mghta00001[0m ([33mghta00001-university-of-saarland[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/1: 100%|██████████| 169/169 [01:02<00:00,  2.70it/s]


Epoch 1, Training Loss: 2.3277
Epoch 1, Validation UAS: 0.7218, LAS: 0.6280
Unlabeled Attachment Score (UAS) with MST: 0.6788


0,1
UAS_MST,▁
epoch,▁
training_loss,▁
validation_LAS,▁
validation_UAS,▁

0,1
UAS_MST,0.67879
epoch,1.0
training_loss,2.32774
validation_LAS,0.62801
validation_UAS,0.72177


In [8]:
# The Basque dataset was already loaded and encoded by the earlier Basque loading cell
# (`basque_dataset` / `basque_data`); reuse those objects instead of fetching and
# re-encoding the same data a second time.
# NOTE(review): assumes the adapter training run in between does not modify
# `basque_data` — confirm against train_extended_models.
print_first_batch(basque_data["train"])

First Batch:
input_ids shape: torch.Size([32, 117])
attention_mask shape: torch.Size([32, 117])
head shape: torch.Size([32, 117])
deprel_ids shape: torch.Size([32, 117])


In [9]:
# Report how many batches the encoded Basque training split has and how many
# examples the raw training split contains.
num_train_batches = len(basque_data["train"])
num_train_examples = len(basque_dataset["train"])
print(f"Number of batches : {num_train_batches}")
print(f"Number of All train data : {num_train_examples}")


Number of batches : 169
Number of All train data : 5396


In [11]:
# Train the second extended (adapter) model variant on the Basque data.
# NOTE(review): "hously" is presumably a misspelling of the Houlsby adapter; the key is
# kept verbatim because train_extended_models (defined outside this notebook) receives it.
# NOTE(review): the recorded run of this cell scored far below the pfeiffer run
# (validation UAS 0.2886 vs 0.7218) — worth checking before comparing the two.
hously_model = train_extended_models(
    extended_model_name="hously",
    experiment_name="Basque_Adapter_Experiment",
    dataset=basque_data,
)


Total parameters: 281,726,232
Trainable parameters: 2,379,264


Epoch 1/1: 100%|██████████| 169/169 [01:08<00:00,  2.48it/s]


Epoch 1, Training Loss: 7.0629
Epoch 1, Validation UAS: 0.2886, LAS: 0.1194
Unlabeled Attachment Score (UAS) with MST: 0.2968


0,1
UAS_MST,▁
epoch,▁
training_loss,▁
validation_LAS,▁
validation_UAS,▁

0,1
UAS_MST,0.2968
epoch,1.0
training_loss,7.06289
validation_LAS,0.1194
validation_UAS,0.28861
