In [1]:

import torch
import gc
torch.cuda.empty_cache()
gc.collect()


0

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Initialize the console

# Step 2: Import necessary libraries
from typing import Optional, Dict, Any
import os
import torch
import pytorch_lightning as pl
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from rich.traceback import install
from pytorch_lightning.loggers import WandbLogger
import torch.nn as nn
import torch.optim as optim
from transformers import AutoConfig
from adapters import AutoAdapterModel, AdapterConfig
from adapters.composition import Stack
from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping
import torchmetrics

install(show_locals=True)

from setup import setup_src_path
print(setup_src_path())
import data.processed as processed
import config.config as config
import utils.setup as setup
import utils.functions as fn
from importlib import reload

from datasets import load_from_disk

print(config.Config.TXT_SAVE_PATH)
print(config.Config.MODEL_SAVE_PATH)

#dataset = load_from_disk(f"../{config.Config.DATASETS_SAVE_PATH}/datasets")


['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages', '/tmp/tmpyiligslw', '/home/guest/Desktop/projects/third-experiments/SDA_experiments/modules']


2024-09-28 11:13:21.764567: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-28 11:13:21.795613: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




./text-files/
./hp-model-


In [3]:
import torch
import os
import pytorch_lightning as pl
from transformers import  AutoConfig, DataCollatorForLanguageModeling
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import torchmetrics

class JointDomainTaskAdapter(pl.LightningModule):
    def __init__(self, hparams,source_dataset_length,target_dataset_length):
        super(JointDomainTaskAdapter, self).__init__()
        self.save_hyperparameters(hparams)

        # Load config with hidden states output
        self.config = AutoConfig.from_pretrained(self.hparams["pretrained_model_name"])
        self.config.output_hidden_states = True
        self.model = AutoAdapterModel.from_pretrained(self.hparams["pretrained_model_name"], config=self.config)

        # Set reduction factor and leave_out layers
        self.reduction_factor = self.hparams.get("reduction_factor", 16)
        self.leave_out = self.hparams.get("leave_out", [])
        # if self.leave_out != "None":
        #     self.leave_out = self.leave_out.split(",")
        #     self.leave_out = [int(i) for i in self.leave_out]
        # else:
        #     self.leave_out = []

        # Load MLM adapter with head
        self.model.load_adapter(f"{self.hparams['saved_adapter_dir']}/{self.hparams['domain_adapter_name']}", with_head=True)

        # Add classification head for the task
        self.model.add_classification_head(f"{self.hparams['task_adapter_name']}", num_labels=self.hparams["num_classes"])

        # Set active adapters
        self.model.train_adapter(self.hparams['domain_adapter_name'])
        # Calculate alpha based on dataset lengths
        self.alpha = source_dataset_length / (source_dataset_length + target_dataset_length)

        # Initialize loss functions and metrics
        self.criterion = nn.CrossEntropyLoss()
        self.mlm_criterion = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=self.hparams["num_classes"])
        self.f1 = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="weighted")
        self.f1_macro = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="macro")
        self.f1_micro = torchmetrics.F1Score(task='multiclass',num_classes=self.hparams["num_classes"], average="micro")

        self.softmax = nn.Softmax(dim=1)
        self.validation_outputs = []
        self.test_outputs = []
        # Optimizer related variables
        self.learning_rate = self.hparams.get("learning_rate", 1e-4)
        self.scheduler_factor = self.hparams.get("scheduler_factor", 0.1)
        self.scheduler_patience = self.hparams.get("scheduler_patience", 0.05)
        self.scheduler_threshold = self.hparams.get("scheduler_threshold", 0.0001)
        self.scheduler_cooldown = self.hparams.get("scheduler_cooldown", 0)
        self.scheduler_eps = self.hparams.get("scheduler_eps", 1e-8)

    def forward(self, input_ids, attention_mask=None, labels=None, task=None):
        if task == "mlm":
            self.model.active_head= self.hparams['domain_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        elif task == "classification":
            self.model.active_head= self.hparams['task_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        else:
            raise ValueError("Task must be either 'mlm' or 'classification'.")
        return outputs

    def training_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha

        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))

        metrics = {
            "train/accuracy": accuracy,
            "train/f1": f1,
            "train/taskclf_loss": task_loss,
            "train/loss": loss,
            "train/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return loss

    def validation_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha
        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        self.validation_outputs.append({
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
                })
        
        metrics = {
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return metrics
    def on_validation_epoch_start(self):
        self.validation_outputs = []
    
    def on_validation_epoch_end(self):
        outputs= self.validation_outputs

        avg_loss = torch.stack([x["val/loss"] for x in outputs]).mean()
        avg_task_loss = torch.stack([x["val/taskclf_loss"] for x in outputs]).mean()
        avg_mlm_loss = torch.stack([x["val/mlm_loss"] for x in outputs]).mean()
        avg_accuracy = torch.stack([x["val/accuracy"] for x in outputs]).mean()
        avg_f1 = torch.stack([x["val/f1"] for x in outputs]).mean()
        print(f"val/accuracy: {avg_accuracy}")
        print(f"val/f1: {avg_f1}")
        print(f"val/taskclf_loss: {avg_task_loss}")
        print(f"val/loss: {avg_loss}")
        print(f"val/mlm_loss: {avg_mlm_loss}")
        metrics = {
            "val/avg_loss": avg_loss,
            "val/avg_taskclf_loss": avg_task_loss,
            "val/avg_mlm_loss": avg_mlm_loss,
            "val/avg_accuracy": avg_accuracy,
            "val/avg_f1": avg_f1,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
        self.log("val_loss", avg_loss)

    def test_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        target_labels = batch["label_target"]

        # Classification task for source data
        cls_outputs_source = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits_source = cls_outputs_source.logits
        task_loss_source = self.criterion(cls_logits_source, source_labels)

        # Classification task for target data
        cls_outputs_target = self(input_ids=target_input_ids, attention_mask=target_attention_mask, task="classification")
        cls_logits_target = cls_outputs_target.logits
        task_loss_target = self.criterion(cls_logits_target, target_labels)

        # Combine losses (though typically you would evaluate them separately)
        loss = task_loss_source + task_loss_target

        accuracy_source = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_source = self.f1(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_macro_source = self.f1_macro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_micro_source = self.f1_micro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
    
        accuracy_target = self.accuracy(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_target = self.f1(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_macro_target = self.f1_macro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_micro_target = self.f1_micro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))

        metrics = {
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        self.test_outputs.append({
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        })
        return metrics
    def on_test_epoch_start(self):
        self.test_outputs = []

    def on_test_epoch_end(self):
        outputs=  self.test_outputs

        avg_loss_source = torch.stack([x["source_test/loss"] for x in outputs]).mean()
        avg_task_loss_target = torch.stack([x["target_test/loss"] for x in outputs]).mean()
        avg_accuracy_source = torch.stack([x["source_test/accuracy"] for x in outputs]).mean()
        avg_f1_source = torch.stack([x["source_test/f1"] for x in outputs]).mean()
        avg_f1_macro_source = torch.stack([x["source_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_source = torch.stack([x["source_test/f1_micro"] for x in outputs]).mean()

        avg_accuracy_target = torch.stack([x["target_test/accuracy"] for x in outputs]).mean()
        avg_f1_target = torch.stack([x["target_test/f1"] for x in outputs]).mean()
        avg_f1_macro_target = torch.stack([x["target_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_target = torch.stack([x["target_test/f1_micro"] for x in outputs]).mean()

        metrics = {
            "source_test/loss": avg_loss_source,
            "target_test/loss": avg_task_loss_target,
            "source_test/accuracy": avg_accuracy_source,
            "source_test/f1": avg_f1_source,
            "source_test/f1_macro": avg_f1_macro_source,
            "source_test/f1_micro": avg_f1_micro_source,
            "target_test/accuracy": avg_accuracy_target,
            "target_test/f1": avg_f1_target,
            "target_test/f1_macro": avg_f1_macro_target,
            "target_test/f1_micro": avg_f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
    def save_adapter(self, location, adapter_name):
        self.model.save_adapter(location, adapter_name)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams["learning_rate"])
        lr_scheduler = {
            'scheduler': optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.0001, cooldown=0, min_lr=1e-8),
            'monitor': 'val_loss'
        }
        return [optimizer], [lr_scheduler]


In [4]:
import wandb

wandb.login()
# Wandb setup and training loop
seeds = [10, 100, 1000]  # List of seeds
project_name = 'SDA_mixed_edited'  # Replace with your wandb project name
domain = 'AMR'  # Replace with the specific domain for this notebook
type = 'unipelt'  # Replace with the specific type for this notebook

# Initialize results dictionary
results = {
    "last_epoch": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "best_model": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "epoch_saved": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    }
}

best_val_loss = float('inf')
best_model = None
best_model_path = ""

[34m[1mwandb[0m: Currently logged in as: [33mmrawhani5[0m ([33mmrawhani[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
reload(processed)
for seed in seeds:
    #wandb.init(project=project_name, name=f'{domain}_{type}_run_with_seed_{seed}', config={'seed': seed})

    try:
        seed_everything(seed)

        hparams = {
            "source_target": "apparel_MR",
            "dataset_cache_dir": "./../../datasets",
            "pretrained_model_name": "bert-base-uncased",
            "padding": True,
            "source_domain": "apparel",
            "target_domain": "MR",
            "domain_adapter_name": "mlm_unipelt_mr",
            "task_adapter_name": "AMRPelt",
            "padding": "max_length",
            "max_seq_length": 512,
            "bsz": 32,
            "num_classes": 3,
            "learning_rate": 1e-4,
            "reduction_factor": 16,
            "mode": "domain",
            "saved_adapter_dir": "../saved/adapters",
        }

        save_dir = "checkpoints"
        save_epoch_3 = 6  # Save model at the 3rd epoch
        
        dm = processed.DataModuleSourceTargetMixed(hparams)
        dm.setup('fit')
        dm.setup("test")
        source_length, target_length = dm.get_dataset_lengths()
        print(f"Source dataset length: {source_length}")
        print(f"Target dataset length: {target_length}")
        model = JointDomainTaskAdapter(hparams,source_length,target_length)

        checkpoint_callback = ModelCheckpoint(
            filename="task-AMRPelt-{epoch:02d}-{val_loss:.2f}",
            save_top_k=1,
            monitor="val_loss",
            mode="min",
        )
        save_model_callback_epoch = ModelCheckpoint(
            filename="AMRPelt-{epoch:02d}",
            every_n_epochs=save_epoch_3,
            save_top_k=-1,
        )

        #wandb_logger = WandbLogger()

    except Exception as e:
        print(f"Error during preprocessing : {e}")

    try:
        train_loader = dm.train_dataloader()
        val_loader = dm.val_dataloader()
        trainer = Trainer(
            max_epochs=10,
            accelerator="auto",
            #precision=16,
            
            default_root_dir="checkpoints",
            #logger=wandb_logger,
            callbacks=[checkpoint_callback, save_model_callback_epoch],
            limit_train_batches=1.0,
            limit_val_batches=1.0,
            limit_test_batches=1.0,
        )

        trainer.fit(model, train_loader, val_loader)
        print(f"Best checkpoint path: {checkpoint_callback.best_model_path}")
        print(f"Saved epoch checkpoint path: {save_model_callback_epoch.best_model_path}")
    except Exception as e:
        print(f"Error during training : {e}")

    try:
        dm.setup("test")
        test_loader = dm.test_dataloader()
        test_results_last = trainer.test(model, test_loader)
        print("Test Results Last Epoch:", test_results_last)

        # Collect results for last epoch model
        for key, value in test_results_last[0].items():
            results["last_epoch"][key].append(value)

        # Paths to the saved checkpoints
        best_checkpoint_path = checkpoint_callback.best_model_path
        saved_epoch_checkpoint_path = save_model_callback_epoch.best_model_path
        print(f"Best checkpoint path: {best_checkpoint_path}")
        print(f"Saved epoch checkpoint path: {saved_epoch_checkpoint_path}")

        best_model = JointDomainTaskAdapter.load_from_checkpoint(best_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_best = trainer.test(best_model, test_loader)
        print("Test Results on Best Model:", test_results_best)
        for key, value in test_results_best[0].items():
            results["best_model"][key].append(value)

        saved_epoch_model = JointDomainTaskAdapter.load_from_checkpoint(saved_epoch_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_saved_epoch = trainer.test(saved_epoch_model, test_loader)
        print("Test Results on saved epoch:", test_results_saved_epoch)
        for key, value in test_results_saved_epoch[0].items():
            results["epoch_saved"][key].append(value)

    except Exception as e:
        print(f"Error during testing: {e}")

    #wandb.finish()

Seed set to 10




Source dataset length: 1440
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
12.9 M    Trainable params
109 M     Non-trainable params
122 M     Total params
489.615   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.484375
val/f1: 0.5412695407867432
val/taskclf_loss: 1.0719757080078125
val/loss: 1.3936331272125244
val/mlm_loss: 1.7152904272079468


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.887499988079071
val/f1: 0.8882285356521606
val/taskclf_loss: 0.33478403091430664
val/loss: 1.1802994012832642
val/mlm_loss: 2.0258147716522217


Validation: |                                                                                                 …

val/accuracy: 0.9125000238418579
val/f1: 0.9129858016967773
val/taskclf_loss: 0.17899279296398163
val/loss: 1.1893223524093628
val/mlm_loss: 2.1996519565582275


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.9316940307617188
val/taskclf_loss: 0.15556882321834564
val/loss: 1.0287479162216187
val/mlm_loss: 1.901926875114441


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9438466429710388
val/taskclf_loss: 0.140717551112175
val/loss: 0.9599308967590332
val/mlm_loss: 1.779144287109375


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9437729120254517
val/taskclf_loss: 0.14123235642910004
val/loss: 1.1216353178024292
val/mlm_loss: 2.1020381450653076


Validation: |                                                                                                 …

val/accuracy: 0.949999988079071
val/f1: 0.950116753578186
val/taskclf_loss: 0.13574042916297913
val/loss: 1.1715137958526611
val/mlm_loss: 2.207286834716797


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9377273917198181
val/taskclf_loss: 0.1384972780942917
val/loss: 1.0716794729232788
val/mlm_loss: 2.00486159324646


Validation: |                                                                                                 …

val/accuracy: 0.949999988079071
val/f1: 0.950116753578186
val/taskclf_loss: 0.13773415982723236
val/loss: 1.129515528678894
val/mlm_loss: 2.1212971210479736


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9438850283622742
val/taskclf_loss: 0.13660097122192383
val/loss: 1.0776180028915405
val/mlm_loss: 2.018634796142578


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.9437500238418579
val/f1: 0.9438850283622742
val/taskclf_loss: 0.13636060059070587
val/loss: 1.0768753290176392
val/mlm_loss: 2.017390012741089


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_3/checkpoints/task-AMRPelt-epoch=03-val_loss=0.96.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_3/checkpoints/AMRPelt-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.2695879638195038, 'source_test/accuracy': 0.9230769872665405, 'source_test/f1': 0.9227488040924072, 'source_test/f1_macro': 0.9203598499298096, 'source_test/f1_micro': 0.9230769872665405, 'target_test/loss': 0.5610986351966858, 'target_test/accuracy': 0.7884615659713745, 'target_test/f1': 0.7879430651664734, 'target_test/f1_macro': 0.7853087782859802, 'target_test/f1_micro': 0.7884615659713745}]
Best checkpoint path: checkpoints/lightning_logs/version_3/checkpoints/task-AMRPelt-epoch=03-val_loss=0.96.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_3/checkpoints/AMRPelt-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.25644633173942566, 'source_test/accuracy': 0.9062500596046448, 'source_test/f1': 0.9062647819519043, 'source_test/f1_macro': 0.9021102786064148, 'source_test/f1_micro': 0.9062500596046448, 'target_test/loss': 0.5396648645401001, 'target_test/accuracy': 0.7764423489570618, 'target_test/f1': 0.7759681344032288, 'target_test/f1_macro': 0.7730091214179993, 'target_test/f1_micro': 0.7764423489570618}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 100


Test Results on saved epoch: [{'source_test/loss': 0.26000887155532837, 'source_test/accuracy': 0.915865421295166, 'source_test/f1': 0.9155843257904053, 'source_test/f1_macro': 0.9125994443893433, 'source_test/f1_micro': 0.915865421295166, 'target_test/loss': 0.5268170237541199, 'target_test/accuracy': 0.7884615659713745, 'target_test/f1': 0.7879072427749634, 'target_test/f1_macro': 0.7852774858474731, 'target_test/f1_micro': 0.7884615659713745}]


Source dataset length: 1440
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
12.9 M    Trainable params
109 M     Non-trainable params
122 M     Total params
489.615   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.0
val/f1: 0.0
val/taskclf_loss: 1.194032073020935
val/loss: 1.4591574668884277
val/mlm_loss: 1.72428297996521


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.9000000357627869
val/f1: 0.9005439877510071
val/taskclf_loss: 0.3888017535209656
val/loss: 1.2191671133041382
val/mlm_loss: 2.049532651901245


Validation: |                                                                                                 …

val/accuracy: 0.918749988079071
val/f1: 0.9192051291465759
val/taskclf_loss: 0.18698467314243317
val/loss: 1.081969141960144
val/mlm_loss: 1.9769537448883057


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9378021359443665
val/taskclf_loss: 0.1590617150068283
val/loss: 1.254151701927185
val/mlm_loss: 2.3492414951324463


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9437606930732727
val/taskclf_loss: 0.14439456164836884
val/loss: 1.2097856998443604
val/mlm_loss: 2.275177001953125


Validation: |                                                                                                 …

val/accuracy: 0.956250011920929
val/f1: 0.95638507604599
val/taskclf_loss: 0.1336890459060669
val/loss: 1.0049103498458862
val/mlm_loss: 1.8761314153671265


Validation: |                                                                                                 …

val/accuracy: 0.949999988079071
val/f1: 0.950116753578186
val/taskclf_loss: 0.12670619785785675
val/loss: 1.1963967084884644
val/mlm_loss: 2.266087293624878


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9439589381217957
val/taskclf_loss: 0.14002759754657745
val/loss: 1.0216174125671387
val/mlm_loss: 1.9032071828842163


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9377273917198181
val/taskclf_loss: 0.1347585767507553
val/loss: 1.073745608329773
val/mlm_loss: 2.0127322673797607


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9377273917198181
val/taskclf_loss: 0.13322030007839203
val/loss: 1.1038262844085693
val/mlm_loss: 2.074432134628296


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.9375
val/f1: 0.9377273917198181
val/taskclf_loss: 0.13210125267505646
val/loss: 1.0972038507461548
val/mlm_loss: 2.0623064041137695


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_4/checkpoints/task-AMRPelt-epoch=04-val_loss=1.00.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_4/checkpoints/AMRPelt-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.26267334818840027, 'source_test/accuracy': 0.920673131942749, 'source_test/f1': 0.9203800559043884, 'source_test/f1_macro': 0.9176691174507141, 'source_test/f1_micro': 0.920673131942749, 'target_test/loss': 0.5862994194030762, 'target_test/accuracy': 0.764423131942749, 'target_test/f1': 0.7652538418769836, 'target_test/f1_macro': 0.7599295973777771, 'target_test/f1_micro': 0.764423131942749}]
Best checkpoint path: checkpoints/lightning_logs/version_4/checkpoints/task-AMRPelt-epoch=04-val_loss=1.00.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_4/checkpoints/AMRPelt-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.25533756613731384, 'source_test/accuracy': 0.9134615659713745, 'source_test/f1': 0.913468599319458, 'source_test/f1_macro': 0.9099774360656738, 'source_test/f1_micro': 0.9134615659713745, 'target_test/loss': 0.5440092086791992, 'target_test/accuracy': 0.7668269276618958, 'target_test/f1': 0.7658529281616211, 'target_test/f1_macro': 0.7641289234161377, 'target_test/f1_micro': 0.7668269276618958}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 1000


Test Results on saved epoch: [{'source_test/loss': 0.2545546591281891, 'source_test/accuracy': 0.9182692766189575, 'source_test/f1': 0.9181017875671387, 'source_test/f1_macro': 0.9152601957321167, 'source_test/f1_micro': 0.9182692766189575, 'target_test/loss': 0.5811285376548767, 'target_test/accuracy': 0.7692307829856873, 'target_test/f1': 0.7684544324874878, 'target_test/f1_macro': 0.7661392092704773, 'target_test/f1_micro': 0.7692307829856873}]


Source dataset length: 1440
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
12.9 M    Trainable params
109 M     Non-trainable params
122 M     Total params
489.615   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.015625
val/f1: 0.004166666883975267
val/taskclf_loss: 1.1434001922607422
val/loss: 1.487693190574646
val/mlm_loss: 1.8319861888885498


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.893750011920929
val/f1: 0.8945522308349609
val/taskclf_loss: 0.2814071774482727
val/loss: 1.2336740493774414
val/mlm_loss: 2.185940742492676


Validation: |                                                                                                 …

val/accuracy: 0.925000011920929
val/f1: 0.925449013710022
val/taskclf_loss: 0.18030188977718353
val/loss: 1.1502095460891724
val/mlm_loss: 2.1201171875


Validation: |                                                                                                 …

val/accuracy: 0.925000011920929
val/f1: 0.9258233308792114
val/taskclf_loss: 0.16928274929523468
val/loss: 1.0665727853775024
val/mlm_loss: 1.963862657546997


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.9312607049942017
val/taskclf_loss: 0.14396977424621582
val/loss: 1.0730774402618408
val/mlm_loss: 2.002185106277466


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9381540417671204
val/taskclf_loss: 0.14366722106933594
val/loss: 1.155542254447937
val/mlm_loss: 2.167417287826538


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9441332221031189
val/taskclf_loss: 0.13314254581928253
val/loss: 1.0207581520080566
val/mlm_loss: 1.9083738327026367


Validation: |                                                                                                 …

val/accuracy: 0.956250011920929
val/f1: 0.95638507604599
val/taskclf_loss: 0.121415875852108
val/loss: 1.0987532138824463
val/mlm_loss: 2.0760905742645264


Validation: |                                                                                                 …

val/accuracy: 0.949999988079071
val/f1: 0.950103759765625
val/taskclf_loss: 0.1338615119457245
val/loss: 1.0792043209075928
val/mlm_loss: 2.0245471000671387


Validation: |                                                                                                 …

val/accuracy: 0.949999988079071
val/f1: 0.9501904845237732
val/taskclf_loss: 0.1378389149904251
val/loss: 1.1194084882736206
val/mlm_loss: 2.100978136062622


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.9375
val/f1: 0.9378021359443665
val/taskclf_loss: 0.13635729253292084
val/loss: 1.0564415454864502
val/mlm_loss: 1.9765255451202393


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_5/checkpoints/task-AMRPelt-epoch=05-val_loss=1.02.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_5/checkpoints/AMRPelt-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.2911957800388336, 'source_test/accuracy': 0.9182692766189575, 'source_test/f1': 0.9179400205612183, 'source_test/f1_macro': 0.915256917476654, 'source_test/f1_micro': 0.9182692766189575, 'target_test/loss': 0.5456898808479309, 'target_test/accuracy': 0.8028846383094788, 'target_test/f1': 0.8035428524017334, 'target_test/f1_macro': 0.7983437776565552, 'target_test/f1_micro': 0.8028846383094788}]
Best checkpoint path: checkpoints/lightning_logs/version_5/checkpoints/task-AMRPelt-epoch=05-val_loss=1.02.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_5/checkpoints/AMRPelt-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.2681191861629486, 'source_test/accuracy': 0.915865421295166, 'source_test/f1': 0.9158085584640503, 'source_test/f1_macro': 0.9120540022850037, 'source_test/f1_micro': 0.915865421295166, 'target_test/loss': 0.5522655844688416, 'target_test/accuracy': 0.7716346383094788, 'target_test/f1': 0.7718807458877563, 'target_test/f1_macro': 0.7675015926361084, 'target_test/f1_micro': 0.7716346383094788}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on saved epoch: [{'source_test/loss': 0.2681191861629486, 'source_test/accuracy': 0.915865421295166, 'source_test/f1': 0.9158085584640503, 'source_test/f1_macro': 0.9120540022850037, 'source_test/f1_micro': 0.915865421295166, 'target_test/loss': 0.5522655844688416, 'target_test/accuracy': 0.7716346383094788, 'target_test/f1': 0.7718807458877563, 'target_test/f1_macro': 0.7675015926361084, 'target_test/f1_micro': 0.7716346383094788}]


In [6]:
results.items()

dict_items([('last_epoch', {'source_test/loss': [0.2695879638195038, 0.26267334818840027, 0.2911957800388336], 'source_test/accuracy': [0.9230769872665405, 0.920673131942749, 0.9182692766189575], 'source_test/f1': [0.9227488040924072, 0.9203800559043884, 0.9179400205612183], 'source_test/f1_macro': [0.9203598499298096, 0.9176691174507141, 0.915256917476654], 'source_test/f1_micro': [0.9230769872665405, 0.920673131942749, 0.9182692766189575], 'target_test/loss': [0.5610986351966858, 0.5862994194030762, 0.5456898808479309], 'target_test/accuracy': [0.7884615659713745, 0.764423131942749, 0.8028846383094788], 'target_test/f1': [0.7879430651664734, 0.7652538418769836, 0.8035428524017334], 'target_test/f1_macro': [0.7853087782859802, 0.7599295973777771, 0.7983437776565552], 'target_test/f1_micro': [0.7884615659713745, 0.764423131942749, 0.8028846383094788]}), ('best_model', {'source_test/loss': [0.25644633173942566, 0.25533756613731384, 0.2681191861629486], 'source_test/accuracy': [0.9062500

In [7]:
# Calculate mean and standard deviation for each scenario
mean_results = {scenario: {key: np.mean(values) for key, values in metrics.items()} for scenario, metrics in results.items()}
std_results = {scenario: {key: np.std(values) for key, values in metrics.items()} for scenario, metrics in results.items()}

# Log mean and standard deviation results to wandb
# wandb.init(project=project_name, name=f'{domain}_mean_results')
# for scenario in mean_results:
#     for key, value in mean_results[scenario].items():
#         wandb.log({f"{scenario}/{key}": value})
#         wandb.log({f"{scenario}/{key}_std": std_results[scenario][key]})
# wandb.finish()

print("Mean Results:", mean_results)
print("Standard Deviation Results:", std_results)

# # Save the best model's adapter
# if model:
#     adapter_save_path = f"../../saved/adapter_after_run/{hparams['task_adapter_name']}"
#     model.save_adapter(adapter_save_path, hparams['task_adapter_name'])
#     print(f"Adapter saved to {adapter_save_path}")
# else:
#     print("No best model to save.")

Mean Results: {'last_epoch': {'source_test/loss': 0.27448569734891254, 'source_test/accuracy': 0.920673131942749, 'source_test/f1': 0.920356293519338, 'source_test/f1_macro': 0.9177619616190592, 'source_test/f1_micro': 0.920673131942749, 'target_test/loss': 0.564362645149231, 'target_test/accuracy': 0.7852564454078674, 'target_test/f1': 0.7855799198150635, 'target_test/f1_macro': 0.7811940511067709, 'target_test/f1_micro': 0.7852564454078674}, 'best_model': {'source_test/loss': 0.25996769467989606, 'source_test/accuracy': 0.9118590156237284, 'source_test/f1': 0.9118473132451376, 'source_test/f1_macro': 0.9080472389856974, 'source_test/f1_micro': 0.9118590156237284, 'target_test/loss': 0.5453132192293803, 'target_test/accuracy': 0.7716346383094788, 'target_test/f1': 0.7712339361508688, 'target_test/f1_macro': 0.7682132124900818, 'target_test/f1_micro': 0.7716346383094788}, 'epoch_saved': {'source_test/loss': 0.260894238948822, 'source_test/accuracy': 0.9166667064030966, 'source_test/f1'

In [8]:
print('dones')

dones


In [9]:
best_val_loss

inf