In [1]:

import torch
import gc
torch.cuda.empty_cache()
gc.collect()


0

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Initialize the console

# Step 2: Import necessary libraries
from typing import Optional, Dict, Any
import os
import torch
import pytorch_lightning as pl
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from rich.traceback import install
from pytorch_lightning.loggers import WandbLogger
import torch.nn as nn
import torch.optim as optim
from transformers import AutoConfig
from adapters import AutoAdapterModel, AdapterConfig
from adapters.composition import Stack
from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping
import torchmetrics

install(show_locals=True)

from setup import setup_src_path
print(setup_src_path())
import data.processed as processed
import config.config as config
import utils.setup as setup
import utils.functions as fn
from importlib import reload

from datasets import load_from_disk

print(config.Config.TXT_SAVE_PATH)
print(config.Config.MODEL_SAVE_PATH)

#dataset = load_from_disk(f"../{config.Config.DATASETS_SAVE_PATH}/datasets")


['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages', '/tmp/tmp26cedkdq', '/home/guest/Desktop/projects/third-experiments/SDA_experiments/modules']


2024-09-28 12:58:38.398729: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-28 12:58:38.430317: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




./text-files/
./hp-model-


In [3]:
import torch
import os
import pytorch_lightning as pl
from transformers import  AutoConfig, DataCollatorForLanguageModeling
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import torchmetrics

class JointDomainTaskAdapter(pl.LightningModule):
    def __init__(self, hparams,source_dataset_length,target_dataset_length):
        super(JointDomainTaskAdapter, self).__init__()
        self.save_hyperparameters(hparams)

        # Load config with hidden states output
        self.config = AutoConfig.from_pretrained(self.hparams["pretrained_model_name"])
        self.config.output_hidden_states = True
        self.model = AutoAdapterModel.from_pretrained(self.hparams["pretrained_model_name"], config=self.config)

        # Set reduction factor and leave_out layers
        self.reduction_factor = self.hparams.get("reduction_factor", 16)
        self.leave_out = self.hparams.get("leave_out", [])
        # if self.leave_out != "None":
        #     self.leave_out = self.leave_out.split(",")
        #     self.leave_out = [int(i) for i in self.leave_out]
        # else:
        #     self.leave_out = []

        # Load MLM adapter with head
        self.model.load_adapter(f"{self.hparams['saved_adapter_dir']}/{self.hparams['domain_adapter_name']}", with_head=True)

        # Add classification head for the task
        self.model.add_classification_head(f"{self.hparams['task_adapter_name']}", num_labels=self.hparams["num_classes"])

        # Set active adapters
        self.model.train_adapter(self.hparams['domain_adapter_name'])
        # Calculate alpha based on dataset lengths
        self.alpha = source_dataset_length / (source_dataset_length + target_dataset_length)

        # Initialize loss functions and metrics
        self.criterion = nn.CrossEntropyLoss()
        self.mlm_criterion = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=self.hparams["num_classes"])
        self.f1 = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="weighted")
        self.f1_macro = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="macro")
        self.f1_micro = torchmetrics.F1Score(task='multiclass',num_classes=self.hparams["num_classes"], average="micro")

        self.softmax = nn.Softmax(dim=1)
        self.validation_outputs = []
        self.test_outputs = []
        # Optimizer related variables
        self.learning_rate = self.hparams.get("learning_rate", 1e-4)
        self.scheduler_factor = self.hparams.get("scheduler_factor", 0.1)
        self.scheduler_patience = self.hparams.get("scheduler_patience", 0.05)
        self.scheduler_threshold = self.hparams.get("scheduler_threshold", 0.0001)
        self.scheduler_cooldown = self.hparams.get("scheduler_cooldown", 0)
        self.scheduler_eps = self.hparams.get("scheduler_eps", 1e-8)

    def forward(self, input_ids, attention_mask=None, labels=None, task=None):
        if task == "mlm":
            self.model.active_head= self.hparams['domain_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        elif task == "classification":
            self.model.active_head= self.hparams['task_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        else:
            raise ValueError("Task must be either 'mlm' or 'classification'.")
        return outputs

    def training_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha

        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))

        metrics = {
            "train/accuracy": accuracy,
            "train/f1": f1,
            "train/taskclf_loss": task_loss,
            "train/loss": loss,
            "train/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return loss

    def validation_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha
        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        self.validation_outputs.append({
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
                })
        
        metrics = {
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return metrics
    def on_validation_epoch_start(self):
        self.validation_outputs = []
    
    def on_validation_epoch_end(self):
        outputs= self.validation_outputs

        avg_loss = torch.stack([x["val/loss"] for x in outputs]).mean()
        avg_task_loss = torch.stack([x["val/taskclf_loss"] for x in outputs]).mean()
        avg_mlm_loss = torch.stack([x["val/mlm_loss"] for x in outputs]).mean()
        avg_accuracy = torch.stack([x["val/accuracy"] for x in outputs]).mean()
        avg_f1 = torch.stack([x["val/f1"] for x in outputs]).mean()
        print(f"val/accuracy: {avg_accuracy}")
        print(f"val/f1: {avg_f1}")
        print(f"val/taskclf_loss: {avg_task_loss}")
        print(f"val/loss: {avg_loss}")
        print(f"val/mlm_loss: {avg_mlm_loss}")
        metrics = {
            "val/avg_loss": avg_loss,
            "val/avg_taskclf_loss": avg_task_loss,
            "val/avg_mlm_loss": avg_mlm_loss,
            "val/avg_accuracy": avg_accuracy,
            "val/avg_f1": avg_f1,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
        self.log("val_loss", avg_loss)

    def test_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        target_labels = batch["label_target"]

        # Classification task for source data
        cls_outputs_source = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits_source = cls_outputs_source.logits
        task_loss_source = self.criterion(cls_logits_source, source_labels)

        # Classification task for target data
        cls_outputs_target = self(input_ids=target_input_ids, attention_mask=target_attention_mask, task="classification")
        cls_logits_target = cls_outputs_target.logits
        task_loss_target = self.criterion(cls_logits_target, target_labels)

        # Combine losses (though typically you would evaluate them separately)
        loss = task_loss_source + task_loss_target

        accuracy_source = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_source = self.f1(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_macro_source = self.f1_macro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_micro_source = self.f1_micro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
    
        accuracy_target = self.accuracy(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_target = self.f1(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_macro_target = self.f1_macro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_micro_target = self.f1_micro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))

        metrics = {
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        self.test_outputs.append({
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        })
        return metrics
    def on_test_epoch_start(self):
        self.test_outputs = []

    def on_test_epoch_end(self):
        outputs=  self.test_outputs

        avg_loss_source = torch.stack([x["source_test/loss"] for x in outputs]).mean()
        avg_task_loss_target = torch.stack([x["target_test/loss"] for x in outputs]).mean()
        avg_accuracy_source = torch.stack([x["source_test/accuracy"] for x in outputs]).mean()
        avg_f1_source = torch.stack([x["source_test/f1"] for x in outputs]).mean()
        avg_f1_macro_source = torch.stack([x["source_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_source = torch.stack([x["source_test/f1_micro"] for x in outputs]).mean()

        avg_accuracy_target = torch.stack([x["target_test/accuracy"] for x in outputs]).mean()
        avg_f1_target = torch.stack([x["target_test/f1"] for x in outputs]).mean()
        avg_f1_macro_target = torch.stack([x["target_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_target = torch.stack([x["target_test/f1_micro"] for x in outputs]).mean()

        metrics = {
            "source_test/loss": avg_loss_source,
            "target_test/loss": avg_task_loss_target,
            "source_test/accuracy": avg_accuracy_source,
            "source_test/f1": avg_f1_source,
            "source_test/f1_macro": avg_f1_macro_source,
            "source_test/f1_micro": avg_f1_micro_source,
            "target_test/accuracy": avg_accuracy_target,
            "target_test/f1": avg_f1_target,
            "target_test/f1_macro": avg_f1_macro_target,
            "target_test/f1_micro": avg_f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
    def save_adapter(self, location, adapter_name):
        self.model.save_adapter(location, adapter_name)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams["learning_rate"])
        lr_scheduler = {
            'scheduler': optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.0001, cooldown=0, min_lr=1e-8),
            'monitor': 'val_loss'
        }
        return [optimizer], [lr_scheduler]


In [4]:
import wandb

wandb.login()
# Wandb setup and training loop
seeds = [10, 100, 1000]  # List of seeds
project_name = 'SDA_mixed_edited'  # Replace with your wandb project name
domain = 'BAMR'  # Replace with the specific domain for this notebook
type = 'unipelt'  # Replace with the specific type for this notebook

# Initialize results dictionary
results = {
    "last_epoch": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "best_model": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "epoch_saved": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    }
}

best_val_loss = float('inf')
best_model = None
best_model_path = ""

[34m[1mwandb[0m: Currently logged in as: [33mmrawhani5[0m ([33mmrawhani[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
reload(processed)
for seed in seeds:
    #wandb.init(project=project_name, name=f'{domain}_{type}_run_with_seed_{seed}', config={'seed': seed})

    try:
        seed_everything(seed)

        hparams = {
            "source_target": "baby_MR",
            "dataset_cache_dir": "./../../datasets",
            "pretrained_model_name": "bert-base-uncased",
            "padding": True,
            "source_domain": "baby",
            "target_domain": "MR",
            "domain_adapter_name": "mlm_unipelt_mr",
            "task_adapter_name": "BAMRPelt",
            "padding": "max_length",
            "max_seq_length": 512,
            "bsz": 32,
            "num_classes": 3,
            "learning_rate": 1e-4,
            "reduction_factor": 16,
            "mode": "domain",
            "saved_adapter_dir": "../saved/adapters",
        }

        save_dir = "checkpoints"
        save_epoch_3 = 6  # Save model at the 3rd epoch
        
        dm = processed.DataModuleSourceTargetMixed(hparams)
        dm.setup('fit')
        dm.setup("test")
        source_length, target_length = dm.get_dataset_lengths()
        print(f"Source dataset length: {source_length}")
        print(f"Target dataset length: {target_length}")
        model = JointDomainTaskAdapter(hparams,source_length,target_length)

        checkpoint_callback = ModelCheckpoint(
            filename="task-BAMRPelt-{epoch:02d}-{val_loss:.2f}",
            save_top_k=1,
            monitor="val_loss",
            mode="min",
        )
        save_model_callback_epoch = ModelCheckpoint(
            filename="BAMRPelt-{epoch:02d}",
            every_n_epochs=save_epoch_3,
            save_top_k=-1,
        )

        #wandb_logger = WandbLogger()

    except Exception as e:
        print(f"Error during preprocessing : {e}")

    try:
        train_loader = dm.train_dataloader()
        val_loader = dm.val_dataloader()
        trainer = Trainer(
            max_epochs=10,
            accelerator="auto",
            #precision=16,
            
            default_root_dir="checkpoints",
            #logger=wandb_logger,
            callbacks=[checkpoint_callback, save_model_callback_epoch],
            limit_train_batches=1.0,
            limit_val_batches=1.0,
            limit_test_batches=1.0,
        )

        trainer.fit(model, train_loader, val_loader)
        print(f"Best checkpoint path: {checkpoint_callback.best_model_path}")
        print(f"Saved epoch checkpoint path: {save_model_callback_epoch.best_model_path}")
    except Exception as e:
        print(f"Error during training : {e}")

    try:
        dm.setup("test")
        test_loader = dm.test_dataloader()
        test_results_last = trainer.test(model, test_loader)
        print("Test Results Last Epoch:", test_results_last)

        # Collect results for last epoch model
        for key, value in test_results_last[0].items():
            results["last_epoch"][key].append(value)

        # Paths to the saved checkpoints
        best_checkpoint_path = checkpoint_callback.best_model_path
        saved_epoch_checkpoint_path = save_model_callback_epoch.best_model_path
        print(f"Best checkpoint path: {best_checkpoint_path}")
        print(f"Saved epoch checkpoint path: {saved_epoch_checkpoint_path}")

        best_model = JointDomainTaskAdapter.load_from_checkpoint(best_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_best = trainer.test(best_model, test_loader)
        print("Test Results on Best Model:", test_results_best)
        for key, value in test_results_best[0].items():
            results["best_model"][key].append(value)

        saved_epoch_model = JointDomainTaskAdapter.load_from_checkpoint(saved_epoch_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_saved_epoch = trainer.test(saved_epoch_model, test_loader)
        print("Test Results on saved epoch:", test_results_saved_epoch)
        for key, value in test_results_saved_epoch[0].items():
            results["epoch_saved"][key].append(value)

    except Exception as e:
        print(f"Error during testing: {e}")

    #wandb.finish()

Seed set to 10




Source dataset length: 1350
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
12.9 M    Trainable params
109 M     Non-trainable params
122 M     Total params
489.615   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.515625
val/f1: 0.5169180631637573
val/taskclf_loss: 1.0765581130981445
val/loss: 1.406226396560669
val/mlm_loss: 1.7152904272079468


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.800000011920929
val/f1: 0.8026838302612305
val/taskclf_loss: 0.4750687777996063
val/loss: 1.2729278802871704
val/mlm_loss: 2.020921230316162


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.8566452264785767
val/taskclf_loss: 0.40714770555496216
val/loss: 1.3301414251327515
val/mlm_loss: 2.195448160171509


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.8563640713691711
val/taskclf_loss: 0.3723612129688263
val/loss: 1.16023850440979
val/mlm_loss: 1.8988733291625977


Validation: |                                                                                                 …

val/accuracy: 0.862500011920929
val/f1: 0.8624613881111145
val/taskclf_loss: 0.34256085753440857
val/loss: 1.0825382471084595
val/mlm_loss: 1.7762670516967773


Validation: |                                                                                                 …

val/accuracy: 0.8687500357627869
val/f1: 0.868506133556366
val/taskclf_loss: 0.3256417214870453
val/loss: 1.2370089292526245
val/mlm_loss: 2.0914158821105957


Validation: |                                                                                                 …

val/accuracy: 0.862500011920929
val/f1: 0.8624841570854187
val/taskclf_loss: 0.31423696875572205
val/loss: 1.28337824344635
val/mlm_loss: 2.191948413848877


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.856277585029602
val/taskclf_loss: 0.34646111726760864
val/loss: 1.2015511989593506
val/mlm_loss: 2.0031979084014893


Validation: |                                                                                                 …

val/accuracy: 0.8812500238418579
val/f1: 0.8812591433525085
val/taskclf_loss: 0.3125077486038208
val/loss: 1.2445614337921143
val/mlm_loss: 2.118361711502075


Validation: |                                                                                                 …

val/accuracy: 0.875
val/f1: 0.8747127652168274
val/taskclf_loss: 0.32951322197914124
val/loss: 1.1968377828598022
val/mlm_loss: 2.0099549293518066


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.893750011920929
val/f1: 0.8935190439224243
val/taskclf_loss: 0.3237035870552063
val/loss: 1.1924651861190796
val/mlm_loss: 2.006929397583008


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/task-BAMRPelt-epoch=03-val_loss=1.08.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/BAMRPelt-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.21461506187915802, 'source_test/accuracy': 0.9230769872665405, 'source_test/f1': 0.9230082631111145, 'source_test/f1_macro': 0.9174870848655701, 'source_test/f1_micro': 0.9230769872665405, 'target_test/loss': 0.567164957523346, 'target_test/accuracy': 0.7980769276618958, 'target_test/f1': 0.7990121245384216, 'target_test/f1_macro': 0.7934900522232056, 'target_test/f1_micro': 0.7980769276618958}]
Best checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/task-BAMRPelt-epoch=03-val_loss=1.08.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/BAMRPelt-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.21675506234169006, 'source_test/accuracy': 0.9062500596046448, 'source_test/f1': 0.9061285257339478, 'source_test/f1_macro': 0.8996485471725464, 'source_test/f1_micro': 0.9062500596046448, 'target_test/loss': 0.5093441009521484, 'target_test/accuracy': 0.7836538553237915, 'target_test/f1': 0.7847359776496887, 'target_test/f1_macro': 0.7788974642753601, 'target_test/f1_micro': 0.7836538553237915}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 100


Test Results on saved epoch: [{'source_test/loss': 0.20734335482120514, 'source_test/accuracy': 0.9230769872665405, 'source_test/f1': 0.92314612865448, 'source_test/f1_macro': 0.9167769551277161, 'source_test/f1_micro': 0.9230769872665405, 'target_test/loss': 0.5728573799133301, 'target_test/accuracy': 0.7740384936332703, 'target_test/f1': 0.7758561968803406, 'target_test/f1_macro': 0.7683931589126587, 'target_test/f1_micro': 0.7740384936332703}]


Source dataset length: 1350
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
12.9 M    Trainable params
109 M     Non-trainable params
122 M     Total params
489.615   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.0
val/f1: 0.0
val/taskclf_loss: 1.1816684007644653
val/loss: 1.461727499961853
val/mlm_loss: 1.72428297996521


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.78125
val/f1: 0.784599244594574
val/taskclf_loss: 0.49366530776023865
val/loss: 1.2959213256835938
val/mlm_loss: 2.048036575317383


Validation: |                                                                                                 …

val/accuracy: 0.824999988079071
val/f1: 0.8273141980171204
val/taskclf_loss: 0.4389033913612366
val/loss: 1.2330257892608643
val/mlm_loss: 1.9775158166885376


Validation: |                                                                                                 …

val/accuracy: 0.84375
val/f1: 0.843777596950531
val/taskclf_loss: 0.4445437490940094
val/loss: 1.4283137321472168
val/mlm_loss: 2.3505983352661133


Validation: |                                                                                                 …

val/accuracy: 0.875
val/f1: 0.874663770198822
val/taskclf_loss: 0.34461089968681335
val/loss: 1.337195873260498
val/mlm_loss: 2.267744541168213


Validation: |                                                                                                 …

val/accuracy: 0.875
val/f1: 0.874663770198822
val/taskclf_loss: 0.35894694924354553
val/loss: 1.1370584964752197
val/mlm_loss: 1.8665379285812378


Validation: |                                                                                                 …

val/accuracy: 0.875
val/f1: 0.8748245239257812
val/taskclf_loss: 0.34047552943229675
val/loss: 1.3327938318252563
val/mlm_loss: 2.263092041015625


Validation: |                                                                                                 …

val/accuracy: 0.8812500238418579
val/f1: 0.8810938000679016
val/taskclf_loss: 0.33777689933776855
val/loss: 1.1383211612701416
val/mlm_loss: 1.888831377029419


Validation: |                                                                                                 …

val/accuracy: 0.8812500238418579
val/f1: 0.880969226360321
val/taskclf_loss: 0.3576703667640686
val/loss: 1.208145022392273
val/mlm_loss: 2.005465269088745


Validation: |                                                                                                 …

val/accuracy: 0.887499988079071
val/f1: 0.8874916434288025
val/taskclf_loss: 0.31662717461586
val/loss: 1.2195204496383667
val/mlm_loss: 2.0659830570220947


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.887499988079071
val/f1: 0.8873004913330078
val/taskclf_loss: 0.33033397793769836
val/loss: 1.2218502759933472
val/mlm_loss: 2.0576469898223877


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/task-BAMRPelt-epoch=04-val_loss=1.14.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/BAMRPelt-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.21068254113197327, 'source_test/accuracy': 0.9326923489570618, 'source_test/f1': 0.932794451713562, 'source_test/f1_macro': 0.9283400177955627, 'source_test/f1_micro': 0.9326923489570618, 'target_test/loss': 0.6174890398979187, 'target_test/accuracy': 0.7884615659713745, 'target_test/f1': 0.7892985939979553, 'target_test/f1_macro': 0.7839758396148682, 'target_test/f1_micro': 0.7884615659713745}]
Best checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/task-BAMRPelt-epoch=04-val_loss=1.14.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/BAMRPelt-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.20483781397342682, 'source_test/accuracy': 0.915865421295166, 'source_test/f1': 0.9158632159233093, 'source_test/f1_macro': 0.9095924496650696, 'source_test/f1_micro': 0.915865421295166, 'target_test/loss': 0.553274393081665, 'target_test/accuracy': 0.78125, 'target_test/f1': 0.7818056344985962, 'target_test/f1_macro': 0.7770858407020569, 'target_test/f1_micro': 0.78125}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 1000


Test Results on saved epoch: [{'source_test/loss': 0.198211669921875, 'source_test/accuracy': 0.9230769872665405, 'source_test/f1': 0.92296302318573, 'source_test/f1_macro': 0.9178155064582825, 'source_test/f1_micro': 0.9230769872665405, 'target_test/loss': 0.5562195777893066, 'target_test/accuracy': 0.7836538553237915, 'target_test/f1': 0.784787118434906, 'target_test/f1_macro': 0.7784743905067444, 'target_test/f1_micro': 0.7836538553237915}]


Source dataset length: 1350
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
12.9 M    Trainable params
109 M     Non-trainable params
122 M     Total params
489.615   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.0625
val/f1: 0.014912281185388565
val/taskclf_loss: 1.1315951347351074
val/loss: 1.4930871725082397
val/mlm_loss: 1.8319861888885498


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.8375000357627869
val/f1: 0.8380371928215027
val/taskclf_loss: 0.4284899830818176
val/loss: 1.3346232175827026
val/mlm_loss: 2.1841230392456055


Validation: |                                                                                                 …

val/accuracy: 0.862500011920929
val/f1: 0.8622860312461853
val/taskclf_loss: 0.36264511942863464
val/loss: 1.2723225355148315
val/mlm_loss: 2.125145196914673


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.8561367392539978
val/taskclf_loss: 0.3593480885028839
val/loss: 1.1882728338241577
val/mlm_loss: 1.9653898477554321


Validation: |                                                                                                 …

val/accuracy: 0.8687500357627869
val/f1: 0.8686405420303345
val/taskclf_loss: 0.36201539635658264
val/loss: 1.2080925703048706
val/mlm_loss: 2.0012900829315186


Validation: |                                                                                                 …

val/accuracy: 0.862500011920929
val/f1: 0.8624613881111145
val/taskclf_loss: 0.3497251868247986
val/loss: 1.2843774557113647
val/mlm_loss: 2.160614490509033


Validation: |                                                                                                 …

val/accuracy: 0.8687500357627869
val/f1: 0.868506133556366
val/taskclf_loss: 0.3232251703739166
val/loss: 1.1365162134170532
val/mlm_loss: 1.8989766836166382


Validation: |                                                                                                 …

val/accuracy: 0.8687500357627869
val/f1: 0.8688010573387146
val/taskclf_loss: 0.3069845139980316
val/loss: 1.223142147064209
val/mlm_loss: 2.0820400714874268


Validation: |                                                                                                 …

val/accuracy: 0.862500011920929
val/f1: 0.8626434206962585
val/taskclf_loss: 0.3738044202327728
val/loss: 1.2243858575820923
val/mlm_loss: 2.021806240081787


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.856785774230957
val/taskclf_loss: 0.37379398941993713
val/loss: 1.2640379667282104
val/mlm_loss: 2.0986416339874268


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.8812500238418579
val/f1: 0.8818313479423523
val/taskclf_loss: 0.3268631398677826
val/loss: 1.1773422956466675
val/mlm_loss: 1.9746665954589844


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_11/checkpoints/task-BAMRPelt-epoch=05-val_loss=1.14.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_11/checkpoints/BAMRPelt-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.21837811172008514, 'source_test/accuracy': 0.9350962042808533, 'source_test/f1': 0.9356699585914612, 'source_test/f1_macro': 0.9294614195823669, 'source_test/f1_micro': 0.9350962042808533, 'target_test/loss': 0.5682530403137207, 'target_test/accuracy': 0.786057710647583, 'target_test/f1': 0.7862690091133118, 'target_test/f1_macro': 0.7816590666770935, 'target_test/f1_micro': 0.786057710647583}]
Best checkpoint path: checkpoints/lightning_logs/version_11/checkpoints/task-BAMRPelt-epoch=05-val_loss=1.14.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_11/checkpoints/BAMRPelt-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.20692254602909088, 'source_test/accuracy': 0.9254807829856873, 'source_test/f1': 0.9253659248352051, 'source_test/f1_macro': 0.9199961423873901, 'source_test/f1_micro': 0.9254807829856873, 'target_test/loss': 0.5453911423683167, 'target_test/accuracy': 0.7932692766189575, 'target_test/f1': 0.7945513725280762, 'target_test/f1_macro': 0.7883158922195435, 'target_test/f1_micro': 0.7932692766189575}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on saved epoch: [{'source_test/loss': 0.20692254602909088, 'source_test/accuracy': 0.9254807829856873, 'source_test/f1': 0.9253659248352051, 'source_test/f1_macro': 0.9199961423873901, 'source_test/f1_micro': 0.9254807829856873, 'target_test/loss': 0.5453911423683167, 'target_test/accuracy': 0.7932692766189575, 'target_test/f1': 0.7945513725280762, 'target_test/f1_macro': 0.7883158922195435, 'target_test/f1_micro': 0.7932692766189575}]


In [6]:
results.items()

dict_items([('last_epoch', {'source_test/loss': [0.21461506187915802, 0.21068254113197327, 0.21837811172008514], 'source_test/accuracy': [0.9230769872665405, 0.9326923489570618, 0.9350962042808533], 'source_test/f1': [0.9230082631111145, 0.932794451713562, 0.9356699585914612], 'source_test/f1_macro': [0.9174870848655701, 0.9283400177955627, 0.9294614195823669], 'source_test/f1_micro': [0.9230769872665405, 0.9326923489570618, 0.9350962042808533], 'target_test/loss': [0.567164957523346, 0.6174890398979187, 0.5682530403137207], 'target_test/accuracy': [0.7980769276618958, 0.7884615659713745, 0.786057710647583], 'target_test/f1': [0.7990121245384216, 0.7892985939979553, 0.7862690091133118], 'target_test/f1_macro': [0.7934900522232056, 0.7839758396148682, 0.7816590666770935], 'target_test/f1_micro': [0.7980769276618958, 0.7884615659713745, 0.786057710647583]}), ('best_model', {'source_test/loss': [0.21675506234169006, 0.20483781397342682, 0.20692254602909088], 'source_test/accuracy': [0.906

In [7]:
# Calculate mean and standard deviation for each scenario
mean_results = {scenario: {key: np.mean(values) for key, values in metrics.items()} for scenario, metrics in results.items()}
std_results = {scenario: {key: np.std(values) for key, values in metrics.items()} for scenario, metrics in results.items()}

# Log mean and standard deviation results to wandb
# wandb.init(project=project_name, name=f'{domain}_mean_results')
# for scenario in mean_results:
#     for key, value in mean_results[scenario].items():
#         wandb.log({f"{scenario}/{key}": value})
#         wandb.log({f"{scenario}/{key}_std": std_results[scenario][key]})
# wandb.finish()

print("Mean Results:", mean_results)
print("Standard Deviation Results:", std_results)

# # Save the best model's adapter
# if model:
#     adapter_save_path = f"../../saved/adapter_after_run/{hparams['task_adapter_name']}"
#     model.save_adapter(adapter_save_path, hparams['task_adapter_name'])
#     print(f"Adapter saved to {adapter_save_path}")
# else:
#     print("No best model to save.")

Mean Results: {'last_epoch': {'source_test/loss': 0.21455857157707214, 'source_test/accuracy': 0.9302885135014852, 'source_test/f1': 0.9304908911387125, 'source_test/f1_macro': 0.9250961740811666, 'source_test/f1_micro': 0.9302885135014852, 'target_test/loss': 0.5843023459116617, 'target_test/accuracy': 0.790865401426951, 'target_test/f1': 0.7915265758832296, 'target_test/f1_macro': 0.7863749861717224, 'target_test/f1_micro': 0.790865401426951}, 'best_model': {'source_test/loss': 0.2095051407814026, 'source_test/accuracy': 0.915865421295166, 'source_test/f1': 0.9157858888308207, 'source_test/f1_macro': 0.9097457130750021, 'source_test/f1_micro': 0.915865421295166, 'target_test/loss': 0.5360032121340433, 'target_test/accuracy': 0.786057710647583, 'target_test/f1': 0.7870309948921204, 'target_test/f1_macro': 0.7814330657323202, 'target_test/f1_micro': 0.786057710647583}, 'epoch_saved': {'source_test/loss': 0.20415919025739035, 'source_test/accuracy': 0.9238782525062561, 'source_test/f1':

In [8]:
print('dones')

dones


In [9]:
best_val_loss

inf