In [1]:

import torch
import gc
torch.cuda.empty_cache()
gc.collect()


0

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Initialize the console

# Step 2: Import necessary libraries
from typing import Optional, Dict, Any
import os
import torch
import pytorch_lightning as pl
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from rich.traceback import install
from pytorch_lightning.loggers import WandbLogger
import torch.nn as nn
import torch.optim as optim
from transformers import AutoConfig
from adapters import AutoAdapterModel, AdapterConfig
from adapters.composition import Stack
from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping
import torchmetrics

install(show_locals=True)

from setup import setup_src_path
print(setup_src_path())
import data.processed as processed
import config.config as config
import utils.setup as setup
import utils.functions as fn
from importlib import reload

from datasets import load_from_disk

print(config.Config.TXT_SAVE_PATH)
print(config.Config.MODEL_SAVE_PATH)

#dataset = load_from_disk(f"../{config.Config.DATASETS_SAVE_PATH}/datasets")


['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages', '/tmp/tmpo98d4bgc', '/home/guest/Desktop/projects/third-experiments/SDA_experiments/modules']


2024-09-29 21:17:03.314146: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-29 21:17:03.345905: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




./text-files/
./hp-model-


In [3]:
import torch
import os
import pytorch_lightning as pl
from transformers import  AutoConfig, DataCollatorForLanguageModeling
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import torchmetrics

class JointDomainTaskAdapter(pl.LightningModule):
    def __init__(self, hparams,source_dataset_length,target_dataset_length):
        super(JointDomainTaskAdapter, self).__init__()
        self.save_hyperparameters(hparams)

        # Load config with hidden states output
        self.config = AutoConfig.from_pretrained(self.hparams["pretrained_model_name"])
        self.config.output_hidden_states = True
        self.model = AutoAdapterModel.from_pretrained(self.hparams["pretrained_model_name"], config=self.config)

        # Set reduction factor and leave_out layers
        self.reduction_factor = self.hparams.get("reduction_factor", 16)
        self.leave_out = self.hparams.get("leave_out", [])
        # if self.leave_out != "None":
        #     self.leave_out = self.leave_out.split(",")
        #     self.leave_out = [int(i) for i in self.leave_out]
        # else:
        #     self.leave_out = []

        # Load MLM adapter with head
        self.model.load_adapter(f"{self.hparams['saved_adapter_dir']}/{self.hparams['domain_adapter_name']}", with_head=True)

        # Add classification head for the task
        self.model.add_classification_head(f"{self.hparams['task_adapter_name']}", num_labels=self.hparams["num_classes"])

        # Set active adapters
        self.model.train_adapter(self.hparams['domain_adapter_name'])
        # Calculate alpha based on dataset lengths
        self.alpha = source_dataset_length / (source_dataset_length + target_dataset_length)

        # Initialize loss functions and metrics
        self.criterion = nn.CrossEntropyLoss()
        self.mlm_criterion = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=self.hparams["num_classes"])
        self.f1 = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="weighted")
        self.f1_macro = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="macro")
        self.f1_micro = torchmetrics.F1Score(task='multiclass',num_classes=self.hparams["num_classes"], average="micro")

        self.softmax = nn.Softmax(dim=1)
        self.validation_outputs = []
        self.test_outputs = []
        # Optimizer related variables
        self.learning_rate = self.hparams.get("learning_rate", 1e-4)
        self.scheduler_factor = self.hparams.get("scheduler_factor", 0.1)
        self.scheduler_patience = self.hparams.get("scheduler_patience", 0.05)
        self.scheduler_threshold = self.hparams.get("scheduler_threshold", 0.0001)
        self.scheduler_cooldown = self.hparams.get("scheduler_cooldown", 0)
        self.scheduler_eps = self.hparams.get("scheduler_eps", 1e-8)

    def forward(self, input_ids, attention_mask=None, labels=None, task=None):
        if task == "mlm":
            self.model.active_head= self.hparams['domain_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        elif task == "classification":
            self.model.active_head= self.hparams['task_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        else:
            raise ValueError("Task must be either 'mlm' or 'classification'.")
        return outputs

    def training_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha

        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))

        metrics = {
            "train/accuracy": accuracy,
            "train/f1": f1,
            "train/taskclf_loss": task_loss,
            "train/loss": loss,
            "train/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return loss

    def validation_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha
        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        self.validation_outputs.append({
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
                })
        
        metrics = {
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return metrics
    def on_validation_epoch_start(self):
        self.validation_outputs = []
    
    def on_validation_epoch_end(self):
        outputs= self.validation_outputs

        avg_loss = torch.stack([x["val/loss"] for x in outputs]).mean()
        avg_task_loss = torch.stack([x["val/taskclf_loss"] for x in outputs]).mean()
        avg_mlm_loss = torch.stack([x["val/mlm_loss"] for x in outputs]).mean()
        avg_accuracy = torch.stack([x["val/accuracy"] for x in outputs]).mean()
        avg_f1 = torch.stack([x["val/f1"] for x in outputs]).mean()
        print(f"val/accuracy: {avg_accuracy}")
        print(f"val/f1: {avg_f1}")
        print(f"val/taskclf_loss: {avg_task_loss}")
        print(f"val/loss: {avg_loss}")
        print(f"val/mlm_loss: {avg_mlm_loss}")
        metrics = {
            "val/avg_loss": avg_loss,
            "val/avg_taskclf_loss": avg_task_loss,
            "val/avg_mlm_loss": avg_mlm_loss,
            "val/avg_accuracy": avg_accuracy,
            "val/avg_f1": avg_f1,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
        self.log("val_loss", avg_loss)

    def test_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        target_labels = batch["label_target"]

        # Classification task for source data
        cls_outputs_source = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits_source = cls_outputs_source.logits
        task_loss_source = self.criterion(cls_logits_source, source_labels)

        # Classification task for target data
        cls_outputs_target = self(input_ids=target_input_ids, attention_mask=target_attention_mask, task="classification")
        cls_logits_target = cls_outputs_target.logits
        task_loss_target = self.criterion(cls_logits_target, target_labels)

        # Combine losses (though typically you would evaluate them separately)
        loss = task_loss_source + task_loss_target

        accuracy_source = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_source = self.f1(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_macro_source = self.f1_macro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_micro_source = self.f1_micro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
    
        accuracy_target = self.accuracy(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_target = self.f1(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_macro_target = self.f1_macro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_micro_target = self.f1_micro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))

        metrics = {
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        self.test_outputs.append({
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        })
        return metrics
    def on_test_epoch_start(self):
        self.test_outputs = []

    def on_test_epoch_end(self):
        outputs=  self.test_outputs

        avg_loss_source = torch.stack([x["source_test/loss"] for x in outputs]).mean()
        avg_task_loss_target = torch.stack([x["target_test/loss"] for x in outputs]).mean()
        avg_accuracy_source = torch.stack([x["source_test/accuracy"] for x in outputs]).mean()
        avg_f1_source = torch.stack([x["source_test/f1"] for x in outputs]).mean()
        avg_f1_macro_source = torch.stack([x["source_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_source = torch.stack([x["source_test/f1_micro"] for x in outputs]).mean()

        avg_accuracy_target = torch.stack([x["target_test/accuracy"] for x in outputs]).mean()
        avg_f1_target = torch.stack([x["target_test/f1"] for x in outputs]).mean()
        avg_f1_macro_target = torch.stack([x["target_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_target = torch.stack([x["target_test/f1_micro"] for x in outputs]).mean()

        metrics = {
            "source_test/loss": avg_loss_source,
            "target_test/loss": avg_task_loss_target,
            "source_test/accuracy": avg_accuracy_source,
            "source_test/f1": avg_f1_source,
            "source_test/f1_macro": avg_f1_macro_source,
            "source_test/f1_micro": avg_f1_micro_source,
            "target_test/accuracy": avg_accuracy_target,
            "target_test/f1": avg_f1_target,
            "target_test/f1_macro": avg_f1_macro_target,
            "target_test/f1_micro": avg_f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
    def save_adapter(self, location, adapter_name):
        self.model.save_adapter(location, adapter_name)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams["learning_rate"])
        lr_scheduler = {
            'scheduler': optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.0001, cooldown=0, min_lr=1e-8),
            'monitor': 'val_loss'
        }
        return [optimizer], [lr_scheduler]


In [4]:
import wandb

wandb.login()
# Wandb setup and training loop
seeds = [10, 100, 1000]  # List of seeds
project_name = 'SDA_mixed_edited'  # Replace with your wandb project name
domain = 'BAA'  # Replace with the specific domain for this notebook
type = 'union'  # Replace with the specific type for this notebook

# Initialize results dictionary
results = {
    "last_epoch": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "best_model": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "epoch_saved": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    }
}

best_val_loss = float('inf')
best_model = None
best_model_path = ""

[34m[1mwandb[0m: Currently logged in as: [33mmrawhani5[0m ([33mmrawhani[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
reload(processed)
for seed in seeds:
    #wandb.init(project=project_name, name=f'{domain}_{type}_run_with_seed_{seed}', config={'seed': seed})

    try:
        seed_everything(seed)

        hparams = {
            "source_target": "baby_apparel",
            "dataset_cache_dir": "./../../datasets",
            "pretrained_model_name": "bert-base-uncased",
            "padding": True,
            "source_domain": "baby",
            "target_domain": "apparel",
            "domain_adapter_name": "mlm_union_apparel",
            "task_adapter_name": "BAAUni",
            "padding": "max_length",
            "max_seq_length": 512,
            "bsz": 32,
            "num_classes": 3,
            "learning_rate": 1e-4,
            "reduction_factor": 16,
            "mode": "domain",
            "saved_adapter_dir": "../saved/adapters",
        }

        save_dir = "checkpoints"
        save_epoch_3 = 6  # Save model at the 3rd epoch
        
        dm = processed.DataModuleSourceTargetMixed(hparams)
        dm.setup('fit')
        dm.setup("test")
        source_length, target_length = dm.get_dataset_lengths()
        print(f"Source dataset length: {source_length}")
        print(f"Target dataset length: {target_length}")
        model = JointDomainTaskAdapter(hparams,source_length,target_length)

        checkpoint_callback = ModelCheckpoint(
            filename="task-BAAUni-{epoch:02d}-{val_loss:.2f}",
            save_top_k=1,
            monitor="val_loss",
            mode="min",
        )
        save_model_callback_epoch = ModelCheckpoint(
            filename="BAAUni-{epoch:02d}",
            every_n_epochs=save_epoch_3,
            save_top_k=-1,
        )

        #wandb_logger = WandbLogger()

    except Exception as e:
        print(f"Error during preprocessing : {e}")

    try:
        train_loader = dm.train_dataloader()
        val_loader = dm.val_dataloader()
        trainer = Trainer(
            max_epochs=10,
            accelerator="auto",
            #precision=16,
            
            default_root_dir="checkpoints",
            #logger=wandb_logger,
            callbacks=[checkpoint_callback, save_model_callback_epoch],
            limit_train_batches=1.0,
            limit_val_batches=1.0,
            limit_test_batches=1.0,
        )

        trainer.fit(model, train_loader, val_loader)
        print(f"Best checkpoint path: {checkpoint_callback.best_model_path}")
        print(f"Saved epoch checkpoint path: {save_model_callback_epoch.best_model_path}")
    except Exception as e:
        print(f"Error during training : {e}")

    try:
        dm.setup("test")
        test_loader = dm.test_dataloader()
        test_results_last = trainer.test(model, test_loader)
        print("Test Results Last Epoch:", test_results_last)

        # Collect results for last epoch model
        for key, value in test_results_last[0].items():
            results["last_epoch"][key].append(value)

        # Paths to the saved checkpoints
        best_checkpoint_path = checkpoint_callback.best_model_path
        saved_epoch_checkpoint_path = save_model_callback_epoch.best_model_path
        print(f"Best checkpoint path: {best_checkpoint_path}")
        print(f"Saved epoch checkpoint path: {saved_epoch_checkpoint_path}")

        best_model = JointDomainTaskAdapter.load_from_checkpoint(best_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_best = trainer.test(best_model, test_loader)
        print("Test Results on Best Model:", test_results_best)
        for key, value in test_results_best[0].items():
            results["best_model"][key].append(value)

        saved_epoch_model = JointDomainTaskAdapter.load_from_checkpoint(saved_epoch_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_saved_epoch = trainer.test(saved_epoch_model, test_loader)
        print("Test Results on saved epoch:", test_results_saved_epoch)
        for key, value in test_results_saved_epoch[0].items():
            results["epoch_saved"][key].append(value)

    except Exception as e:
        print(f"Error during testing: {e}")

    #wandb.finish()

Seed set to 10




Source dataset length: 1350
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.4375
val/f1: 0.6080378293991089
val/taskclf_loss: 1.120105266571045
val/loss: 1.5853381156921387
val/mlm_loss: 2.021494150161743


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.887499988079071
val/f1: 0.8875061273574829
val/taskclf_loss: 0.3098408877849579
val/loss: 1.3693174123764038
val/mlm_loss: 2.362576961517334


Validation: |                                                                                                 …

val/accuracy: 0.862500011920929
val/f1: 0.8621757626533508
val/taskclf_loss: 0.3457273244857788
val/loss: 1.2424601316452026
val/mlm_loss: 2.0831470489501953


Validation: |                                                                                                 …

val/accuracy: 0.887499988079071
val/f1: 0.8872259259223938
val/taskclf_loss: 0.27737608551979065
val/loss: 1.1973834037780762
val/mlm_loss: 2.0598902702331543


Validation: |                                                                                                 …

val/accuracy: 0.875
val/f1: 0.8750727772712708
val/taskclf_loss: 0.35640621185302734
val/loss: 1.2980139255523682
val/mlm_loss: 2.1807711124420166


Validation: |                                                                                                 …

val/accuracy: 0.8375000357627869
val/f1: 0.8404635787010193
val/taskclf_loss: 0.6099863052368164
val/loss: 1.4341627359390259
val/mlm_loss: 2.2068281173706055


Validation: |                                                                                                 …

val/accuracy: 0.831250011920929
val/f1: 0.8333307504653931
val/taskclf_loss: 0.6171162128448486
val/loss: 1.3933533430099487
val/mlm_loss: 2.1210756301879883


Validation: |                                                                                                 …

val/accuracy: 0.875
val/f1: 0.8750236630439758
val/taskclf_loss: 0.3955211341381073
val/loss: 1.3077831268310547
val/mlm_loss: 2.1630287170410156


Validation: |                                                                                                 …

val/accuracy: 0.875
val/f1: 0.8750236630439758
val/taskclf_loss: 0.40864911675453186
val/loss: 1.2678779363632202
val/mlm_loss: 2.0734052658081055


Validation: |                                                                                                 …

val/accuracy: 0.875
val/f1: 0.8750236630439758
val/taskclf_loss: 0.4304032325744629
val/loss: 1.3279811143875122
val/mlm_loss: 2.1694605350494385


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.875
val/f1: 0.8750236630439758
val/taskclf_loss: 0.43013009428977966
val/loss: 1.2753323316574097
val/mlm_loss: 2.0677096843719482


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_24/checkpoints/task-BAAUni-epoch=02-val_loss=1.20.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_24/checkpoints/BAAUni-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.26288849115371704, 'source_test/accuracy': 0.9182692766189575, 'source_test/f1': 0.9189677238464355, 'source_test/f1_macro': 0.9112136960029602, 'source_test/f1_micro': 0.9182692766189575, 'target_test/loss': 0.31636372208595276, 'target_test/accuracy': 0.915865421295166, 'target_test/f1': 0.9154680371284485, 'target_test/f1_macro': 0.9129036068916321, 'target_test/f1_micro': 0.915865421295166}]
Best checkpoint path: checkpoints/lightning_logs/version_24/checkpoints/task-BAAUni-epoch=02-val_loss=1.20.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_24/checkpoints/BAAUni-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.20023129880428314, 'source_test/accuracy': 0.9278846383094788, 'source_test/f1': 0.9278393387794495, 'source_test/f1_macro': 0.9220852851867676, 'source_test/f1_micro': 0.9278846383094788, 'target_test/loss': 0.24797458946704865, 'target_test/accuracy': 0.9062500596046448, 'target_test/f1': 0.9054768681526184, 'target_test/f1_macro': 0.9035211801528931, 'target_test/f1_micro': 0.9062500596046448}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 100


Test Results on saved epoch: [{'source_test/loss': 0.29821106791496277, 'source_test/accuracy': 0.9086538553237915, 'source_test/f1': 0.908907949924469, 'source_test/f1_macro': 0.9018179178237915, 'source_test/f1_micro': 0.9086538553237915, 'target_test/loss': 0.36958691477775574, 'target_test/accuracy': 0.889423131942749, 'target_test/f1': 0.8885972499847412, 'target_test/f1_macro': 0.8871983885765076, 'target_test/f1_micro': 0.889423131942749}]


Source dataset length: 1350
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.375
val/f1: 0.40719151496887207
val/taskclf_loss: 1.1016285419464111
val/loss: 1.5776252746582031
val/mlm_loss: 2.0238723754882812


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.875
val/f1: 0.8748372197151184
val/taskclf_loss: 0.30652543902397156
val/loss: 1.2273094654083252
val/mlm_loss: 2.0905444622039795


Validation: |                                                                                                 …

val/accuracy: 0.9000000357627869
val/f1: 0.8996514678001404
val/taskclf_loss: 0.283817857503891
val/loss: 1.2343591451644897
val/mlm_loss: 2.1254918575286865


Validation: |                                                                                                 …

val/accuracy: 0.84375
val/f1: 0.8445386290550232
val/taskclf_loss: 0.40446239709854126
val/loss: 1.2891693115234375
val/mlm_loss: 2.118582248687744


Validation: |                                                                                                 …

val/accuracy: 0.887499988079071
val/f1: 0.8873489499092102
val/taskclf_loss: 0.2844094932079315
val/loss: 1.184091567993164
val/mlm_loss: 2.027543783187866


Validation: |                                                                                                 …

val/accuracy: 0.8187500238418579
val/f1: 0.8226668238639832
val/taskclf_loss: 0.6525521278381348
val/loss: 1.3964587450027466
val/mlm_loss: 2.0938713550567627


Validation: |                                                                                                 …

val/accuracy: 0.8687500357627869
val/f1: 0.8690487742424011
val/taskclf_loss: 0.538812518119812
val/loss: 1.3705376386642456
val/mlm_loss: 2.1502797603607178


Validation: |                                                                                                 …

val/accuracy: 0.9000000357627869
val/f1: 0.9008117914199829
val/taskclf_loss: 0.36370235681533813
val/loss: 1.2311546802520752
val/mlm_loss: 2.044391393661499


Validation: |                                                                                                 …

val/accuracy: 0.887499988079071
val/f1: 0.8874613046646118
val/taskclf_loss: 0.3923245072364807
val/loss: 1.2432714700698853
val/mlm_loss: 2.041034460067749


Validation: |                                                                                                 …

val/accuracy: 0.887499988079071
val/f1: 0.8874613046646118
val/taskclf_loss: 0.39350625872612
val/loss: 1.2375690937042236
val/mlm_loss: 2.0288779735565186


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.887499988079071
val/f1: 0.8874613046646118
val/taskclf_loss: 0.39376720786094666
val/loss: 1.2235634326934814
val/mlm_loss: 2.001497268676758


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_25/checkpoints/task-BAAUni-epoch=03-val_loss=1.18.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_25/checkpoints/BAAUni-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.2584507167339325, 'source_test/accuracy': 0.9302884936332703, 'source_test/f1': 0.930574357509613, 'source_test/f1_macro': 0.9241284728050232, 'source_test/f1_micro': 0.9302884936332703, 'target_test/loss': 0.32511916756629944, 'target_test/accuracy': 0.9134615659713745, 'target_test/f1': 0.9134844541549683, 'target_test/f1_macro': 0.9096165299415588, 'target_test/f1_micro': 0.9134615659713745}]
Best checkpoint path: checkpoints/lightning_logs/version_25/checkpoints/task-BAAUni-epoch=03-val_loss=1.18.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_25/checkpoints/BAAUni-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.19801834225654602, 'source_test/accuracy': 0.9278846383094788, 'source_test/f1': 0.9285699725151062, 'source_test/f1_macro': 0.9212303757667542, 'source_test/f1_micro': 0.9278846383094788, 'target_test/loss': 0.263218492269516, 'target_test/accuracy': 0.8990384936332703, 'target_test/f1': 0.8985690474510193, 'target_test/f1_macro': 0.8953264355659485, 'target_test/f1_micro': 0.8990384936332703}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 1000


Test Results on saved epoch: [{'source_test/loss': 0.2658151388168335, 'source_test/accuracy': 0.920673131942749, 'source_test/f1': 0.9206832051277161, 'source_test/f1_macro': 0.9148382544517517, 'source_test/f1_micro': 0.920673131942749, 'target_test/loss': 0.3396371901035309, 'target_test/accuracy': 0.9062500596046448, 'target_test/f1': 0.9054821133613586, 'target_test/f1_macro': 0.9035524725914001, 'target_test/f1_micro': 0.9062500596046448}]


Source dataset length: 1350
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.09375
val/f1: 0.03599211573600769
val/taskclf_loss: 1.1344900131225586
val/loss: 1.6924877166748047
val/mlm_loss: 2.2156105041503906


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.875
val/f1: 0.8747878074645996
val/taskclf_loss: 0.3232666850090027
val/loss: 1.2796027660369873
val/mlm_loss: 2.1761679649353027


Validation: |                                                                                                 …

val/accuracy: 0.893750011920929
val/f1: 0.8934937715530396
val/taskclf_loss: 0.29167965054512024
val/loss: 1.2200065851211548
val/mlm_loss: 2.090312957763672


Validation: |                                                                                                 …

val/accuracy: 0.8125
val/f1: 0.8134862780570984
val/taskclf_loss: 0.5419082641601562
val/loss: 1.375344157218933
val/mlm_loss: 2.1566905975341797


Validation: |                                                                                                 …

val/accuracy: 0.875
val/f1: 0.8748858571052551
val/taskclf_loss: 0.3624224066734314
val/loss: 1.2163156270980835
val/mlm_loss: 2.0168404579162598


Validation: |                                                                                                 …

val/accuracy: 0.84375
val/f1: 0.8452046513557434
val/taskclf_loss: 0.49828043580055237
val/loss: 1.2994273900985718
val/mlm_loss: 2.0505030155181885


Validation: |                                                                                                 …

val/accuracy: 0.8187500238418579
val/f1: 0.8212005496025085
val/taskclf_loss: 0.6340177059173584
val/loss: 1.376893401145935
val/mlm_loss: 2.0733397006988525


Validation: |                                                                                                 …

val/accuracy: 0.9000000357627869
val/f1: 0.8998001217842102
val/taskclf_loss: 0.3808436691761017
val/loss: 1.2745338678359985
val/mlm_loss: 2.1123688220977783


Validation: |                                                                                                 …

val/accuracy: 0.90625
val/f1: 0.9060066342353821
val/taskclf_loss: 0.3829919397830963
val/loss: 1.268903136253357
val/mlm_loss: 2.099445104598999


Validation: |                                                                                                 …

val/accuracy: 0.893750011920929
val/f1: 0.893568217754364
val/taskclf_loss: 0.3819461166858673
val/loss: 1.2983301877975464
val/mlm_loss: 2.157440423965454


Validation: |                                                                                                 …

val/accuracy: 0.9000000357627869
val/f1: 0.8999619483947754
val/taskclf_loss: 0.40509033203125
val/loss: 1.2117664813995361
val/mlm_loss: 1.9680252075195312


`Trainer.fit` stopped: `max_epochs=10` reached.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_26/checkpoints/task-BAAUni-epoch=09-val_loss=1.21.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_26/checkpoints/BAAUni-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.2582394778728485, 'source_test/accuracy': 0.9302884936332703, 'source_test/f1': 0.9305768609046936, 'source_test/f1_macro': 0.9248966574668884, 'source_test/f1_micro': 0.9302884936332703, 'target_test/loss': 0.3166048526763916, 'target_test/accuracy': 0.9182692766189575, 'target_test/f1': 0.9179753065109253, 'target_test/f1_macro': 0.9152382016181946, 'target_test/f1_micro': 0.9182692766189575}]
Best checkpoint path: checkpoints/lightning_logs/version_26/checkpoints/task-BAAUni-epoch=09-val_loss=1.21.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_26/checkpoints/BAAUni-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.2582394778728485, 'source_test/accuracy': 0.9302884936332703, 'source_test/f1': 0.9305768609046936, 'source_test/f1_macro': 0.9248966574668884, 'source_test/f1_micro': 0.9302884936332703, 'target_test/loss': 0.3166048526763916, 'target_test/accuracy': 0.9182692766189575, 'target_test/f1': 0.9179753065109253, 'target_test/f1_macro': 0.9152382016181946, 'target_test/f1_micro': 0.9182692766189575}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on saved epoch: [{'source_test/loss': 0.32999521493911743, 'source_test/accuracy': 0.8918269276618958, 'source_test/f1': 0.8921500444412231, 'source_test/f1_macro': 0.8847072720527649, 'source_test/f1_micro': 0.8918269276618958, 'target_test/loss': 0.40586477518081665, 'target_test/accuracy': 0.8822115659713745, 'target_test/f1': 0.8813265562057495, 'target_test/f1_macro': 0.8797746896743774, 'target_test/f1_micro': 0.8822115659713745}]


In [6]:
results.items()

dict_items([('last_epoch', {'source_test/loss': [0.26288849115371704, 0.2584507167339325, 0.2582394778728485], 'source_test/accuracy': [0.9182692766189575, 0.9302884936332703, 0.9302884936332703], 'source_test/f1': [0.9189677238464355, 0.930574357509613, 0.9305768609046936], 'source_test/f1_macro': [0.9112136960029602, 0.9241284728050232, 0.9248966574668884], 'source_test/f1_micro': [0.9182692766189575, 0.9302884936332703, 0.9302884936332703], 'target_test/loss': [0.31636372208595276, 0.32511916756629944, 0.3166048526763916], 'target_test/accuracy': [0.915865421295166, 0.9134615659713745, 0.9182692766189575], 'target_test/f1': [0.9154680371284485, 0.9134844541549683, 0.9179753065109253], 'target_test/f1_macro': [0.9129036068916321, 0.9096165299415588, 0.9152382016181946], 'target_test/f1_micro': [0.915865421295166, 0.9134615659713745, 0.9182692766189575]}), ('best_model', {'source_test/loss': [0.20023129880428314, 0.19801834225654602, 0.2582394778728485], 'source_test/accuracy': [0.927

In [7]:
# Calculate mean and standard deviation for each scenario
mean_results = {scenario: {key: np.mean(values) for key, values in metrics.items()} for scenario, metrics in results.items()}
std_results = {scenario: {key: np.std(values) for key, values in metrics.items()} for scenario, metrics in results.items()}

# Log mean and standard deviation results to wandb
# wandb.init(project=project_name, name=f'{domain}_mean_results')
# for scenario in mean_results:
#     for key, value in mean_results[scenario].items():
#         wandb.log({f"{scenario}/{key}": value})
#         wandb.log({f"{scenario}/{key}_std": std_results[scenario][key]})
# wandb.finish()

print("Mean Results:", mean_results)
print("Standard Deviation Results:", std_results)

# # Save the best model's adapter
# if model:
#     adapter_save_path = f"../../saved/adapter_after_run/{hparams['task_adapter_name']}"
#     model.save_adapter(adapter_save_path, hparams['task_adapter_name'])
#     print(f"Adapter saved to {adapter_save_path}")
# else:
#     print("No best model to save.")

Mean Results: {'last_epoch': {'source_test/loss': 0.259859561920166, 'source_test/accuracy': 0.9262820879618326, 'source_test/f1': 0.9267063140869141, 'source_test/f1_macro': 0.9200796087582906, 'source_test/f1_micro': 0.9262820879618326, 'target_test/loss': 0.3193625807762146, 'target_test/accuracy': 0.915865421295166, 'target_test/f1': 0.9156425992647806, 'target_test/f1_macro': 0.9125861128171285, 'target_test/f1_micro': 0.915865421295166}, 'best_model': {'source_test/loss': 0.2188297063112259, 'source_test/accuracy': 0.9286859234174093, 'source_test/f1': 0.9289953907330831, 'source_test/f1_macro': 0.9227374394734701, 'source_test/f1_micro': 0.9286859234174093, 'target_test/loss': 0.2759326448043187, 'target_test/accuracy': 0.9078526099522909, 'target_test/f1': 0.907340407371521, 'target_test/f1_macro': 0.9046952724456787, 'target_test/f1_micro': 0.9078526099522909}, 'epoch_saved': {'source_test/loss': 0.29800714055697125, 'source_test/accuracy': 0.9070513049761454, 'source_test/f1'

In [8]:
print('dones')

dones


In [9]:
best_val_loss

inf