In [1]:

import torch
import gc
torch.cuda.empty_cache()
gc.collect()


0

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Initialize the console

# Step 2: Import necessary libraries
from typing import Optional, Dict, Any
import os
import torch
import pytorch_lightning as pl
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from rich.traceback import install
from pytorch_lightning.loggers import WandbLogger
import torch.nn as nn
import torch.optim as optim
from transformers import AutoConfig
from adapters import AutoAdapterModel, AdapterConfig
from adapters.composition import Stack
from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping
import torchmetrics

install(show_locals=True)

from setup import setup_src_path
print(setup_src_path())
import data.processed as processed
import config.config as config
import utils.setup as setup
import utils.functions as fn
from importlib import reload

from datasets import load_from_disk

print(config.Config.TXT_SAVE_PATH)
print(config.Config.MODEL_SAVE_PATH)

#dataset = load_from_disk(f"../{config.Config.DATASETS_SAVE_PATH}/datasets")


['/home/guest/Desktop/projects/third-experiments/SDA_experiments/mlm', '/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages', '/tmp/tmpwy3lygux', '/home/guest/Desktop/projects/third-experiments/SDA_experiments/modules']


2024-09-23 07:15:25.762873: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-23 07:15:25.798259: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


./text-files/
./hp-model-


In [3]:
import torch
import os
import pytorch_lightning as pl
from transformers import  AutoConfig, DataCollatorForLanguageModeling
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import torchmetrics

class JointDomainTaskAdapter(pl.LightningModule):
    def __init__(self, hparams,source_dataset_length,target_dataset_length):
        super(JointDomainTaskAdapter, self).__init__()
        self.save_hyperparameters(hparams)

        # Load config with hidden states output
        self.config = AutoConfig.from_pretrained(self.hparams["pretrained_model_name"])
        self.config.output_hidden_states = True
        self.model = AutoAdapterModel.from_pretrained(self.hparams["pretrained_model_name"], config=self.config)

        # Set reduction factor and leave_out layers
        self.reduction_factor = self.hparams.get("reduction_factor", 16)
        self.leave_out = self.hparams.get("leave_out", [])
        # if self.leave_out != "None":
        #     self.leave_out = self.leave_out.split(",")
        #     self.leave_out = [int(i) for i in self.leave_out]
        # else:
        #     self.leave_out = []

        # Load MLM adapter with head
        self.model.load_adapter(f"{self.hparams['saved_adapter_dir']}/{self.hparams['domain_adapter_name']}", with_head=True)

        # Add classification head for the task
        self.model.add_classification_head(f"{self.hparams['task_adapter_name']}", num_labels=self.hparams["num_classes"])

        # Set active adapters
        self.model.train_adapter(self.hparams['domain_adapter_name'])
        # Calculate alpha based on dataset lengths
        self.alpha = source_dataset_length / (source_dataset_length + target_dataset_length)

        # Initialize loss functions and metrics
        self.criterion = nn.CrossEntropyLoss()
        self.mlm_criterion = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=self.hparams["num_classes"])
        self.f1 = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="weighted")
        self.f1_macro = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="macro")
        self.f1_micro = torchmetrics.F1Score(task='multiclass',num_classes=self.hparams["num_classes"], average="micro")

        self.softmax = nn.Softmax(dim=1)
        self.validation_outputs = []
        self.test_outputs = []
        # Optimizer related variables
        self.learning_rate = self.hparams.get("learning_rate", 1e-4)
        self.scheduler_factor = self.hparams.get("scheduler_factor", 0.1)
        self.scheduler_patience = self.hparams.get("scheduler_patience", 0.05)
        self.scheduler_threshold = self.hparams.get("scheduler_threshold", 0.0001)
        self.scheduler_cooldown = self.hparams.get("scheduler_cooldown", 0)
        self.scheduler_eps = self.hparams.get("scheduler_eps", 1e-8)

    def forward(self, input_ids, attention_mask=None, labels=None, task=None):
        if task == "mlm":
            self.model.active_head= self.hparams['domain_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        elif task == "classification":
            self.model.active_head= self.hparams['task_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        else:
            raise ValueError("Task must be either 'mlm' or 'classification'.")
        return outputs

    def training_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha

        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))

        metrics = {
            "train/accuracy": accuracy,
            "train/f1": f1,
            "train/taskclf_loss": task_loss,
            "train/loss": loss,
            "train/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return loss

    def validation_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha
        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        self.validation_outputs.append({
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
                })
        
        metrics = {
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return metrics
    def on_validation_epoch_start(self):
        self.validation_outputs = []
    
    def on_validation_epoch_end(self):
        outputs= self.validation_outputs

        avg_loss = torch.stack([x["val/loss"] for x in outputs]).mean()
        avg_task_loss = torch.stack([x["val/taskclf_loss"] for x in outputs]).mean()
        avg_mlm_loss = torch.stack([x["val/mlm_loss"] for x in outputs]).mean()
        avg_accuracy = torch.stack([x["val/accuracy"] for x in outputs]).mean()
        avg_f1 = torch.stack([x["val/f1"] for x in outputs]).mean()
        print(f"val/accuracy: {avg_accuracy}")
        print(f"val/f1: {avg_f1}")
        print(f"val/taskclf_loss: {avg_task_loss}")
        print(f"val/loss: {avg_loss}")
        print(f"val/mlm_loss: {avg_mlm_loss}")
        metrics = {
            "val/avg_loss": avg_loss,
            "val/avg_taskclf_loss": avg_task_loss,
            "val/avg_mlm_loss": avg_mlm_loss,
            "val/avg_accuracy": avg_accuracy,
            "val/avg_f1": avg_f1,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
        self.log("val_loss", avg_loss)

    def test_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        target_labels = batch["label_target"]

        # Classification task for source data
        cls_outputs_source = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits_source = cls_outputs_source.logits
        task_loss_source = self.criterion(cls_logits_source, source_labels)

        # Classification task for target data
        cls_outputs_target = self(input_ids=target_input_ids, attention_mask=target_attention_mask, task="classification")
        cls_logits_target = cls_outputs_target.logits
        task_loss_target = self.criterion(cls_logits_target, target_labels)

        # Combine losses (though typically you would evaluate them separately)
        loss = task_loss_source + task_loss_target

        accuracy_source = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_source = self.f1(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_macro_source = self.f1_macro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_micro_source = self.f1_micro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
    
        accuracy_target = self.accuracy(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_target = self.f1(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_macro_target = self.f1_macro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_micro_target = self.f1_micro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))

        metrics = {
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        self.test_outputs.append({
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        })
        return metrics
    def on_test_epoch_start(self):
        self.test_outputs = []

    def on_test_epoch_end(self):
        outputs=  self.test_outputs

        avg_loss_source = torch.stack([x["source_test/loss"] for x in outputs]).mean()
        avg_task_loss_target = torch.stack([x["target_test/loss"] for x in outputs]).mean()
        avg_accuracy_source = torch.stack([x["source_test/accuracy"] for x in outputs]).mean()
        avg_f1_source = torch.stack([x["source_test/f1"] for x in outputs]).mean()
        avg_f1_macro_source = torch.stack([x["source_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_source = torch.stack([x["source_test/f1_micro"] for x in outputs]).mean()

        avg_accuracy_target = torch.stack([x["target_test/accuracy"] for x in outputs]).mean()
        avg_f1_target = torch.stack([x["target_test/f1"] for x in outputs]).mean()
        avg_f1_macro_target = torch.stack([x["target_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_target = torch.stack([x["target_test/f1_micro"] for x in outputs]).mean()

        metrics = {
            "source_test/loss": avg_loss_source,
            "target_test/loss": avg_task_loss_target,
            "source_test/accuracy": avg_accuracy_source,
            "source_test/f1": avg_f1_source,
            "source_test/f1_macro": avg_f1_macro_source,
            "source_test/f1_micro": avg_f1_micro_source,
            "target_test/accuracy": avg_accuracy_target,
            "target_test/f1": avg_f1_target,
            "target_test/f1_macro": avg_f1_macro_target,
            "target_test/f1_micro": avg_f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
    def save_adapter(self, location, adapter_name):
        self.model.save_adapter(location, adapter_name)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams["learning_rate"])
        lr_scheduler = {
            'scheduler': optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.0001, cooldown=0, min_lr=1e-8),
            'monitor': 'val_loss'
        }
        return [optimizer], [lr_scheduler]


In [4]:
import wandb

wandb.login()
# Wandb setup and training loop
seeds = [42, 10, 100]  # List of seeds
project_name = 'SDA_mixed_edited'  # Replace with your wandb project name
domain = 'MRBO'  # Replace with the specific domain for this notebook
type = 'unipelt'  # Replace with the specific type for this notebook

# Initialize results dictionary
results = {
    "last_epoch": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "best_model": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "epoch_saved": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    }
}

best_val_loss = float('inf')
best_model = None
best_model_path = ""

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmrawhani5[0m ([33mmrawhani[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
reload(processed)
for seed in seeds:
    #wandb.init(project=project_name, name=f'{domain}_{type}_run_with_seed_{seed}', config={'seed': seed})

    try:
        seed_everything(seed)

        hparams = {
            "source_target": "MR_books",
            "dataset_cache_dir": "./../../datasets",
            "pretrained_model_name": "bert-base-uncased",
            "padding": True,
            "source_domain": "mr",
            "target_domain": "books",
            "domain_adapter_name": "mlm_unipelt_books",
            "task_adapter_name": "task_MRBOPelt",
            "padding": "max_length",
            "max_seq_length": 512,
            "bsz": 32,
            "num_classes": 3,
            "learning_rate": 1e-4,
            "reduction_factor": 16,
            "mode": "domain",
            "saved_adapter_dir": "../saved/adapters",
        }

        save_dir = "checkpoints"
        save_epoch_3 = 6  # Save model at the 3rd epoch
        
        dm = processed.DataModuleSourceTargetMixed(hparams)
        dm.setup('fit')
        dm.setup("test")
        source_length, target_length = dm.get_dataset_lengths()
        print(f"Source dataset length: {source_length}")
        print(f"Target dataset length: {target_length}")
        model = JointDomainTaskAdapter(hparams,source_length,target_length)

        checkpoint_callback = ModelCheckpoint(
            filename="task-MRBOPelt-{epoch:02d}-{val_loss:.2f}",
            save_top_k=1,
            monitor="val_loss",
            mode="min",
        )
        save_model_callback_epoch = ModelCheckpoint(
            filename="MRBOPelt-{epoch:02d}",
            every_n_epochs=save_epoch_3,
            save_top_k=-1,
        )

        #wandb_logger = WandbLogger()

    except Exception as e:
        print(f"Error during preprocessing : {e}")

    try:
        train_loader = dm.train_dataloader()
        val_loader = dm.val_dataloader()
        trainer = Trainer(
            max_epochs=10,
            accelerator="auto",
            #precision=16,
            
            default_root_dir="checkpoints",
            #logger=wandb_logger,
            callbacks=[checkpoint_callback, save_model_callback_epoch],
            limit_train_batches=1.0,
            limit_val_batches=1.0,
            limit_test_batches=1.0,
        )

        trainer.fit(model, train_loader, val_loader)
        print(f"Best checkpoint path: {checkpoint_callback.best_model_path}")
        print(f"Saved epoch checkpoint path: {save_model_callback_epoch.best_model_path}")
    except Exception as e:
        print(f"Error during training : {e}")

    try:
        dm.setup("test")
        test_loader = dm.test_dataloader()
        test_results_last = trainer.test(model, test_loader)
        print("Test Results Last Epoch:", test_results_last)

        # Collect results for last epoch model
        for key, value in test_results_last[0].items():
            results["last_epoch"][key].append(value)

        # Paths to the saved checkpoints
        best_checkpoint_path = checkpoint_callback.best_model_path
        saved_epoch_checkpoint_path = save_model_callback_epoch.best_model_path
        print(f"Best checkpoint path: {best_checkpoint_path}")
        print(f"Saved epoch checkpoint path: {saved_epoch_checkpoint_path}")

        best_model = JointDomainTaskAdapter.load_from_checkpoint(best_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_best = trainer.test(best_model, test_loader)
        print("Test Results on Best Model:", test_results_best)
        for key, value in test_results_best[0].items():
            results["best_model"][key].append(value)

        saved_epoch_model = JointDomainTaskAdapter.load_from_checkpoint(saved_epoch_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_saved_epoch = trainer.test(saved_epoch_model, test_loader)
        print("Test Results on saved epoch:", test_results_saved_epoch)
        for key, value in test_results_saved_epoch[0].items():
            results["epoch_saved"][key].append(value)

    except Exception as e:
        print(f"Error during testing: {e}")

    #wandb.finish()

Seed set to 42


Source dataset length: 1440
Target dataset length: 1440


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  state_dict = torch.load(weights_file, map_location="cpu")
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..
`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..
`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_crit

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.078125
val/f1: 0.038352273404598236
val/taskclf_loss: 1.128805160522461
val/loss: 1.6528668403625488
val/mlm_loss: 2.1769285202026367


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.7750000357627869
val/f1: 0.7812828421592712
val/taskclf_loss: 0.5922152400016785
val/loss: 1.3038036823272705
val/mlm_loss: 2.0153920650482178


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.7750000357627869
val/f1: 0.7772002220153809
val/taskclf_loss: 0.47836801409721375
val/loss: 1.246849775314331
val/mlm_loss: 2.015331506729126


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.7750000357627869
val/f1: 0.7771459817886353
val/taskclf_loss: 0.55448979139328
val/loss: 1.317535400390625
val/mlm_loss: 2.0805814266204834


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.800000011920929
val/f1: 0.8031917810440063
val/taskclf_loss: 0.5312477946281433
val/loss: 1.260378122329712
val/mlm_loss: 1.9895082712173462


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.793749988079071
val/f1: 0.7973625063896179
val/taskclf_loss: 0.5996103286743164
val/loss: 1.2670992612838745
val/mlm_loss: 1.9345883131027222


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.793749988079071
val/f1: 0.7946080565452576
val/taskclf_loss: 0.5280567407608032
val/loss: 1.2826896905899048
val/mlm_loss: 2.037322759628296


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.793749988079071
val/f1: 0.7946080565452576
val/taskclf_loss: 0.5293740630149841
val/loss: 1.2520166635513306
val/mlm_loss: 1.9746592044830322


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.793749988079071
val/f1: 0.7946080565452576
val/taskclf_loss: 0.5159169435501099
val/loss: 1.272216796875
val/mlm_loss: 2.0285167694091797


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.7875000238418579
val/f1: 0.7886084318161011
val/taskclf_loss: 0.5191428065299988
val/loss: 1.2803810834884644
val/mlm_loss: 2.041619300842285


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.793749988079071
val/f1: 0.7950267195701599
val/taskclf_loss: 0.5234270095825195
val/loss: 1.2355995178222656
val/mlm_loss: 1.9477722644805908


`Trainer.fit` stopped: `max_epochs=10` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_8/checkpoints/task-MRBOPelt-epoch=09-val_loss=1.24.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_8/checkpoints/MRBOPelt-epoch=05.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

Test Results Last Epoch: [{'source_test/loss': 0.4412817060947418, 'source_test/accuracy': 0.8389423489570618, 'source_test/f1': 0.8387799859046936, 'source_test/f1_macro': 0.8358154892921448, 'source_test/f1_micro': 0.8389423489570618, 'target_test/loss': 0.2771300673484802, 'target_test/accuracy': 0.884615421295166, 'target_test/f1': 0.8845086097717285, 'target_test/f1_macro': 0.8810461759567261, 'target_test/f1_micro': 0.884615421295166}]
Best checkpoint path: checkpoints/lightning_logs/version_8/checkpoints/task-MRBOPelt-epoch=09-val_loss=1.24.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_8/checkpoints/MRBOPelt-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
S

Testing: |          | 0/? [00:00<?, ?it/s]

Test Results on Best Model: [{'source_test/loss': 0.4412817060947418, 'source_test/accuracy': 0.8389423489570618, 'source_test/f1': 0.8387799859046936, 'source_test/f1_macro': 0.8358154892921448, 'source_test/f1_micro': 0.8389423489570618, 'target_test/loss': 0.2771300673484802, 'target_test/accuracy': 0.884615421295166, 'target_test/f1': 0.8845086097717285, 'target_test/f1_macro': 0.8810461759567261, 'target_test/f1_micro': 0.884615421295166}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

Seed set to 10


Test Results on saved epoch: [{'source_test/loss': 0.433317631483078, 'source_test/accuracy': 0.8317307829856873, 'source_test/f1': 0.831380307674408, 'source_test/f1_macro': 0.8290627002716064, 'source_test/f1_micro': 0.8317307829856873, 'target_test/loss': 0.2660354971885681, 'target_test/accuracy': 0.9038462042808533, 'target_test/f1': 0.9029963612556458, 'target_test/f1_macro': 0.9012662768363953, 'target_test/f1_micro': 0.9038462042808533}]
Source dataset length: 1440
Target dataset length: 1440


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  state_dict = torch.load(weights_file, map_location="cpu")
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..
`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..
`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_crit

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.515625
val/f1: 0.672429084777832
val/taskclf_loss: 1.1129541397094727
val/loss: 1.54093337059021
val/mlm_loss: 1.9689126014709473


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.7562500238418579
val/f1: 0.7619437575340271
val/taskclf_loss: 0.5866501331329346
val/loss: 1.3392713069915771
val/mlm_loss: 2.0918924808502197


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.78125
val/f1: 0.7819175124168396
val/taskclf_loss: 0.4594442546367645
val/loss: 1.2568590641021729
val/mlm_loss: 2.054273843765259


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.8125
val/f1: 0.8132239580154419
val/taskclf_loss: 0.4567345678806305
val/loss: 1.1936463117599487
val/mlm_loss: 1.9305578470230103


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.7750000357627869
val/f1: 0.7780197858810425
val/taskclf_loss: 0.5409205555915833
val/loss: 1.288773775100708
val/mlm_loss: 2.0366268157958984


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.78125
val/f1: 0.784276008605957
val/taskclf_loss: 0.5500704050064087
val/loss: 1.2980526685714722
val/mlm_loss: 2.0460352897644043


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.793749988079071
val/f1: 0.7960267663002014
val/taskclf_loss: 0.5786370635032654
val/loss: 1.294542908668518
val/mlm_loss: 2.010448932647705


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.8187500238418579
val/f1: 0.8194003105163574
val/taskclf_loss: 0.5212647318840027
val/loss: 1.2587093114852905
val/mlm_loss: 1.9961540699005127


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.824999988079071
val/f1: 0.8253687024116516
val/taskclf_loss: 0.5259949564933777
val/loss: 1.2454912662506104
val/mlm_loss: 1.9649876356124878


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.831250011920929
val/f1: 0.8314895033836365
val/taskclf_loss: 0.5268363952636719
val/loss: 1.2703179121017456
val/mlm_loss: 2.0137994289398193


Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.824999988079071
val/f1: 0.8253677487373352
val/taskclf_loss: 0.5282890200614929
val/loss: 1.2369831800460815
val/mlm_loss: 1.9456771612167358


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/task-MRBOPelt-epoch=02-val_loss=1.19.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/MRBOPelt-epoch=05.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

Test Results Last Epoch: [{'source_test/loss': 0.47617414593696594, 'source_test/accuracy': 0.8461538553237915, 'source_test/f1': 0.8460624814033508, 'source_test/f1_macro': 0.8429391384124756, 'source_test/f1_micro': 0.8461538553237915, 'target_test/loss': 0.2786411643028259, 'target_test/accuracy': 0.8701923489570618, 'target_test/f1': 0.8699262142181396, 'target_test/f1_macro': 0.8670653104782104, 'target_test/f1_micro': 0.8701923489570618}]
Best checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/task-MRBOPelt-epoch=02-val_loss=1.19.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/MRBOPelt-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
S

Testing: |          | 0/? [00:00<?, ?it/s]

Test Results on Best Model: [{'source_test/loss': 0.44168591499328613, 'source_test/accuracy': 0.8125000596046448, 'source_test/f1': 0.8123400807380676, 'source_test/f1_macro': 0.8093584775924683, 'source_test/f1_micro': 0.8125000596046448, 'target_test/loss': 0.3206457793712616, 'target_test/accuracy': 0.858173131942749, 'target_test/f1': 0.8573809862136841, 'target_test/f1_macro': 0.8551806211471558, 'target_test/f1_micro': 0.858173131942749}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

Seed set to 100


Test Results on saved epoch: [{'source_test/loss': 0.5187419056892395, 'source_test/accuracy': 0.8052884936332703, 'source_test/f1': 0.8086279630661011, 'source_test/f1_macro': 0.7980600595474243, 'source_test/f1_micro': 0.8052884936332703, 'target_test/loss': 0.35976073145866394, 'target_test/accuracy': 0.8197115659713745, 'target_test/f1': 0.8250596523284912, 'target_test/f1_macro': 0.8108423352241516, 'target_test/f1_micro': 0.8197115659713745}]
Source dataset length: 1440
Target dataset length: 1440


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  state_dict = torch.load(weights_file, map_location="cpu")
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..
`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..
`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_crit

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.0
val/f1: 0.0
val/taskclf_loss: 1.1902472972869873
val/loss: 1.641610860824585
val/mlm_loss: 2.0929744243621826


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.762499988079071
val/f1: 0.7630568742752075
val/taskclf_loss: 0.5610562562942505
val/loss: 1.2813835144042969
val/mlm_loss: 2.001710891723633


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.7875000238418579
val/f1: 0.7890188694000244
val/taskclf_loss: 0.4763394296169281
val/loss: 1.2545229196548462
val/mlm_loss: 2.0327062606811523


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.78125
val/f1: 0.7837560176849365
val/taskclf_loss: 0.5045025944709778
val/loss: 1.2858299016952515
val/mlm_loss: 2.06715726852417


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.7875000238418579
val/f1: 0.7905687689781189
val/taskclf_loss: 0.5639738440513611
val/loss: 1.2663908004760742
val/mlm_loss: 1.9688078165054321


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.78125
val/f1: 0.7860992550849915
val/taskclf_loss: 0.5916518568992615
val/loss: 1.3194841146469116
val/mlm_loss: 2.047316312789917


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.793749988079071
val/f1: 0.7953656911849976
val/taskclf_loss: 0.52274489402771
val/loss: 1.2567484378814697
val/mlm_loss: 1.9907516241073608


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.793749988079071
val/f1: 0.7950189113616943
val/taskclf_loss: 0.5194690227508545
val/loss: 1.3070529699325562
val/mlm_loss: 2.0946366786956787


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.7875000238418579
val/f1: 0.7888612747192383
val/taskclf_loss: 0.5211845636367798
val/loss: 1.2775263786315918
val/mlm_loss: 2.0338685512542725


Validation: |          | 0/? [00:00<?, ?it/s]

val/accuracy: 0.8125
val/f1: 0.8136871457099915
val/taskclf_loss: 0.5225681662559509
val/loss: 1.2481378316879272
val/mlm_loss: 1.9737074375152588


Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.8062500357627869
val/f1: 0.80764240026474
val/taskclf_loss: 0.5247572064399719
val/loss: 1.2724577188491821
val/mlm_loss: 2.020158529281616


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/task-MRBOPelt-epoch=08-val_loss=1.25.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/MRBOPelt-epoch=05.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

Test Results Last Epoch: [{'source_test/loss': 0.4471236765384674, 'source_test/accuracy': 0.8245192766189575, 'source_test/f1': 0.8249337673187256, 'source_test/f1_macro': 0.8203696608543396, 'source_test/f1_micro': 0.8245192766189575, 'target_test/loss': 0.2945479452610016, 'target_test/accuracy': 0.8605769276618958, 'target_test/f1': 0.8609468340873718, 'target_test/f1_macro': 0.8561791777610779, 'target_test/f1_micro': 0.8605769276618958}]
Best checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/task-MRBOPelt-epoch=08-val_loss=1.25.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/MRBOPelt-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
S

Testing: |          | 0/? [00:00<?, ?it/s]

Test Results on Best Model: [{'source_test/loss': 0.44402873516082764, 'source_test/accuracy': 0.8293269276618958, 'source_test/f1': 0.8296745419502258, 'source_test/f1_macro': 0.8251962065696716, 'source_test/f1_micro': 0.8293269276618958, 'target_test/loss': 0.29039710760116577, 'target_test/accuracy': 0.8629807829856873, 'target_test/f1': 0.8631165027618408, 'target_test/f1_macro': 0.8587766885757446, 'target_test/f1_micro': 0.8629807829856873}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

Test Results on saved epoch: [{'source_test/loss': 0.43794018030166626, 'source_test/accuracy': 0.826923131942749, 'source_test/f1': 0.8272470831871033, 'source_test/f1_macro': 0.822793185710907, 'source_test/f1_micro': 0.826923131942749, 'target_test/loss': 0.2829190492630005, 'target_test/accuracy': 0.8750000596046448, 'target_test/f1': 0.8743046522140503, 'target_test/f1_macro': 0.8721100091934204, 'target_test/f1_micro': 0.8750000596046448}]


In [6]:
results.items()

dict_items([('last_epoch', {'source_test/loss': [0.4412817060947418, 0.47617414593696594, 0.4471236765384674], 'source_test/accuracy': [0.8389423489570618, 0.8461538553237915, 0.8245192766189575], 'source_test/f1': [0.8387799859046936, 0.8460624814033508, 0.8249337673187256], 'source_test/f1_macro': [0.8358154892921448, 0.8429391384124756, 0.8203696608543396], 'source_test/f1_micro': [0.8389423489570618, 0.8461538553237915, 0.8245192766189575], 'target_test/loss': [0.2771300673484802, 0.2786411643028259, 0.2945479452610016], 'target_test/accuracy': [0.884615421295166, 0.8701923489570618, 0.8605769276618958], 'target_test/f1': [0.8845086097717285, 0.8699262142181396, 0.8609468340873718], 'target_test/f1_macro': [0.8810461759567261, 0.8670653104782104, 0.8561791777610779], 'target_test/f1_micro': [0.884615421295166, 0.8701923489570618, 0.8605769276618958]}), ('best_model', {'source_test/loss': [0.4412817060947418, 0.44168591499328613, 0.44402873516082764], 'source_test/accuracy': [0.8389

In [7]:
# Calculate mean and standard deviation for each scenario
mean_results = {scenario: {key: np.mean(values) for key, values in metrics.items()} for scenario, metrics in results.items()}
std_results = {scenario: {key: np.std(values) for key, values in metrics.items()} for scenario, metrics in results.items()}

# Log mean and standard deviation results to wandb
# wandb.init(project=project_name, name=f'{domain}_mean_results')
# for scenario in mean_results:
#     for key, value in mean_results[scenario].items():
#         wandb.log({f"{scenario}/{key}": value})
#         wandb.log({f"{scenario}/{key}_std": std_results[scenario][key]})
# wandb.finish()

print("Mean Results:", mean_results)
print("Standard Deviation Results:", std_results)

# # Save the best model's adapter
# if model:
#     adapter_save_path = f"../../saved/adapter_after_run/{hparams['task_adapter_name']}"
#     model.save_adapter(adapter_save_path, hparams['task_adapter_name'])
#     print(f"Adapter saved to {adapter_save_path}")
# else:
#     print("No best model to save.")

Mean Results: {'last_epoch': {'source_test/loss': 0.45485984285672504, 'source_test/accuracy': 0.8365384936332703, 'source_test/f1': 0.8365920782089233, 'source_test/f1_macro': 0.8330414295196533, 'source_test/f1_micro': 0.8365384936332703, 'target_test/loss': 0.2834397256374359, 'target_test/accuracy': 0.8717948993047079, 'target_test/f1': 0.8717938860257467, 'target_test/f1_macro': 0.8680968880653381, 'target_test/f1_micro': 0.8717948993047079}, 'best_model': {'source_test/loss': 0.44233211874961853, 'source_test/accuracy': 0.8269231120745341, 'source_test/f1': 0.8269315361976624, 'source_test/f1_macro': 0.8234567244847616, 'source_test/f1_micro': 0.8269231120745341, 'target_test/loss': 0.29605765144030255, 'target_test/accuracy': 0.8685897787412008, 'target_test/f1': 0.8683353662490845, 'target_test/f1_macro': 0.8650011618932089, 'target_test/f1_micro': 0.8685897787412008}, 'epoch_saved': {'source_test/loss': 0.46333323915799457, 'source_test/accuracy': 0.8213141361872355, 'source_t

In [8]:
print('dones')

dones


In [9]:
best_val_loss

inf

: 