In [1]:

import torch
import gc
torch.cuda.empty_cache()
gc.collect()


0

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Initialize the console

# Step 2: Import necessary libraries
from typing import Optional, Dict, Any
import os
import torch
import pytorch_lightning as pl
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from rich.traceback import install
from pytorch_lightning.loggers import WandbLogger
import torch.nn as nn
import torch.optim as optim
from transformers import AutoConfig
from adapters import AutoAdapterModel, AdapterConfig
from adapters.composition import Stack
from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping
import torchmetrics

install(show_locals=True)

from setup import setup_src_path
print(setup_src_path())
import data.processed as processed
import config.config as config
import utils.setup as setup
import utils.functions as fn
from importlib import reload

from datasets import load_from_disk

print(config.Config.TXT_SAVE_PATH)
print(config.Config.MODEL_SAVE_PATH)

dataset = load_from_disk(f"../{config.Config.DATASETS_SAVE_PATH}/datasets")


['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages', '/tmp/tmplf1ge_80', '/home/guest/Desktop/projects/third-experiments/domain_adaptation_project/modules']


2024-09-22 07:39:10.376942: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-22 07:39:10.521348: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




./text-files/
./hp-model-


In [3]:
import torch
import os
import pytorch_lightning as pl
from transformers import  AutoConfig, DataCollatorForLanguageModeling
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import torchmetrics

class JointDomainTaskAdapter(pl.LightningModule):
    def __init__(self, hparams,source_dataset_length,target_dataset_length):
        super(JointDomainTaskAdapter, self).__init__()
        self.save_hyperparameters(hparams)

        # Load config with hidden states output
        self.config = AutoConfig.from_pretrained(self.hparams["pretrained_model_name"])
        self.config.output_hidden_states = True
        self.model = AutoAdapterModel.from_pretrained(self.hparams["pretrained_model_name"], config=self.config)

        # Set reduction factor and leave_out layers
        self.reduction_factor = self.hparams.get("reduction_factor", 16)
        self.leave_out = self.hparams.get("leave_out", [])
        # if self.leave_out != "None":
        #     self.leave_out = self.leave_out.split(",")
        #     self.leave_out = [int(i) for i in self.leave_out]
        # else:
        #     self.leave_out = []

        # Load MLM adapter with head
        self.model.load_adapter(f"{self.hparams['saved_adapter_dir']}/{self.hparams['domain_adapter_name']}", with_head=True)

        # Add classification head for the task
        self.model.add_classification_head(f"{self.hparams['task_adapter_name']}", num_labels=self.hparams["num_classes"])

        # Set active adapters
        self.model.train_adapter(self.hparams['domain_adapter_name'])
        # Calculate alpha based on dataset lengths
        self.alpha = source_dataset_length / (source_dataset_length + target_dataset_length)

        # Initialize loss functions and metrics
        self.criterion = nn.CrossEntropyLoss()
        self.mlm_criterion = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=self.hparams["num_classes"])
        self.f1 = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="weighted")
        self.f1_macro = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="macro")
        self.f1_micro = torchmetrics.F1Score(task='multiclass',num_classes=self.hparams["num_classes"], average="micro")

        self.softmax = nn.Softmax(dim=1)
        self.validation_outputs = []
        self.test_outputs = []
        # Optimizer related variables
        self.learning_rate = self.hparams.get("learning_rate", 1e-4)
        self.scheduler_factor = self.hparams.get("scheduler_factor", 0.1)
        self.scheduler_patience = self.hparams.get("scheduler_patience", 0.05)
        self.scheduler_threshold = self.hparams.get("scheduler_threshold", 0.0001)
        self.scheduler_cooldown = self.hparams.get("scheduler_cooldown", 0)
        self.scheduler_eps = self.hparams.get("scheduler_eps", 1e-8)

    def forward(self, input_ids, attention_mask=None, labels=None, task=None):
        if task == "mlm":
            self.model.active_head= self.hparams['domain_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        elif task == "classification":
            self.model.active_head= self.hparams['task_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        else:
            raise ValueError("Task must be either 'mlm' or 'classification'.")
        return outputs

    def training_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha

        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))

        metrics = {
            "train/accuracy": accuracy,
            "train/f1": f1,
            "train/taskclf_loss": task_loss,
            "train/loss": loss,
            "train/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return loss

    def validation_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha
        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        self.validation_outputs.append({
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
                })
        
        metrics = {
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return metrics
    def on_validation_epoch_start(self):
        self.validation_outputs = []
    
    def on_validation_epoch_end(self):
        outputs= self.validation_outputs

        avg_loss = torch.stack([x["val/loss"] for x in outputs]).mean()
        avg_task_loss = torch.stack([x["val/taskclf_loss"] for x in outputs]).mean()
        avg_mlm_loss = torch.stack([x["val/mlm_loss"] for x in outputs]).mean()
        avg_accuracy = torch.stack([x["val/accuracy"] for x in outputs]).mean()
        avg_f1 = torch.stack([x["val/f1"] for x in outputs]).mean()
        print(f"val/accuracy: {avg_accuracy}")
        print(f"val/f1: {avg_f1}")
        print(f"val/taskclf_loss: {avg_task_loss}")
        print(f"val/loss: {avg_loss}")
        print(f"val/mlm_loss: {avg_mlm_loss}")
        metrics = {
            "val/avg_loss": avg_loss,
            "val/avg_taskclf_loss": avg_task_loss,
            "val/avg_mlm_loss": avg_mlm_loss,
            "val/avg_accuracy": avg_accuracy,
            "val/avg_f1": avg_f1,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
        self.log("val_loss", avg_loss)

    def test_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        target_labels = batch["label_target"]

        # Classification task for source data
        cls_outputs_source = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits_source = cls_outputs_source.logits
        task_loss_source = self.criterion(cls_logits_source, source_labels)

        # Classification task for target data
        cls_outputs_target = self(input_ids=target_input_ids, attention_mask=target_attention_mask, task="classification")
        cls_logits_target = cls_outputs_target.logits
        task_loss_target = self.criterion(cls_logits_target, target_labels)

        # Combine losses (though typically you would evaluate them separately)
        loss = task_loss_source + task_loss_target

        accuracy_source = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_source = self.f1(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_macro_source = self.f1_macro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_micro_source = self.f1_micro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
    
        accuracy_target = self.accuracy(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_target = self.f1(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_macro_target = self.f1_macro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_micro_target = self.f1_micro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))

        metrics = {
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        self.test_outputs.append({
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        })
        return metrics
    def on_test_epoch_start(self):
        self.test_outputs = []

    def on_test_epoch_end(self):
        outputs=  self.test_outputs

        avg_loss_source = torch.stack([x["source_test/loss"] for x in outputs]).mean()
        avg_task_loss_target = torch.stack([x["target_test/loss"] for x in outputs]).mean()
        avg_accuracy_source = torch.stack([x["source_test/accuracy"] for x in outputs]).mean()
        avg_f1_source = torch.stack([x["source_test/f1"] for x in outputs]).mean()
        avg_f1_macro_source = torch.stack([x["source_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_source = torch.stack([x["source_test/f1_micro"] for x in outputs]).mean()

        avg_accuracy_target = torch.stack([x["target_test/accuracy"] for x in outputs]).mean()
        avg_f1_target = torch.stack([x["target_test/f1"] for x in outputs]).mean()
        avg_f1_macro_target = torch.stack([x["target_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_target = torch.stack([x["target_test/f1_micro"] for x in outputs]).mean()

        metrics = {
            "source_test/loss": avg_loss_source,
            "target_test/loss": avg_task_loss_target,
            "source_test/accuracy": avg_accuracy_source,
            "source_test/f1": avg_f1_source,
            "source_test/f1_macro": avg_f1_macro_source,
            "source_test/f1_micro": avg_f1_micro_source,
            "target_test/accuracy": avg_accuracy_target,
            "target_test/f1": avg_f1_target,
            "target_test/f1_macro": avg_f1_macro_target,
            "target_test/f1_micro": avg_f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
    def save_adapter(self, location, adapter_name):
        self.model.save_adapter(location, adapter_name)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams["learning_rate"])
        lr_scheduler = {
            'scheduler': optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.0001, cooldown=0, min_lr=1e-8),
            'monitor': 'val_loss'
        }
        return [optimizer], [lr_scheduler]


In [4]:
import wandb

wandb.login()
# Wandb setup and training loop
seeds = [42, 10, 100]  # List of seeds
project_name = 'mixed_edited'  # Replace with your wandb project name
domain = 'STE'  # Replace with the specific domain for this notebook
type = 'union'  # Replace with the specific type for this notebook

# Initialize results dictionary
results = {
    "last_epoch": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "best_model": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "epoch_saved": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    }
}

best_val_loss = float('inf')
best_model = None
best_model_path = ""

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
reload(processed)
for seed in seeds:
    #wandb.init(project=project_name, name=f'{domain}_{type}_run_with_seed_{seed}', config={'seed': seed})

    try:
        seed_everything(seed)

        hparams = {
            "source_target": "slate_telephone",
            "source_domain": "slate",
            "target_domain": "telephone",
            "domain_adapter_name": "mlm_union_TE",
            "task_adapter_name": "STEUni",
            "pretrained_model_name": "bert-base-uncased",
            "padding": "max_length",
            "max_seq_length": 128,
            "bsz": 32,
            "num_classes": 3,
            "learning_rate": 1e-4,
            "reduction_factor": 16,
            "mode": "domain",
            "saved_adapter_dir": "../../saved/adapters",
        }

        save_dir = "checkpoints"
        save_epoch_3 = 6  # Save model at the 3rd epoch
        
        dm = processed.DataModuleSourceTargetMixed(hparams)
        dm.setup('fit')
        dm.setup("test")
        source_length, target_length = dm.get_dataset_lengths()
        print(f"Source dataset length: {source_length}")
        print(f"Target dataset length: {target_length}")
        model = JointDomainTaskAdapter(hparams,source_length,target_length)

        checkpoint_callback = ModelCheckpoint(
            filename="task-STEUni-{epoch:02d}-{val_loss:.2f}",
            save_top_k=1,
            monitor="val_loss",
            mode="min",
        )
        save_model_callback_epoch = ModelCheckpoint(
            filename="STEUni-{epoch:02d}",
            every_n_epochs=save_epoch_3,
            save_top_k=-1,
        )

        #wandb_logger = WandbLogger()

    except Exception as e:
        print(f"Error during preprocessing : {e}")

    try:
        train_loader = dm.train_dataloader()
        val_loader = dm.val_dataloader()
        trainer = Trainer(
            max_epochs=10,
            accelerator="auto",
            precision=16,
            
            default_root_dir="checkpoints",
            #logger=wandb_logger,
            callbacks=[checkpoint_callback, save_model_callback_epoch],
            limit_train_batches=1.0,
            limit_val_batches=1.0,
            limit_test_batches=1.0,
        )

        trainer.fit(model, train_loader, val_loader)
        print(f"Best checkpoint path: {checkpoint_callback.best_model_path}")
        print(f"Saved epoch checkpoint path: {save_model_callback_epoch.best_model_path}")
    except Exception as e:
        print(f"Error during training : {e}")

    try:
        dm.setup("test")
        test_loader = dm.test_dataloader()
        test_results_last = trainer.test(model, test_loader)
        print("Test Results Last Epoch:", test_results_last)

        # Collect results for last epoch model
        for key, value in test_results_last[0].items():
            results["last_epoch"][key].append(value)

        # Paths to the saved checkpoints
        best_checkpoint_path = checkpoint_callback.best_model_path
        saved_epoch_checkpoint_path = save_model_callback_epoch.best_model_path
        print(f"Best checkpoint path: {best_checkpoint_path}")
        print(f"Saved epoch checkpoint path: {saved_epoch_checkpoint_path}")

        best_model = JointDomainTaskAdapter.load_from_checkpoint(best_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_best = trainer.test(best_model, test_loader)
        print("Test Results on Best Model:", test_results_best)
        for key, value in test_results_best[0].items():
            results["best_model"][key].append(value)

        saved_epoch_model = JointDomainTaskAdapter.load_from_checkpoint(saved_epoch_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_saved_epoch = trainer.test(saved_epoch_model, test_loader)
        print("Test Results on saved epoch:", test_results_saved_epoch)
        for key, value in test_results_saved_epoch[0].items():
            results["epoch_saved"][key].append(value)

    except Exception as e:
        print(f"Error during testing: {e}")

    #wandb.finish()

Seed set to 42




Batch size: 32


Source genre: slate
Target genre: telephone
Number of target samples: 75013


Source genre: slate
Target genre: telephone
Number of target samples: 75013


Source dataset length: 69575
Target dataset length: 24796


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/amp.py:55: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


eee



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.28125
val/f1: 0.36440348625183105
val/taskclf_loss: 1.1094970703125
val/loss: 1.2947421073913574
val/mlm_loss: 1.8145205974578857


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.7362167835235596
val/f1: 0.7373403906822205
val/taskclf_loss: 0.62300044298172
val/loss: 0.9088782668113708
val/mlm_loss: 1.7110215425491333


Validation: |                                                                                                 …

val/accuracy: 0.7564022541046143
val/f1: 0.7564969062805176
val/taskclf_loss: 0.6040964722633362
val/loss: 0.8894899487495422
val/mlm_loss: 1.6902742385864258


Validation: |                                                                                                 …

val/accuracy: 0.7566196918487549
val/f1: 0.7572610974311829
val/taskclf_loss: 0.6223044991493225
val/loss: 0.9024817943572998
val/mlm_loss: 1.6886299848556519


Validation: |                                                                                                 …

val/accuracy: 0.7553691864013672
val/f1: 0.7566969990730286
val/taskclf_loss: 0.6620020866394043
val/loss: 0.9336988925933838
val/mlm_loss: 1.6960519552230835


Validation: |                                                                                                 …

val/accuracy: 0.7503330111503601
val/f1: 0.7510454058647156
val/taskclf_loss: 0.7441822290420532
val/loss: 0.9904718995094299
val/mlm_loss: 1.6815351247787476


Validation: |                                                                                                 …

val/accuracy: 0.7589848637580872
val/f1: 0.7588109374046326
val/taskclf_loss: 0.828850269317627
val/loss: 1.0528327226638794
val/mlm_loss: 1.6813040971755981


Validation: |                                                                                                 …

val/accuracy: 0.7592431306838989
val/f1: 0.7589290142059326
val/taskclf_loss: 0.8668319582939148
val/loss: 1.0812046527862549
val/mlm_loss: 1.682712435722351


Validation: |                                                                                                 …

val/accuracy: 0.7548526525497437
val/f1: 0.7548421621322632
val/taskclf_loss: 0.9110175967216492
val/loss: 1.109224796295166
val/mlm_loss: 1.665373682975769


Validation: |                                                                                                 …

val/accuracy: 0.760276198387146
val/f1: 0.7598219513893127
val/taskclf_loss: 0.8910138607025146
val/loss: 1.0974698066711426
val/mlm_loss: 1.6767635345458984


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.760276198387146
val/f1: 0.7597655653953552
val/taskclf_loss: 0.8933378458023071
val/loss: 1.0938732624053955
val/mlm_loss: 1.6565548181533813


Best checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/task-STEUni-epoch=01-val_loss=0.89.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/STEUni-epoch=05.ckpt


Source genre: slate
Target genre: telephone
Number of target samples: 75013


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.9296032190322876, 'source_test/accuracy': 0.7620247602462769, 'source_test/f1': 0.760951817035675, 'source_test/f1_macro': 0.7537253499031067, 'source_test/f1_micro': 0.7620247602462769, 'target_test/loss': 0.7408133149147034, 'target_test/accuracy': 0.7880184054374695, 'target_test/f1': 0.7875913977622986, 'target_test/f1_macro': 0.7794213891029358, 'target_test/f1_micro': 0.7880184054374695}]
Best checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/task-STEUni-epoch=01-val_loss=0.89.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/STEUni-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.6355653405189514, 'source_test/accuracy': 0.7489199042320251, 'source_test/f1': 0.7482152581214905, 'source_test/f1_macro': 0.7409502267837524, 'source_test/f1_micro': 0.7489199042320251, 'target_test/loss': 0.5555121898651123, 'target_test/accuracy': 0.781466007232666, 'target_test/f1': 0.7818046808242798, 'target_test/f1_macro': 0.770185112953186, 'target_test/f1_micro': 0.781466007232666}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 10


Test Results on saved epoch: [{'source_test/loss': 0.8791390657424927, 'source_test/accuracy': 0.7554723620414734, 'source_test/f1': 0.7548021674156189, 'source_test/f1_macro': 0.7465552091598511, 'source_test/f1_micro': 0.7554723620414734, 'target_test/loss': 0.6901920437812805, 'target_test/accuracy': 0.7849942445755005, 'target_test/f1': 0.7855625748634338, 'target_test/f1_macro': 0.775485098361969, 'target_test/f1_micro': 0.7849942445755005}]
Batch size: 32


Source genre: slate


Target genre: telephone
Number of target samples: 75013


Source genre: slate
Target genre: telephone
Number of target samples: 75013
Source dataset length: 69575
Target dataset length: 24796


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 16bit Automatic Mixed Precision (AMP)


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


eee


Sanity Checking: |                                                                                            …

val/accuracy: 0.203125
val/f1: 0.2623331546783447
val/taskclf_loss: 1.115142822265625
val/loss: 1.3145127296447754
val/mlm_loss: 1.8739242553710938


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.7387518286705017
val/f1: 0.7396082878112793
val/taskclf_loss: 0.625933825969696
val/loss: 0.9088324904441833
val/mlm_loss: 1.7026164531707764


Validation: |                                                                                                 …

val/accuracy: 0.7545943856239319
val/f1: 0.7550539970397949
val/taskclf_loss: 0.6001542806625366
val/loss: 0.8873560428619385
val/mlm_loss: 1.6932145357131958


Validation: |                                                                                                 …

val/accuracy: 0.7582576274871826
val/f1: 0.759665310382843
val/taskclf_loss: 0.617860734462738
val/loss: 0.8971555829048157
val/mlm_loss: 1.6808282136917114


Validation: |                                                                                                 …

val/accuracy: 0.7572653293609619
val/f1: 0.7579877972602844
val/taskclf_loss: 0.6603451371192932
val/loss: 0.9321371912956238
val/mlm_loss: 1.694757342338562


Validation: |                                                                                                 …

val/accuracy: 0.7572653293609619
val/f1: 0.7585025429725647
val/taskclf_loss: 0.6959840059280396
val/loss: 0.9555988907814026
val/mlm_loss: 1.684051275253296


Validation: |                                                                                                 …

val/accuracy: 0.7640685439109802
val/f1: 0.7638418078422546
val/taskclf_loss: 0.8060318231582642
val/loss: 1.0343241691589355
val/mlm_loss: 1.6748888492584229


Validation: |                                                                                                 …

val/accuracy: 0.7663522362709045
val/f1: 0.7661585807800293
val/taskclf_loss: 0.8529917597770691
val/loss: 1.0669221878051758
val/mlm_loss: 1.667189121246338


Validation: |                                                                                                 …

val/accuracy: 0.7671270370483398
val/f1: 0.7670072317123413
val/taskclf_loss: 0.8805840611457825
val/loss: 1.0851891040802002
val/mlm_loss: 1.6592899560928345


Validation: |                                                                                                 …

val/accuracy: 0.7671270370483398
val/f1: 0.7667848467826843
val/taskclf_loss: 0.8665807247161865
val/loss: 1.078875184059143
val/mlm_loss: 1.6745511293411255


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.7668687701225281
val/f1: 0.766633927822113
val/taskclf_loss: 0.8699375987052917
val/loss: 1.0772453546524048
val/mlm_loss: 1.658929467201233


Best checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/task-STEUni-epoch=01-val_loss=0.89.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/STEUni-epoch=05.ckpt


Source genre: slate
Target genre: telephone
Number of target samples: 75013


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.8986736536026001, 'source_test/accuracy': 0.758352518081665, 'source_test/f1': 0.7581194043159485, 'source_test/f1_macro': 0.7489322423934937, 'source_test/f1_micro': 0.758352518081665, 'target_test/loss': 0.7571266889572144, 'target_test/accuracy': 0.7804579138755798, 'target_test/f1': 0.7808834910392761, 'target_test/f1_macro': 0.7700367569923401, 'target_test/f1_micro': 0.7804579138755798}]
Best checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/task-STEUni-epoch=01-val_loss=0.89.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/STEUni-epoch=05.ckpt


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.6368027925491333, 'source_test/accuracy': 0.7475517988204956, 'source_test/f1': 0.7484863996505737, 'source_test/f1_macro': 0.7355306148529053, 'source_test/f1_micro': 0.7475517988204956, 'target_test/loss': 0.5470713973045349, 'target_test/accuracy': 0.781466007232666, 'target_test/f1': 0.7825949192047119, 'target_test/f1_macro': 0.7700400352478027, 'target_test/f1_micro': 0.781466007232666}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 100


Test Results on saved epoch: [{'source_test/loss': 0.8369835615158081, 'source_test/accuracy': 0.7620247602462769, 'source_test/f1': 0.7618276476860046, 'source_test/f1_macro': 0.7524399161338806, 'source_test/f1_micro': 0.7620247602462769, 'target_test/loss': 0.6964091658592224, 'target_test/accuracy': 0.7849942445755005, 'target_test/f1': 0.7854841351509094, 'target_test/f1_macro': 0.7735916376113892, 'target_test/f1_micro': 0.7849942445755005}]
Batch size: 32


Source genre: slate


Target genre: telephone
Number of target samples: 75013


Source genre: slate
Target genre: telephone
Number of target samples: 75013


Source dataset length: 69575
Target dataset length: 24796


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 16bit Automatic Mixed Precision (AMP)


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


eee


Sanity Checking: |                                                                                            …

val/accuracy: 0.46875
val/f1: 0.6376811265945435
val/taskclf_loss: 1.06817626953125
val/loss: 1.2646101713180542
val/mlm_loss: 1.8157832622528076


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.7326826453208923
val/f1: 0.732309877872467
val/taskclf_loss: 0.632810652256012
val/loss: 0.9123727679252625
val/mlm_loss: 1.6967949867248535


Validation: |                                                                                                 …

val/accuracy: 0.7526165843009949
val/f1: 0.7536707520484924
val/taskclf_loss: 0.6108440160751343
val/loss: 0.8993319869041443
val/mlm_loss: 1.708799123764038


Validation: |                                                                                                 …

val/accuracy: 0.7582100629806519
val/f1: 0.7586303949356079
val/taskclf_loss: 0.6217656135559082
val/loss: 0.8990458250045776
val/mlm_loss: 1.677065134048462


Validation: |                                                                                                 …

val/accuracy: 0.7546418905258179
val/f1: 0.7552939057350159
val/taskclf_loss: 0.6452203392982483
val/loss: 0.9180566072463989
val/mlm_loss: 1.683606743812561


Validation: |                                                                                                 …

val/accuracy: 0.7560148239135742
val/f1: 0.7566142678260803
val/taskclf_loss: 0.7224615216255188
val/loss: 0.9769792556762695
val/mlm_loss: 1.691129446029663


Validation: |                                                                                                 …

val/accuracy: 0.7490416765213013
val/f1: 0.7508040070533752
val/taskclf_loss: 0.789377748966217
val/loss: 1.0303195714950562
val/mlm_loss: 1.7063770294189453


Validation: |                                                                                                 …

val/accuracy: 0.7632462382316589
val/f1: 0.7626950144767761
val/taskclf_loss: 0.8840320706367493
val/loss: 1.0936893224716187
val/mlm_loss: 1.6819653511047363


Validation: |                                                                                                 …

val/accuracy: 0.7630763053894043
val/f1: 0.7627162337303162
val/taskclf_loss: 0.9181881546974182
val/loss: 1.114625096321106
val/mlm_loss: 1.665806770324707


Validation: |                                                                                                 …

val/accuracy: 0.762301504611969
val/f1: 0.7617354989051819
val/taskclf_loss: 0.9542267918586731
val/loss: 1.1410448551177979
val/mlm_loss: 1.6652367115020752


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.7649725079536438
val/f1: 0.7643362879753113
val/taskclf_loss: 0.9448283314704895
val/loss: 1.1343330144882202
val/mlm_loss: 1.6660631895065308


Best checkpoint path: checkpoints/lightning_logs/version_11/checkpoints/task-STEUni-epoch=02-val_loss=0.90.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_11/checkpoints/STEUni-epoch=05.ckpt


Source genre: slate
Target genre: telephone
Number of target samples: 75013


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.9857902526855469, 'source_test/accuracy': 0.7590005993843079, 'source_test/f1': 0.7583466172218323, 'source_test/f1_macro': 0.7504287362098694, 'source_test/f1_micro': 0.7590005993843079, 'target_test/loss': 0.8140583634376526, 'target_test/accuracy': 0.784490168094635, 'target_test/f1': 0.7846216559410095, 'target_test/f1_macro': 0.7741820216178894, 'target_test/f1_micro': 0.784490168094635}]
Best checkpoint path: checkpoints/lightning_logs/version_11/checkpoints/task-STEUni-epoch=02-val_loss=0.90.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_11/checkpoints/STEUni-epoch=05.ckpt


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.6395341157913208, 'source_test/accuracy': 0.7584965229034424, 'source_test/f1': 0.7589713931083679, 'source_test/f1_macro': 0.7479320168495178, 'source_test/f1_micro': 0.7584965229034424, 'target_test/loss': 0.5592954158782959, 'target_test/accuracy': 0.7744095325469971, 'target_test/f1': 0.7760153412818909, 'target_test/f1_macro': 0.7616340517997742, 'target_test/f1_micro': 0.7744095325469971}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on saved epoch: [{'source_test/loss': 0.8007138967514038, 'source_test/accuracy': 0.7546082735061646, 'source_test/f1': 0.754725992679596, 'source_test/f1_macro': 0.7432509660720825, 'source_test/f1_micro': 0.7546082735061646, 'target_test/loss': 0.7045333981513977, 'target_test/accuracy': 0.7653369903564453, 'target_test/f1': 0.7682718634605408, 'target_test/f1_macro': 0.7485380172729492, 'target_test/f1_micro': 0.7653369903564453}]


In [6]:
results.items()

dict_items([('last_epoch', {'source_test/loss': [0.9296032190322876, 0.8986736536026001, 0.9857902526855469], 'source_test/accuracy': [0.7620247602462769, 0.758352518081665, 0.7590005993843079], 'source_test/f1': [0.760951817035675, 0.7581194043159485, 0.7583466172218323], 'source_test/f1_macro': [0.7537253499031067, 0.7489322423934937, 0.7504287362098694], 'source_test/f1_micro': [0.7620247602462769, 0.758352518081665, 0.7590005993843079], 'target_test/loss': [0.7408133149147034, 0.7571266889572144, 0.8140583634376526], 'target_test/accuracy': [0.7880184054374695, 0.7804579138755798, 0.784490168094635], 'target_test/f1': [0.7875913977622986, 0.7808834910392761, 0.7846216559410095], 'target_test/f1_macro': [0.7794213891029358, 0.7700367569923401, 0.7741820216178894], 'target_test/f1_micro': [0.7880184054374695, 0.7804579138755798, 0.784490168094635]}), ('best_model', {'source_test/loss': [0.6355653405189514, 0.6368027925491333, 0.6395341157913208], 'source_test/accuracy': [0.7489199042

In [7]:
# Calculate mean and standard deviation for each scenario
mean_results = {scenario: {key: np.mean(values) for key, values in metrics.items()} for scenario, metrics in results.items()}
std_results = {scenario: {key: np.std(values) for key, values in metrics.items()} for scenario, metrics in results.items()}

# Log mean and standard deviation results to wandb
# wandb.init(project=project_name, name=f'{domain}_mean_results')
# for scenario in mean_results:
#     for key, value in mean_results[scenario].items():
#         wandb.log({f"{scenario}/{key}": value})
#         wandb.log({f"{scenario}/{key}_std": std_results[scenario][key]})
# wandb.finish()

print("Mean Results:", mean_results)
print("Standard Deviation Results:", std_results)

# # Save the best model's adapter
# if model:
#     adapter_save_path = f"../../saved/adapter_after_run/{hparams['task_adapter_name']}"
#     model.save_adapter(adapter_save_path, hparams['task_adapter_name'])
#     print(f"Adapter saved to {adapter_save_path}")
# else:
#     print("No best model to save.")

Mean Results: {'last_epoch': {'source_test/loss': 0.9380223751068115, 'source_test/accuracy': 0.7597926259040833, 'source_test/f1': 0.7591392795244852, 'source_test/f1_macro': 0.7510287761688232, 'source_test/f1_micro': 0.7597926259040833, 'target_test/loss': 0.7706661224365234, 'target_test/accuracy': 0.7843221624692281, 'target_test/f1': 0.7843655149141947, 'target_test/f1_macro': 0.7745467225710551, 'target_test/f1_micro': 0.7843221624692281}, 'best_model': {'source_test/loss': 0.6373007496198019, 'source_test/accuracy': 0.7516560753186544, 'source_test/f1': 0.751891016960144, 'source_test/f1_macro': 0.7414709528287252, 'source_test/f1_micro': 0.7516560753186544, 'target_test/loss': 0.5539596676826477, 'target_test/accuracy': 0.7791138490041097, 'target_test/f1': 0.7801383137702942, 'target_test/f1_macro': 0.7672864000002543, 'target_test/f1_micro': 0.7791138490041097}, 'epoch_saved': {'source_test/loss': 0.8389455080032349, 'source_test/accuracy': 0.7573684652646383, 'source_test/f

In [8]:
print('dones')

dones


In [9]:
best_val_loss

inf