In [1]:

import torch
import gc
torch.cuda.empty_cache()
gc.collect()


0

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Initialize the console

# Step 2: Import necessary libraries
from typing import Optional, Dict, Any
import os
import torch
import pytorch_lightning as pl
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from rich.traceback import install
from pytorch_lightning.loggers import WandbLogger
import torch.nn as nn
import torch.optim as optim
from transformers import AutoConfig
from adapters import AutoAdapterModel, AdapterConfig
from adapters.composition import Stack
from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping
import torchmetrics

install(show_locals=True)

from setup import setup_src_path
print(setup_src_path())
import data.processed as processed
import config.config as config
import utils.setup as setup
import utils.functions as fn
from importlib import reload

from datasets import load_from_disk

print(config.Config.TXT_SAVE_PATH)
print(config.Config.MODEL_SAVE_PATH)

dataset = load_from_disk(f"../{config.Config.DATASETS_SAVE_PATH}/datasets")


['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages', '/tmp/tmp2kjt4lkb', '/home/guest/Desktop/projects/third-experiments/domain_adaptation_project/modules']


2024-09-26 00:39:03.277846: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-26 00:39:03.526472: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




./text-files/
./hp-model-


In [3]:
import torch
import os
import pytorch_lightning as pl
from transformers import  AutoConfig, DataCollatorForLanguageModeling
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import torchmetrics

class JointDomainTaskAdapter(pl.LightningModule):
    def __init__(self, hparams,source_dataset_length,target_dataset_length):
        super(JointDomainTaskAdapter, self).__init__()
        self.save_hyperparameters(hparams)

        # Load config with hidden states output
        self.config = AutoConfig.from_pretrained(self.hparams["pretrained_model_name"])
        self.config.output_hidden_states = True
        self.model = AutoAdapterModel.from_pretrained(self.hparams["pretrained_model_name"], config=self.config)

        # Set reduction factor and leave_out layers
        self.reduction_factor = self.hparams.get("reduction_factor", 16)
        self.leave_out = self.hparams.get("leave_out", [])
        # if self.leave_out != "None":
        #     self.leave_out = self.leave_out.split(",")
        #     self.leave_out = [int(i) for i in self.leave_out]
        # else:
        #     self.leave_out = []

        # Load MLM adapter with head
        self.model.load_adapter(f"{self.hparams['saved_adapter_dir']}/{self.hparams['domain_adapter_name']}", with_head=True)

        # Add classification head for the task
        self.model.add_classification_head(f"{self.hparams['task_adapter_name']}", num_labels=self.hparams["num_classes"])

        # Set active adapters
        self.model.train_adapter(self.hparams['domain_adapter_name'])
        # Calculate alpha based on dataset lengths
        self.alpha = source_dataset_length / (source_dataset_length + target_dataset_length)

        # Initialize loss functions and metrics
        self.criterion = nn.CrossEntropyLoss()
        self.mlm_criterion = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=self.hparams["num_classes"])
        self.f1 = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="weighted")
        self.f1_macro = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="macro")
        self.f1_micro = torchmetrics.F1Score(task='multiclass',num_classes=self.hparams["num_classes"], average="micro")

        self.softmax = nn.Softmax(dim=1)
        self.validation_outputs = []
        self.test_outputs = []
        # Optimizer related variables
        self.learning_rate = self.hparams.get("learning_rate", 1e-4)
        self.scheduler_factor = self.hparams.get("scheduler_factor", 0.1)
        self.scheduler_patience = self.hparams.get("scheduler_patience", 0.05)
        self.scheduler_threshold = self.hparams.get("scheduler_threshold", 0.0001)
        self.scheduler_cooldown = self.hparams.get("scheduler_cooldown", 0)
        self.scheduler_eps = self.hparams.get("scheduler_eps", 1e-8)

    def forward(self, input_ids, attention_mask=None, labels=None, task=None):
        if task == "mlm":
            self.model.active_head= self.hparams['domain_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        elif task == "classification":
            self.model.active_head= self.hparams['task_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        else:
            raise ValueError("Task must be either 'mlm' or 'classification'.")
        return outputs

    def training_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha

        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))

        metrics = {
            "train/accuracy": accuracy,
            "train/f1": f1,
            "train/taskclf_loss": task_loss,
            "train/loss": loss,
            "train/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return loss

    def validation_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha
        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        self.validation_outputs.append({
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
                })
        
        metrics = {
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return metrics
    def on_validation_epoch_start(self):
        self.validation_outputs = []
    
    def on_validation_epoch_end(self):
        outputs= self.validation_outputs

        avg_loss = torch.stack([x["val/loss"] for x in outputs]).mean()
        avg_task_loss = torch.stack([x["val/taskclf_loss"] for x in outputs]).mean()
        avg_mlm_loss = torch.stack([x["val/mlm_loss"] for x in outputs]).mean()
        avg_accuracy = torch.stack([x["val/accuracy"] for x in outputs]).mean()
        avg_f1 = torch.stack([x["val/f1"] for x in outputs]).mean()
        print(f"val/accuracy: {avg_accuracy}")
        print(f"val/f1: {avg_f1}")
        print(f"val/taskclf_loss: {avg_task_loss}")
        print(f"val/loss: {avg_loss}")
        print(f"val/mlm_loss: {avg_mlm_loss}")
        metrics = {
            "val/avg_loss": avg_loss,
            "val/avg_taskclf_loss": avg_task_loss,
            "val/avg_mlm_loss": avg_mlm_loss,
            "val/avg_accuracy": avg_accuracy,
            "val/avg_f1": avg_f1,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
        self.log("val_loss", avg_loss)

    def test_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        target_labels = batch["label_target"]

        # Classification task for source data
        cls_outputs_source = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits_source = cls_outputs_source.logits
        task_loss_source = self.criterion(cls_logits_source, source_labels)

        # Classification task for target data
        cls_outputs_target = self(input_ids=target_input_ids, attention_mask=target_attention_mask, task="classification")
        cls_logits_target = cls_outputs_target.logits
        task_loss_target = self.criterion(cls_logits_target, target_labels)

        # Combine losses (though typically you would evaluate them separately)
        loss = task_loss_source + task_loss_target

        accuracy_source = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_source = self.f1(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_macro_source = self.f1_macro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_micro_source = self.f1_micro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
    
        accuracy_target = self.accuracy(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_target = self.f1(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_macro_target = self.f1_macro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_micro_target = self.f1_micro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))

        metrics = {
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        self.test_outputs.append({
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        })
        return metrics
    def on_test_epoch_start(self):
        self.test_outputs = []

    def on_test_epoch_end(self):
        outputs=  self.test_outputs

        avg_loss_source = torch.stack([x["source_test/loss"] for x in outputs]).mean()
        avg_task_loss_target = torch.stack([x["target_test/loss"] for x in outputs]).mean()
        avg_accuracy_source = torch.stack([x["source_test/accuracy"] for x in outputs]).mean()
        avg_f1_source = torch.stack([x["source_test/f1"] for x in outputs]).mean()
        avg_f1_macro_source = torch.stack([x["source_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_source = torch.stack([x["source_test/f1_micro"] for x in outputs]).mean()

        avg_accuracy_target = torch.stack([x["target_test/accuracy"] for x in outputs]).mean()
        avg_f1_target = torch.stack([x["target_test/f1"] for x in outputs]).mean()
        avg_f1_macro_target = torch.stack([x["target_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_target = torch.stack([x["target_test/f1_micro"] for x in outputs]).mean()

        metrics = {
            "source_test/loss": avg_loss_source,
            "target_test/loss": avg_task_loss_target,
            "source_test/accuracy": avg_accuracy_source,
            "source_test/f1": avg_f1_source,
            "source_test/f1_macro": avg_f1_macro_source,
            "source_test/f1_micro": avg_f1_micro_source,
            "target_test/accuracy": avg_accuracy_target,
            "target_test/f1": avg_f1_target,
            "target_test/f1_macro": avg_f1_macro_target,
            "target_test/f1_micro": avg_f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
    def save_adapter(self, location, adapter_name):
        self.model.save_adapter(location, adapter_name)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams["learning_rate"])
        lr_scheduler = {
            'scheduler': optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.0001, cooldown=0, min_lr=1e-8),
            'monitor': 'val_loss'
        }
        return [optimizer], [lr_scheduler]


In [4]:
import wandb

wandb.login()
# Wandb setup and training loop
seeds = [42, 10, 100]  # List of seeds
project_name = 'mixed_edited'  # Replace with your wandb project name
domain = 'GTE'  # Replace with the specific domain for this notebook
type = 'unipelt'  # Replace with the specific type for this notebook

# Initialize results dictionary
results = {
    "last_epoch": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "best_model": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "epoch_saved": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    }
}

best_val_loss = float('inf')
best_model = None
best_model_path = ""

[34m[1mwandb[0m: Currently logged in as: [33mmrawhani5[0m ([33mmrawhani[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
reload(processed)
for seed in seeds:
    #wandb.init(project=project_name, name=f'{domain}_{type}_run_with_seed_{seed}', config={'seed': seed})

    try:
        seed_everything(seed)

        hparams = {
            "source_target": "government_slate",
            "source_domain": "government",
            "target_domain": "slate",
            "domain_adapter_name": "mlm_unipelt_S",
            "task_adapter_name": "GSPelt",
            "pretrained_model_name": "bert-base-uncased",
            "padding": "max_length",
            "max_seq_length": 128,
            "bsz": 32,
            "num_classes": 3,
            "learning_rate": 1e-4,
            "reduction_factor": 16,
            "mode": "domain",
            "saved_adapter_dir": "../../saved/adapters",
        }

        save_dir = "checkpoints"
        save_epoch_3 = 6  # Save model at the 3rd epoch
        
        dm = processed.DataModuleSourceTargetMixed(hparams)
        dm.setup('fit')
        dm.setup("test")
        source_length, target_length = dm.get_dataset_lengths()
        print(f"Source dataset length: {source_length}")
        print(f"Target dataset length: {target_length}")
        model = JointDomainTaskAdapter(hparams,source_length,target_length)

        checkpoint_callback = ModelCheckpoint(
            filename="task-GSPelt-{epoch:02d}-{val_loss:.2f}",
            save_top_k=1,
            monitor="val_loss",
            mode="min",
        )
        save_model_callback_epoch = ModelCheckpoint(
            filename="GSPelt-{epoch:02d}",
            every_n_epochs=save_epoch_3,
            save_top_k=-1,
        )

        #wandb_logger = WandbLogger()

    except Exception as e:
        print(f"Error during preprocessing : {e}")

    try:
        train_loader = dm.train_dataloader()
        val_loader = dm.val_dataloader()
        trainer = Trainer(
            max_epochs=10,
            accelerator="auto",
            precision=16,
            
            default_root_dir="checkpoints",
            #logger=wandb_logger,
            callbacks=[checkpoint_callback, save_model_callback_epoch],
            limit_train_batches=1.0,
            limit_val_batches=1.0,
            limit_test_batches=1.0,
        )

        trainer.fit(model, train_loader, val_loader)
        print(f"Best checkpoint path: {checkpoint_callback.best_model_path}")
        print(f"Saved epoch checkpoint path: {save_model_callback_epoch.best_model_path}")
    except Exception as e:
        print(f"Error during training : {e}")

    try:
        dm.setup("test")
        test_loader = dm.test_dataloader()
        test_results_last = trainer.test(model, test_loader)
        print("Test Results Last Epoch:", test_results_last)

        # Collect results for last epoch model
        for key, value in test_results_last[0].items():
            results["last_epoch"][key].append(value)

        # Paths to the saved checkpoints
        best_checkpoint_path = checkpoint_callback.best_model_path
        saved_epoch_checkpoint_path = save_model_callback_epoch.best_model_path
        print(f"Best checkpoint path: {best_checkpoint_path}")
        print(f"Saved epoch checkpoint path: {saved_epoch_checkpoint_path}")

        best_model = JointDomainTaskAdapter.load_from_checkpoint(best_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_best = trainer.test(best_model, test_loader)
        print("Test Results on Best Model:", test_results_best)
        for key, value in test_results_best[0].items():
            results["best_model"][key].append(value)

        saved_epoch_model = JointDomainTaskAdapter.load_from_checkpoint(saved_epoch_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_saved_epoch = trainer.test(saved_epoch_model, test_loader)
        print("Test Results on saved epoch:", test_results_saved_epoch)
        for key, value in test_results_saved_epoch[0].items():
            results["epoch_saved"][key].append(value)

    except Exception as e:
        print(f"Error during testing: {e}")

    #wandb.finish()

Seed set to 42




Batch size: 32


Source genre: government
Target genre: slate
Number of target samples: 69575


Source genre: government
Target genre: slate
Number of target samples: 69575


Source dataset length: 69615
Target dataset length: 21585


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/amp.py:55: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


Missing logger folder: checkpoints/lightning_logs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


eee



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
12.9 M    Trainable params
109 M     Non-trainable params
122 M     Total params
489.615   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.390625
val/f1: 0.5203136205673218
val/taskclf_loss: 1.099151611328125
val/loss: 1.282841682434082
val/mlm_loss: 1.8752707242965698


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.7953029274940491
val/f1: 0.7949532270431519
val/taskclf_loss: 0.5212091207504272
val/loss: 0.8324481844902039
val/mlm_loss: 1.8362427949905396


Validation: |                                                                                                 …

val/accuracy: 0.8087326884269714
val/f1: 0.808605432510376
val/taskclf_loss: 0.4889548718929291
val/loss: 0.8065679669380188
val/mlm_loss: 1.830919861793518


Validation: |                                                                                                 …

val/accuracy: 0.816868007183075
val/f1: 0.8168425559997559
val/taskclf_loss: 0.47342193126678467
val/loss: 0.7946759462356567
val/mlm_loss: 1.8307702541351318


Validation: |                                                                                                 …

val/accuracy: 0.8222915530204773
val/f1: 0.8219983577728271
val/taskclf_loss: 0.4773051142692566
val/loss: 0.7946895360946655
val/mlm_loss: 1.8183038234710693


Validation: |                                                                                                 …

val/accuracy: 0.8223420977592468
val/f1: 0.8219895362854004
val/taskclf_loss: 0.47403842210769653
val/loss: 0.7905158400535583
val/mlm_loss: 1.8112051486968994


Validation: |                                                                                                 …

val/accuracy: 0.8286190629005432
val/f1: 0.8279992938041687
val/taskclf_loss: 0.4646849036216736
val/loss: 0.7872755527496338
val/mlm_loss: 1.8276809453964233


Validation: |                                                                                                 …

val/accuracy: 0.829702615737915
val/f1: 0.8294882774353027
val/taskclf_loss: 0.4704993963241577
val/loss: 0.7893946170806885
val/mlm_loss: 1.8178815841674805


Validation: |                                                                                                 …

val/accuracy: 0.829056978225708
val/f1: 0.8286086916923523
val/taskclf_loss: 0.5009613633155823
val/loss: 0.8094516396522522
val/mlm_loss: 1.8043808937072754


Validation: |                                                                                                 …

val/accuracy: 0.8324143886566162
val/f1: 0.8320471048355103
val/taskclf_loss: 0.510357141494751
val/loss: 0.8201983571052551
val/mlm_loss: 1.8194847106933594


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.841324508190155
val/f1: 0.8411177396774292
val/taskclf_loss: 0.5005245208740234
val/loss: 0.8088973760604858
val/mlm_loss: 1.8034480810165405


Best checkpoint path: checkpoints/lightning_logs/version_0/checkpoints/task-GSPelt-epoch=05-val_loss=0.79.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_0/checkpoints/GSPelt-epoch=05.ckpt


Source genre: government


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Target genre: slate
Number of target samples: 69575


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.4814300537109375, 'source_test/accuracy': 0.8452620506286621, 'source_test/f1': 0.844123125076294, 'source_test/f1_macro': 0.8417439460754395, 'source_test/f1_micro': 0.8452620506286621, 'target_test/loss': 0.8109395503997803, 'target_test/accuracy': 0.7368951439857483, 'target_test/f1': 0.7363978624343872, 'target_test/f1_macro': 0.727328896522522, 'target_test/f1_micro': 0.7368951439857483}]
Best checkpoint path: checkpoints/lightning_logs/version_0/checkpoints/task-GSPelt-epoch=05-val_loss=0.79.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_0/checkpoints/GSPelt-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.4733338952064514, 'source_test/accuracy': 0.831149160861969, 'source_test/f1': 0.8298832774162292, 'source_test/f1_macro': 0.8269955515861511, 'source_test/f1_micro': 0.831149160861969, 'target_test/loss': 0.7398728132247925, 'target_test/accuracy': 0.7278225421905518, 'target_test/f1': 0.7266727089881897, 'target_test/f1_macro': 0.7192935943603516, 'target_test/f1_micro': 0.7278225421905518}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 10


Test Results on saved epoch: [{'source_test/loss': 0.4733338952064514, 'source_test/accuracy': 0.831149160861969, 'source_test/f1': 0.8298832774162292, 'source_test/f1_macro': 0.8269955515861511, 'source_test/f1_micro': 0.831149160861969, 'target_test/loss': 0.7398728132247925, 'target_test/accuracy': 0.7278225421905518, 'target_test/f1': 0.7266727089881897, 'target_test/f1_macro': 0.7192935943603516, 'target_test/f1_micro': 0.7278225421905518}]
Batch size: 32


Source genre: government
Target genre: slate
Number of target samples: 69575


Source genre: government


Target genre: slate
Number of target samples: 69575
Source dataset length: 69615
Target dataset length: 21585


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 16bit Automatic Mixed Precision (AMP)


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
12.9 M    Trainable params
109 M     Non-trainable params
122 M     Total params
489.615   Total estimated model params size (MB)


eee


Sanity Checking: |                                                                                            …

val/accuracy: 0.34375
val/f1: 0.4886627197265625
val/taskclf_loss: 1.1012039184570312
val/loss: 1.2863826751708984
val/mlm_loss: 1.883613109588623


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.7929279804229736
val/f1: 0.7927302718162537
val/taskclf_loss: 0.5245776772499084
val/loss: 0.8405449986457825
val/mlm_loss: 1.8595889806747437


Validation: |                                                                                                 …

val/accuracy: 0.8120900988578796
val/f1: 0.8118540644645691
val/taskclf_loss: 0.4818432927131653
val/loss: 0.7991549372673035
val/mlm_loss: 1.8225346803665161


Validation: |                                                                                                 …

val/accuracy: 0.8210002183914185
val/f1: 0.8207798004150391
val/taskclf_loss: 0.47093167901039124
val/loss: 0.7943932414054871
val/mlm_loss: 1.8376076221466064


Validation: |                                                                                                 …

val/accuracy: 0.8233246207237244
val/f1: 0.8230311870574951
val/taskclf_loss: 0.4664006531238556
val/loss: 0.7880595922470093
val/mlm_loss: 1.8254600763320923


Validation: |                                                                                                 …

val/accuracy: 0.8252615928649902
val/f1: 0.8248100876808167
val/taskclf_loss: 0.46048203110694885
val/loss: 0.7850956916809082
val/mlm_loss: 1.8320256471633911


Validation: |                                                                                                 …

val/accuracy: 0.8222915530204773
val/f1: 0.821616530418396
val/taskclf_loss: 0.4868651330471039
val/loss: 0.7988393306732178
val/mlm_loss: 1.8050047159194946


Validation: |                                                                                                 …

val/accuracy: 0.8274568319320679
val/f1: 0.8268787264823914
val/taskclf_loss: 0.49015507102012634
val/loss: 0.80314040184021
val/mlm_loss: 1.8125669956207275


Validation: |                                                                                                 …

val/accuracy: 0.8266820907592773
val/f1: 0.8261536955833435
val/taskclf_loss: 0.49987465143203735
val/loss: 0.8120371699333191
val/mlm_loss: 1.818810224533081


Validation: |                                                                                                 …

val/accuracy: 0.8385116457939148
val/f1: 0.838253915309906
val/taskclf_loss: 0.48924267292022705
val/loss: 0.8045188784599304
val/mlm_loss: 1.8213341236114502


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.8386408090591431
val/f1: 0.838408887386322
val/taskclf_loss: 0.5003543496131897
val/loss: 0.8077865242958069
val/mlm_loss: 1.7993029356002808


Best checkpoint path: checkpoints/lightning_logs/version_1/checkpoints/task-GSPelt-epoch=04-val_loss=0.79.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_1/checkpoints/GSPelt-epoch=05.ckpt


Source genre: government


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Target genre: slate
Number of target samples: 69575


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.48890504240989685, 'source_test/accuracy': 0.8452620506286621, 'source_test/f1': 0.8442047834396362, 'source_test/f1_macro': 0.8407618403434753, 'source_test/f1_micro': 0.8452620506286621, 'target_test/loss': 0.7855111360549927, 'target_test/accuracy': 0.7479838728904724, 'target_test/f1': 0.7477769255638123, 'target_test/f1_macro': 0.7371618151664734, 'target_test/f1_micro': 0.7479838728904724}]
Best checkpoint path: checkpoints/lightning_logs/version_1/checkpoints/task-GSPelt-epoch=04-val_loss=0.79.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_1/checkpoints/GSPelt-epoch=05.ckpt


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.4599556028842926, 'source_test/accuracy': 0.8296370506286621, 'source_test/f1': 0.8284703493118286, 'source_test/f1_macro': 0.8243982791900635, 'source_test/f1_micro': 0.8296370506286621, 'target_test/loss': 0.6998555660247803, 'target_test/accuracy': 0.7328628897666931, 'target_test/f1': 0.7330861687660217, 'target_test/f1_macro': 0.7235634326934814, 'target_test/f1_micro': 0.7328628897666931}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 100


Test Results on saved epoch: [{'source_test/loss': 0.48537078499794006, 'source_test/accuracy': 0.8301411271095276, 'source_test/f1': 0.8284362554550171, 'source_test/f1_macro': 0.8260228037834167, 'source_test/f1_micro': 0.8301411271095276, 'target_test/loss': 0.7539593577384949, 'target_test/accuracy': 0.7197580337524414, 'target_test/f1': 0.7201035618782043, 'target_test/f1_macro': 0.7112918496131897, 'target_test/f1_micro': 0.7197580337524414}]
Batch size: 32


Source genre: government


Target genre: slate
Number of target samples: 69575


Source genre: government
Target genre: slate
Number of target samples: 69575
Source dataset length: 69615
Target dataset length: 21585


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 16bit Automatic Mixed Precision (AMP)


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
12.9 M    Trainable params
109 M     Non-trainable params
122 M     Total params
489.615   Total estimated model params size (MB)


eee


Sanity Checking: |                                                                                            …

val/accuracy: 0.375
val/f1: 0.4402129054069519
val/taskclf_loss: 1.096038818359375
val/loss: 1.2898112535476685
val/mlm_loss: 1.9147578477859497


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.7933659553527832
val/f1: 0.793043315410614
val/taskclf_loss: 0.5242102146148682
val/loss: 0.8364081382751465
val/mlm_loss: 1.8432954549789429


Validation: |                                                                                                 …

val/accuracy: 0.8122192621231079
val/f1: 0.8119698166847229
val/taskclf_loss: 0.48063305020332336
val/loss: 0.8021019101142883
val/mlm_loss: 1.8388895988464355


Validation: |                                                                                                 …

val/accuracy: 0.8109279274940491
val/f1: 0.8104115724563599
val/taskclf_loss: 0.4870326519012451
val/loss: 0.8051393032073975
val/mlm_loss: 1.8310831785202026


Validation: |                                                                                                 …

val/accuracy: 0.8304268717765808
val/f1: 0.8300024271011353
val/taskclf_loss: 0.46278101205825806
val/loss: 0.7818979024887085
val/mlm_loss: 1.8111000061035156


Validation: |                                                                                                 …

val/accuracy: 0.8162223696708679
val/f1: 0.8157682418823242
val/taskclf_loss: 0.49255067110061646
val/loss: 0.8044144511222839
val/mlm_loss: 1.8102236986160278


Validation: |                                                                                                 …

val/accuracy: 0.8300899863243103
val/f1: 0.8299031853675842
val/taskclf_loss: 0.48204585909843445
val/loss: 0.797220766544342
val/mlm_loss: 1.813709020614624


Validation: |                                                                                                 …

val/accuracy: 0.8195797801017761
val/f1: 0.8187251091003418
val/taskclf_loss: 0.4946114122867584
val/loss: 0.8071821928024292
val/mlm_loss: 1.8152719736099243


Validation: |                                                                                                 …

val/accuracy: 0.8346377015113831
val/f1: 0.8339921832084656
val/taskclf_loss: 0.47263872623443604
val/loss: 0.787080705165863
val/mlm_loss: 1.8012051582336426


Validation: |                                                                                                 …

val/accuracy: 0.8343794345855713
val/f1: 0.83389812707901
val/taskclf_loss: 0.47677314281463623
val/loss: 0.7969892024993896
val/mlm_loss: 1.8297359943389893


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.8355416059494019
val/f1: 0.8350979685783386
val/taskclf_loss: 0.47703975439071655
val/loss: 0.7907185554504395
val/mlm_loss: 1.8023817539215088


Best checkpoint path: checkpoints/lightning_logs/version_2/checkpoints/task-GSPelt-epoch=03-val_loss=0.78.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_2/checkpoints/GSPelt-epoch=05.ckpt


Source genre: government
Target genre: slate
Number of target samples: 69575


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.4866653382778168, 'source_test/accuracy': 0.8351814150810242, 'source_test/f1': 0.8342761397361755, 'source_test/f1_macro': 0.8307391405105591, 'source_test/f1_micro': 0.8351814150810242, 'target_test/loss': 0.7541588544845581, 'target_test/accuracy': 0.7379031777381897, 'target_test/f1': 0.7378646731376648, 'target_test/f1_macro': 0.7275900840759277, 'target_test/f1_micro': 0.7379031777381897}]
Best checkpoint path: checkpoints/lightning_logs/version_2/checkpoints/task-GSPelt-epoch=03-val_loss=0.78.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_2/checkpoints/GSPelt-epoch=05.ckpt


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.4621727168560028, 'source_test/accuracy': 0.8200604319572449, 'source_test/f1': 0.8191884756088257, 'source_test/f1_macro': 0.8150218725204468, 'source_test/f1_micro': 0.8200604319572449, 'target_test/loss': 0.701334536075592, 'target_test/accuracy': 0.7293346524238586, 'target_test/f1': 0.7295129299163818, 'target_test/f1_macro': 0.719728410243988, 'target_test/f1_micro': 0.7293346524238586}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on saved epoch: [{'source_test/loss': 0.481741338968277, 'source_test/accuracy': 0.8220765590667725, 'source_test/f1': 0.8208621740341187, 'source_test/f1_macro': 0.8173877000808716, 'source_test/f1_micro': 0.8220765590667725, 'target_test/loss': 0.7322108149528503, 'target_test/accuracy': 0.7313507795333862, 'target_test/f1': 0.7320983409881592, 'target_test/f1_macro': 0.7210254669189453, 'target_test/f1_micro': 0.7313507795333862}]


In [6]:
results.items()

dict_items([('last_epoch', {'source_test/loss': [0.4814300537109375, 0.48890504240989685, 0.4866653382778168], 'source_test/accuracy': [0.8452620506286621, 0.8452620506286621, 0.8351814150810242], 'source_test/f1': [0.844123125076294, 0.8442047834396362, 0.8342761397361755], 'source_test/f1_macro': [0.8417439460754395, 0.8407618403434753, 0.8307391405105591], 'source_test/f1_micro': [0.8452620506286621, 0.8452620506286621, 0.8351814150810242], 'target_test/loss': [0.8109395503997803, 0.7855111360549927, 0.7541588544845581], 'target_test/accuracy': [0.7368951439857483, 0.7479838728904724, 0.7379031777381897], 'target_test/f1': [0.7363978624343872, 0.7477769255638123, 0.7378646731376648], 'target_test/f1_macro': [0.727328896522522, 0.7371618151664734, 0.7275900840759277], 'target_test/f1_micro': [0.7368951439857483, 0.7479838728904724, 0.7379031777381897]}), ('best_model', {'source_test/loss': [0.4733338952064514, 0.4599556028842926, 0.4621727168560028], 'source_test/accuracy': [0.831149

In [7]:
# Calculate mean and standard deviation for each scenario
mean_results = {scenario: {key: np.mean(values) for key, values in metrics.items()} for scenario, metrics in results.items()}
std_results = {scenario: {key: np.std(values) for key, values in metrics.items()} for scenario, metrics in results.items()}

# Log mean and standard deviation results to wandb
# wandb.init(project=project_name, name=f'{domain}_mean_results')
# for scenario in mean_results:
#     for key, value in mean_results[scenario].items():
#         wandb.log({f"{scenario}/{key}": value})
#         wandb.log({f"{scenario}/{key}_std": std_results[scenario][key]})
# wandb.finish()

print("Mean Results:", mean_results)
print("Standard Deviation Results:", std_results)

# # Save the best model's adapter
# if model:
#     adapter_save_path = f"../../saved/adapter_after_run/{hparams['task_adapter_name']}"
#     model.save_adapter(adapter_save_path, hparams['task_adapter_name'])
#     print(f"Adapter saved to {adapter_save_path}")
# else:
#     print("No best model to save.")

Mean Results: {'last_epoch': {'source_test/loss': 0.48566681146621704, 'source_test/accuracy': 0.8419018387794495, 'source_test/f1': 0.8408680160840353, 'source_test/f1_macro': 0.8377483089764913, 'source_test/f1_micro': 0.8419018387794495, 'target_test/loss': 0.7835365136464437, 'target_test/accuracy': 0.7409273982048035, 'target_test/f1': 0.7406798203786215, 'target_test/f1_macro': 0.7306935985883077, 'target_test/f1_micro': 0.7409273982048035}, 'best_model': {'source_test/loss': 0.4651540716489156, 'source_test/accuracy': 0.826948881149292, 'source_test/f1': 0.8258473674456278, 'source_test/f1_macro': 0.8221385677655538, 'source_test/f1_micro': 0.826948881149292, 'target_test/loss': 0.7136876384417216, 'target_test/accuracy': 0.7300066947937012, 'target_test/f1': 0.7297572692235311, 'target_test/f1_macro': 0.720861812432607, 'target_test/f1_micro': 0.7300066947937012}, 'epoch_saved': {'source_test/loss': 0.48014867305755615, 'source_test/accuracy': 0.8277889490127563, 'source_test/f

In [8]:
print('dones')

dones


In [9]:
best_val_loss

inf