In [1]:

import torch
import gc
torch.cuda.empty_cache()
gc.collect()


0

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Initialize the console

# Step 2: Import necessary libraries
from typing import Optional, Dict, Any
import os
import torch
import pytorch_lightning as pl
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from rich.traceback import install
from pytorch_lightning.loggers import WandbLogger
import torch.nn as nn
import torch.optim as optim
from transformers import AutoConfig
from adapters import AutoAdapterModel, AdapterConfig
from adapters.composition import Stack
from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping
import torchmetrics

install(show_locals=True)

from setup import setup_src_path
print(setup_src_path())
import data.processed as processed
import config.config as config
import utils.setup as setup
import utils.functions as fn
from importlib import reload

from datasets import load_from_disk

print(config.Config.TXT_SAVE_PATH)
print(config.Config.MODEL_SAVE_PATH)

dataset = load_from_disk(f"../{config.Config.DATASETS_SAVE_PATH}/datasets")


['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages', '/tmp/tmp4c7fqwji', '/home/guest/Desktop/projects/third-experiments/domain_adaptation_project/modules']


2024-09-23 08:48:19.603391: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-23 08:48:19.635806: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




./text-files/
./hp-model-


In [3]:
import torch
import os
import pytorch_lightning as pl
from transformers import  AutoConfig, DataCollatorForLanguageModeling
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import torchmetrics

class JointDomainTaskAdapter(pl.LightningModule):
    def __init__(self, hparams,source_dataset_length,target_dataset_length):
        super(JointDomainTaskAdapter, self).__init__()
        self.save_hyperparameters(hparams)

        # Load config with hidden states output
        self.config = AutoConfig.from_pretrained(self.hparams["pretrained_model_name"])
        self.config.output_hidden_states = True
        self.model = AutoAdapterModel.from_pretrained(self.hparams["pretrained_model_name"], config=self.config)

        # Set reduction factor and leave_out layers
        self.reduction_factor = self.hparams.get("reduction_factor", 16)
        self.leave_out = self.hparams.get("leave_out", [])
        # if self.leave_out != "None":
        #     self.leave_out = self.leave_out.split(",")
        #     self.leave_out = [int(i) for i in self.leave_out]
        # else:
        #     self.leave_out = []

        # Load MLM adapter with head
        self.model.load_adapter(f"{self.hparams['saved_adapter_dir']}/{self.hparams['domain_adapter_name']}", with_head=True)

        # Add classification head for the task
        self.model.add_classification_head(f"{self.hparams['task_adapter_name']}", num_labels=self.hparams["num_classes"])

        # Set active adapters
        self.model.train_adapter(self.hparams['domain_adapter_name'])
        # Calculate alpha based on dataset lengths
        self.alpha = source_dataset_length / (source_dataset_length + target_dataset_length)

        # Initialize loss functions and metrics
        self.criterion = nn.CrossEntropyLoss()
        self.mlm_criterion = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=self.hparams["num_classes"])
        self.f1 = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="weighted")
        self.f1_macro = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="macro")
        self.f1_micro = torchmetrics.F1Score(task='multiclass',num_classes=self.hparams["num_classes"], average="micro")

        self.softmax = nn.Softmax(dim=1)
        self.validation_outputs = []
        self.test_outputs = []
        # Optimizer related variables
        self.learning_rate = self.hparams.get("learning_rate", 1e-4)
        self.scheduler_factor = self.hparams.get("scheduler_factor", 0.1)
        self.scheduler_patience = self.hparams.get("scheduler_patience", 0.05)
        self.scheduler_threshold = self.hparams.get("scheduler_threshold", 0.0001)
        self.scheduler_cooldown = self.hparams.get("scheduler_cooldown", 0)
        self.scheduler_eps = self.hparams.get("scheduler_eps", 1e-8)

    def forward(self, input_ids, attention_mask=None, labels=None, task=None):
        if task == "mlm":
            self.model.active_head= self.hparams['domain_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        elif task == "classification":
            self.model.active_head= self.hparams['task_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        else:
            raise ValueError("Task must be either 'mlm' or 'classification'.")
        return outputs

    def training_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha

        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))

        metrics = {
            "train/accuracy": accuracy,
            "train/f1": f1,
            "train/taskclf_loss": task_loss,
            "train/loss": loss,
            "train/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return loss

    def validation_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha
        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        self.validation_outputs.append({
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
                })
        
        metrics = {
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return metrics
    def on_validation_epoch_start(self):
        self.validation_outputs = []
    
    def on_validation_epoch_end(self):
        outputs= self.validation_outputs

        avg_loss = torch.stack([x["val/loss"] for x in outputs]).mean()
        avg_task_loss = torch.stack([x["val/taskclf_loss"] for x in outputs]).mean()
        avg_mlm_loss = torch.stack([x["val/mlm_loss"] for x in outputs]).mean()
        avg_accuracy = torch.stack([x["val/accuracy"] for x in outputs]).mean()
        avg_f1 = torch.stack([x["val/f1"] for x in outputs]).mean()
        print(f"val/accuracy: {avg_accuracy}")
        print(f"val/f1: {avg_f1}")
        print(f"val/taskclf_loss: {avg_task_loss}")
        print(f"val/loss: {avg_loss}")
        print(f"val/mlm_loss: {avg_mlm_loss}")
        metrics = {
            "val/avg_loss": avg_loss,
            "val/avg_taskclf_loss": avg_task_loss,
            "val/avg_mlm_loss": avg_mlm_loss,
            "val/avg_accuracy": avg_accuracy,
            "val/avg_f1": avg_f1,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
        self.log("val_loss", avg_loss)

    def test_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        target_labels = batch["label_target"]

        # Classification task for source data
        cls_outputs_source = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits_source = cls_outputs_source.logits
        task_loss_source = self.criterion(cls_logits_source, source_labels)

        # Classification task for target data
        cls_outputs_target = self(input_ids=target_input_ids, attention_mask=target_attention_mask, task="classification")
        cls_logits_target = cls_outputs_target.logits
        task_loss_target = self.criterion(cls_logits_target, target_labels)

        # Combine losses (though typically you would evaluate them separately)
        loss = task_loss_source + task_loss_target

        accuracy_source = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_source = self.f1(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_macro_source = self.f1_macro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_micro_source = self.f1_micro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
    
        accuracy_target = self.accuracy(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_target = self.f1(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_macro_target = self.f1_macro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_micro_target = self.f1_micro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))

        metrics = {
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        self.test_outputs.append({
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        })
        return metrics
    def on_test_epoch_start(self):
        self.test_outputs = []

    def on_test_epoch_end(self):
        outputs=  self.test_outputs

        avg_loss_source = torch.stack([x["source_test/loss"] for x in outputs]).mean()
        avg_task_loss_target = torch.stack([x["target_test/loss"] for x in outputs]).mean()
        avg_accuracy_source = torch.stack([x["source_test/accuracy"] for x in outputs]).mean()
        avg_f1_source = torch.stack([x["source_test/f1"] for x in outputs]).mean()
        avg_f1_macro_source = torch.stack([x["source_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_source = torch.stack([x["source_test/f1_micro"] for x in outputs]).mean()

        avg_accuracy_target = torch.stack([x["target_test/accuracy"] for x in outputs]).mean()
        avg_f1_target = torch.stack([x["target_test/f1"] for x in outputs]).mean()
        avg_f1_macro_target = torch.stack([x["target_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_target = torch.stack([x["target_test/f1_micro"] for x in outputs]).mean()

        metrics = {
            "source_test/loss": avg_loss_source,
            "target_test/loss": avg_task_loss_target,
            "source_test/accuracy": avg_accuracy_source,
            "source_test/f1": avg_f1_source,
            "source_test/f1_macro": avg_f1_macro_source,
            "source_test/f1_micro": avg_f1_micro_source,
            "target_test/accuracy": avg_accuracy_target,
            "target_test/f1": avg_f1_target,
            "target_test/f1_macro": avg_f1_macro_target,
            "target_test/f1_micro": avg_f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
    def save_adapter(self, location, adapter_name):
        self.model.save_adapter(location, adapter_name)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams["learning_rate"])
        lr_scheduler = {
            'scheduler': optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.0001, cooldown=0, min_lr=1e-8),
            'monitor': 'val_loss'
        }
        return [optimizer], [lr_scheduler]


In [4]:
import wandb

wandb.login()
# Wandb setup and training loop
seeds = [42, 10, 100]  # List of seeds
project_name = 'mixed_edited'  # Replace with your wandb project name
domain = 'SF'  # Replace with the specific domain for this notebook
type = 'unipelt'  # Replace with the specific type for this notebook

# Initialize results dictionary
results = {
    "last_epoch": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "best_model": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "epoch_saved": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    }
}

best_val_loss = float('inf')
best_model = None
best_model_path = ""

[34m[1mwandb[0m: Currently logged in as: [33mmrawhani5[0m ([33mmrawhani[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
reload(processed)
for seed in seeds:
    #wandb.init(project=project_name, name=f'{domain}_{type}_run_with_seed_{seed}', config={'seed': seed})

    try:
        seed_everything(seed)

        hparams = {
            "source_target": "slate_fiction",
            "source_domain": "slate",
            "target_domain": "fiction",
            "domain_adapter_name": "mlm_unipelt_F",
            "task_adapter_name": "SFPelt",
            "pretrained_model_name": "bert-base-uncased",
            "padding": "max_length",
            "max_seq_length": 128,
            "bsz": 32,
            "num_classes": 3,
            "learning_rate": 1e-4,
            "reduction_factor": 16,
            "mode": "domain",
            "saved_adapter_dir": "../../saved/adapters",
        }

        save_dir = "checkpoints"
        save_epoch_3 = 6  # Save model at the 3rd epoch
        
        dm = processed.DataModuleSourceTargetMixed(hparams)
        dm.setup('fit')
        dm.setup("test")
        source_length, target_length = dm.get_dataset_lengths()
        print(f"Source dataset length: {source_length}")
        print(f"Target dataset length: {target_length}")
        model = JointDomainTaskAdapter(hparams,source_length,target_length)

        checkpoint_callback = ModelCheckpoint(
            filename="task-SFPelt-{epoch:02d}-{val_loss:.2f}",
            save_top_k=1,
            monitor="val_loss",
            mode="min",
        )
        save_model_callback_epoch = ModelCheckpoint(
            filename="SFPelt-{epoch:02d}",
            every_n_epochs=save_epoch_3,
            save_top_k=-1,
        )

        #wandb_logger = WandbLogger()

    except Exception as e:
        print(f"Error during preprocessing : {e}")

    try:
        train_loader = dm.train_dataloader()
        val_loader = dm.val_dataloader()
        trainer = Trainer(
            max_epochs=10,
            accelerator="auto",
            precision=16,
            
            default_root_dir="checkpoints",
            #logger=wandb_logger,
            callbacks=[checkpoint_callback, save_model_callback_epoch],
            limit_train_batches=1.0,
            limit_val_batches=1.0,
            limit_test_batches=1.0,
        )

        trainer.fit(model, train_loader, val_loader)
        print(f"Best checkpoint path: {checkpoint_callback.best_model_path}")
        print(f"Saved epoch checkpoint path: {save_model_callback_epoch.best_model_path}")
    except Exception as e:
        print(f"Error during training : {e}")

    try:
        dm.setup("test")
        test_loader = dm.test_dataloader()
        test_results_last = trainer.test(model, test_loader)
        print("Test Results Last Epoch:", test_results_last)

        # Collect results for last epoch model
        for key, value in test_results_last[0].items():
            results["last_epoch"][key].append(value)

        # Paths to the saved checkpoints
        best_checkpoint_path = checkpoint_callback.best_model_path
        saved_epoch_checkpoint_path = save_model_callback_epoch.best_model_path
        print(f"Best checkpoint path: {best_checkpoint_path}")
        print(f"Saved epoch checkpoint path: {saved_epoch_checkpoint_path}")

        best_model = JointDomainTaskAdapter.load_from_checkpoint(best_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_best = trainer.test(best_model, test_loader)
        print("Test Results on Best Model:", test_results_best)
        for key, value in test_results_best[0].items():
            results["best_model"][key].append(value)

        saved_epoch_model = JointDomainTaskAdapter.load_from_checkpoint(saved_epoch_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_saved_epoch = trainer.test(saved_epoch_model, test_loader)
        print("Test Results on saved epoch:", test_results_saved_epoch)
        for key, value in test_results_saved_epoch[0].items():
            results["epoch_saved"][key].append(value)

    except Exception as e:
        print(f"Error during testing: {e}")

    #wandb.finish()

Seed set to 42




Batch size: 32


Source genre: slate
Target genre: fiction
Number of target samples: 69613


Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9815 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9815 [00:00<?, ? examples/s]

Source genre: slate
Target genre: fiction
Number of target samples: 69613


Map:   0%|          | 0/69613 [00:00<?, ? examples/s]

Map:   0%|          | 0/69613 [00:00<?, ? examples/s]

Map:   0%|          | 0/7735 [00:00<?, ? examples/s]

Map:   0%|          | 0/7735 [00:00<?, ? examples/s]

Source dataset length: 69575
Target dataset length: 15922


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/amp.py:55: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


Missing logger folder: checkpoints/lightning_logs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


eee



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
12.9 M    Trainable params
109 M     Non-trainable params
122 M     Total params
489.615   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.484375
val/f1: 0.5604234337806702
val/taskclf_loss: 1.0936203002929688
val/loss: 1.1874890327453613
val/mlm_loss: 1.5976710319519043


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.711423397064209
val/f1: 0.7114941477775574
val/taskclf_loss: 0.675859808921814
val/loss: 0.8453667759895325
val/mlm_loss: 1.5860681533813477


Validation: |                                                                                                 …

val/accuracy: 0.7402198910713196
val/f1: 0.7406695485115051
val/taskclf_loss: 0.6269205808639526
val/loss: 0.8049742579460144
val/mlm_loss: 1.5830225944519043


Validation: |                                                                                                 …

val/accuracy: 0.7491708397865295
val/f1: 0.7502143383026123
val/taskclf_loss: 0.6105391979217529
val/loss: 0.7915268540382385
val/mlm_loss: 1.5823956727981567


Validation: |                                                                                                 …

val/accuracy: 0.7632462382316589
val/f1: 0.7638319730758667
val/taskclf_loss: 0.5964627861976624
val/loss: 0.7812163829803467
val/mlm_loss: 1.5885415077209473


Validation: |                                                                                                 …

val/accuracy: 0.7573944926261902
val/f1: 0.7574587464332581
val/taskclf_loss: 0.6116183996200562
val/loss: 0.7893868088722229
val/mlm_loss: 1.5661885738372803


Validation: |                                                                                                 …

val/accuracy: 0.7611801028251648
val/f1: 0.7621256113052368
val/taskclf_loss: 0.6057575345039368
val/loss: 0.7859899997711182
val/mlm_loss: 1.5735591650009155


Validation: |                                                                                                 …

val/accuracy: 0.7572653293609619
val/f1: 0.7580931782722473
val/taskclf_loss: 0.6319202780723572
val/loss: 0.8033647537231445
val/mlm_loss: 1.552532434463501


Validation: |                                                                                                 …

val/accuracy: 0.7672968506813049
val/f1: 0.7672949433326721
val/taskclf_loss: 0.6508815288543701
val/loss: 0.8166704177856445
val/mlm_loss: 1.5411245822906494


Validation: |                                                                                                 …

val/accuracy: 0.7665221095085144
val/f1: 0.7667669653892517
val/taskclf_loss: 0.6561902761459351
val/loss: 0.8214271068572998
val/mlm_loss: 1.5434690713882446


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.7678133845329285
val/f1: 0.7681415677070618
val/taskclf_loss: 0.6638045907020569
val/loss: 0.8296457529067993
val/mlm_loss: 1.5543285608291626


Best checkpoint path: checkpoints/lightning_logs/version_0/checkpoints/task-SFPelt-epoch=03-val_loss=0.78.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_0/checkpoints/SFPelt-epoch=05.ckpt


Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9815 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9815 [00:00<?, ? examples/s]

Source genre: slate
Target genre: fiction
Number of target samples: 69613


Map:   0%|          | 0/69613 [00:00<?, ? examples/s]

Map:   0%|          | 0/69613 [00:00<?, ? examples/s]

Map:   0%|          | 0/7735 [00:00<?, ? examples/s]

Map:   0%|          | 0/7735 [00:00<?, ? examples/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.7030393481254578, 'source_test/accuracy': 0.750912070274353, 'source_test/f1': 0.7503563165664673, 'source_test/f1_macro': 0.7413011193275452, 'source_test/f1_micro': 0.750912070274353, 'target_test/loss': 0.6300202012062073, 'target_test/accuracy': 0.7741455435752869, 'target_test/f1': 0.773404598236084, 'target_test/f1_macro': 0.7648960947990417, 'target_test/f1_micro': 0.7741455435752869}]
Best checkpoint path: checkpoints/lightning_logs/version_0/checkpoints/task-SFPelt-epoch=03-val_loss=0.78.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_0/checkpoints/SFPelt-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.6318674087524414, 'source_test/accuracy': 0.7423675060272217, 'source_test/f1': 0.7424419522285461, 'source_test/f1_macro': 0.7305735349655151, 'source_test/f1_micro': 0.7423675060272217, 'target_test/loss': 0.5594415664672852, 'target_test/accuracy': 0.776401698589325, 'target_test/f1': 0.7763603925704956, 'target_test/f1_macro': 0.7657997012138367, 'target_test/f1_micro': 0.776401698589325}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 10


Test Results on saved epoch: [{'source_test/loss': 0.632343053817749, 'source_test/accuracy': 0.7451276779174805, 'source_test/f1': 0.7459507584571838, 'source_test/f1_macro': 0.7352344393730164, 'source_test/f1_micro': 0.7451276779174805, 'target_test/loss': 0.5544002652168274, 'target_test/accuracy': 0.7829540967941284, 'target_test/f1': 0.7834953665733337, 'target_test/f1_macro': 0.7726526856422424, 'target_test/f1_micro': 0.7829540967941284}]
Batch size: 32


Source genre: slate
Target genre: fiction
Number of target samples: 69613


Source genre: slate
Target genre: fiction
Number of target samples: 69613


Source dataset length: 69575
Target dataset length: 15922


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 16bit Automatic Mixed Precision (AMP)


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
12.9 M    Trainable params
109 M     Non-trainable params
122 M     Total params
489.615   Total estimated model params size (MB)


eee


Sanity Checking: |                                                                                            …

val/accuracy: 0.4375
val/f1: 0.5116515159606934
val/taskclf_loss: 1.0921783447265625
val/loss: 1.2338889837265015
val/mlm_loss: 1.8531275987625122


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.7115117311477661
val/f1: 0.7120540738105774
val/taskclf_loss: 0.6863815784454346
val/loss: 0.8580490350723267
val/mlm_loss: 1.6081912517547607


Validation: |                                                                                                 …

val/accuracy: 0.739057719707489
val/f1: 0.7389055490493774
val/taskclf_loss: 0.6349140405654907
val/loss: 0.8148931860923767
val/mlm_loss: 1.6013550758361816


Validation: |                                                                                                 …

val/accuracy: 0.7511553764343262
val/f1: 0.751651406288147
val/taskclf_loss: 0.607471764087677
val/loss: 0.787675678730011
val/mlm_loss: 1.5751196146011353


Validation: |                                                                                                 …

val/accuracy: 0.759331464767456
val/f1: 0.760220468044281
val/taskclf_loss: 0.6005590558052063
val/loss: 0.7815935611724854
val/mlm_loss: 1.5726675987243652


Validation: |                                                                                                 …

val/accuracy: 0.7602354288101196
val/f1: 0.761268138885498
val/taskclf_loss: 0.5908769369125366
val/loss: 0.7728375792503357
val/mlm_loss: 1.5679582357406616


Validation: |                                                                                                 …

val/accuracy: 0.762301504611969
val/f1: 0.7631773352622986
val/taskclf_loss: 0.5980818271636963
val/loss: 0.7797850370407104
val/mlm_loss: 1.57378089427948


Validation: |                                                                                                 …

val/accuracy: 0.7673376798629761
val/f1: 0.7678501009941101
val/taskclf_loss: 0.6088904738426208
val/loss: 0.786710798740387
val/mlm_loss: 1.5637391805648804


Validation: |                                                                                                 …

val/accuracy: 0.7641977071762085
val/f1: 0.7647616267204285
val/taskclf_loss: 0.6187922954559326
val/loss: 0.7927384972572327
val/mlm_loss: 1.55283784866333


Validation: |                                                                                                 …

val/accuracy: 0.7744399309158325
val/f1: 0.7750645875930786
val/taskclf_loss: 0.6458688378334045
val/loss: 0.8134634494781494
val/mlm_loss: 1.5458086729049683


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.7715990543365479
val/f1: 0.7721971273422241
val/taskclf_loss: 0.661737859249115
val/loss: 0.8250083923339844
val/mlm_loss: 1.5384583473205566


Best checkpoint path: checkpoints/lightning_logs/version_1/checkpoints/task-SFPelt-epoch=04-val_loss=0.77.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_1/checkpoints/SFPelt-epoch=05.ckpt


Source genre: slate
Target genre: fiction
Number of target samples: 69613


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.7095915079116821, 'source_test/accuracy': 0.7622887492179871, 'source_test/f1': 0.7625941038131714, 'source_test/f1_macro': 0.7515682578086853, 'source_test/f1_micro': 0.7622887492179871, 'target_test/loss': 0.6588518619537354, 'target_test/accuracy': 0.7688412070274353, 'target_test/f1': 0.7686322927474976, 'target_test/f1_macro': 0.7573198676109314, 'target_test/f1_micro': 0.7688412070274353}]
Best checkpoint path: checkpoints/lightning_logs/version_1/checkpoints/task-SFPelt-epoch=04-val_loss=0.77.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_1/checkpoints/SFPelt-epoch=05.ckpt


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.6290572285652161, 'source_test/accuracy': 0.7491839528083801, 'source_test/f1': 0.7504197359085083, 'source_test/f1_macro': 0.7377742528915405, 'source_test/f1_micro': 0.7491839528083801, 'target_test/loss': 0.565384566783905, 'target_test/accuracy': 0.7786818146705627, 'target_test/f1': 0.7794938683509827, 'target_test/f1_macro': 0.7663156390190125, 'target_test/f1_micro': 0.7786818146705627}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 100


Test Results on saved epoch: [{'source_test/loss': 0.626194179058075, 'source_test/accuracy': 0.7562403678894043, 'source_test/f1': 0.7572768330574036, 'source_test/f1_macro': 0.743891179561615, 'source_test/f1_micro': 0.7562403678894043, 'target_test/loss': 0.5616516470909119, 'target_test/accuracy': 0.7769057154655457, 'target_test/f1': 0.7771899104118347, 'target_test/f1_macro': 0.7660770416259766, 'target_test/f1_micro': 0.7769057154655457}]
Batch size: 32


Source genre: slate


Target genre: fiction
Number of target samples: 69613


Source genre: slate
Target genre: fiction
Number of target samples: 69613
Source dataset length: 69575
Target dataset length: 15922


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 16bit Automatic Mixed Precision (AMP)


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
12.9 M    Trainable params
109 M     Non-trainable params
122 M     Total params
489.615   Total estimated model params size (MB)


eee


Sanity Checking: |                                                                                            …

val/accuracy: 0.359375
val/f1: 0.3647216558456421
val/taskclf_loss: 1.0975494384765625
val/loss: 1.2002050876617432
val/mlm_loss: 1.6487836837768555


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.7136662602424622
val/f1: 0.7145809531211853
val/taskclf_loss: 0.6766363978385925
val/loss: 0.8485001921653748
val/mlm_loss: 1.599500060081482


Validation: |                                                                                                 …

val/accuracy: 0.7375964522361755
val/f1: 0.7387808561325073
val/taskclf_loss: 0.6342988610267639
val/loss: 0.8110253214836121
val/mlm_loss: 1.583274245262146


Validation: |                                                                                                 …

val/accuracy: 0.7463774681091309
val/f1: 0.7476124167442322
val/taskclf_loss: 0.617650032043457
val/loss: 0.7954667210578918
val/mlm_loss: 1.5724796056747437


Validation: |                                                                                                 …

val/accuracy: 0.7573944926261902
val/f1: 0.7589746117591858
val/taskclf_loss: 0.60505610704422
val/loss: 0.7842205762863159
val/mlm_loss: 1.5671229362487793


Validation: |                                                                                                 …

val/accuracy: 0.7616150379180908
val/f1: 0.7627732753753662
val/taskclf_loss: 0.5990284085273743
val/loss: 0.7811374068260193
val/mlm_loss: 1.5769068002700806


Validation: |                                                                                                 …

val/accuracy: 0.759630560874939
val/f1: 0.7609507441520691
val/taskclf_loss: 0.6102660298347473
val/loss: 0.7902955412864685
val/mlm_loss: 1.5769773721694946


Validation: |                                                                                                 …

val/accuracy: 0.7632462382316589
val/f1: 0.7639670372009277
val/taskclf_loss: 0.6087878942489624
val/loss: 0.7866470217704773
val/mlm_loss: 1.5638452768325806


Validation: |                                                                                                 …

val/accuracy: 0.7624714374542236
val/f1: 0.7640339136123657
val/taskclf_loss: 0.6399323344230652
val/loss: 0.8094297051429749
val/mlm_loss: 1.5500892400741577


Validation: |                                                                                                 …

val/accuracy: 0.7693154215812683
val/f1: 0.7698902487754822
val/taskclf_loss: 0.6641110181808472
val/loss: 0.8290634155273438
val/mlm_loss: 1.5498626232147217


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.7700018286705017
val/f1: 0.7707831859588623
val/taskclf_loss: 0.6714035868644714
val/loss: 0.8355389833450317
val/mlm_loss: 1.5527682304382324


Best checkpoint path: checkpoints/lightning_logs/version_2/checkpoints/task-SFPelt-epoch=04-val_loss=0.78.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_2/checkpoints/SFPelt-epoch=05.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Source genre: slate
Target genre: fiction
Number of target samples: 69613


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.7103971242904663, 'source_test/accuracy': 0.7587365508079529, 'source_test/f1': 0.7598097324371338, 'source_test/f1_macro': 0.7468442320823669, 'source_test/f1_micro': 0.7587365508079529, 'target_test/loss': 0.6367733478546143, 'target_test/accuracy': 0.7788978219032288, 'target_test/f1': 0.7791525721549988, 'target_test/f1_macro': 0.7698794603347778, 'target_test/f1_micro': 0.7788978219032288}]
Best checkpoint path: checkpoints/lightning_logs/version_2/checkpoints/task-SFPelt-epoch=04-val_loss=0.78.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_2/checkpoints/SFPelt-epoch=05.ckpt


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.6256197690963745, 'source_test/accuracy': 0.7511760592460632, 'source_test/f1': 0.7525800466537476, 'source_test/f1_macro': 0.7384580373764038, 'source_test/f1_micro': 0.7511760592460632, 'target_test/loss': 0.573711633682251, 'target_test/accuracy': 0.7758736610412598, 'target_test/f1': 0.7763957977294922, 'target_test/f1_macro': 0.7662782669067383, 'target_test/f1_micro': 0.7758736610412598}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on saved epoch: [{'source_test/loss': 0.6394801735877991, 'source_test/accuracy': 0.7544403076171875, 'source_test/f1': 0.7553904056549072, 'source_test/f1_macro': 0.742258608341217, 'source_test/f1_micro': 0.7544403076171875, 'target_test/loss': 0.5681809186935425, 'target_test/accuracy': 0.7753696441650391, 'target_test/f1': 0.7761570811271667, 'target_test/f1_macro': 0.7659206986427307, 'target_test/f1_micro': 0.7753696441650391}]


In [6]:
results.items()

dict_items([('last_epoch', {'source_test/loss': [0.7030393481254578, 0.7095915079116821, 0.7103971242904663], 'source_test/accuracy': [0.750912070274353, 0.7622887492179871, 0.7587365508079529], 'source_test/f1': [0.7503563165664673, 0.7625941038131714, 0.7598097324371338], 'source_test/f1_macro': [0.7413011193275452, 0.7515682578086853, 0.7468442320823669], 'source_test/f1_micro': [0.750912070274353, 0.7622887492179871, 0.7587365508079529], 'target_test/loss': [0.6300202012062073, 0.6588518619537354, 0.6367733478546143], 'target_test/accuracy': [0.7741455435752869, 0.7688412070274353, 0.7788978219032288], 'target_test/f1': [0.773404598236084, 0.7686322927474976, 0.7791525721549988], 'target_test/f1_macro': [0.7648960947990417, 0.7573198676109314, 0.7698794603347778], 'target_test/f1_micro': [0.7741455435752869, 0.7688412070274353, 0.7788978219032288]}), ('best_model', {'source_test/loss': [0.6318674087524414, 0.6290572285652161, 0.6256197690963745], 'source_test/accuracy': [0.74236750

In [7]:
# Calculate mean and standard deviation for each scenario
mean_results = {scenario: {key: np.mean(values) for key, values in metrics.items()} for scenario, metrics in results.items()}
std_results = {scenario: {key: np.std(values) for key, values in metrics.items()} for scenario, metrics in results.items()}

# Log mean and standard deviation results to wandb
# wandb.init(project=project_name, name=f'{domain}_mean_results')
# for scenario in mean_results:
#     for key, value in mean_results[scenario].items():
#         wandb.log({f"{scenario}/{key}": value})
#         wandb.log({f"{scenario}/{key}_std": std_results[scenario][key]})
# wandb.finish()

print("Mean Results:", mean_results)
print("Standard Deviation Results:", std_results)

# # Save the best model's adapter
# if model:
#     adapter_save_path = f"../../saved/adapter_after_run/{hparams['task_adapter_name']}"
#     model.save_adapter(adapter_save_path, hparams['task_adapter_name'])
#     print(f"Adapter saved to {adapter_save_path}")
# else:
#     print("No best model to save.")

Mean Results: {'last_epoch': {'source_test/loss': 0.7076759934425354, 'source_test/accuracy': 0.7573124567667643, 'source_test/f1': 0.7575867176055908, 'source_test/f1_macro': 0.7465712030728658, 'source_test/f1_micro': 0.7573124567667643, 'target_test/loss': 0.6418818036715189, 'target_test/accuracy': 0.7739615241686503, 'target_test/f1': 0.7737298210461935, 'target_test/f1_macro': 0.7640318075815836, 'target_test/f1_micro': 0.7739615241686503}, 'best_model': {'source_test/loss': 0.628848135471344, 'source_test/accuracy': 0.747575839360555, 'source_test/f1': 0.7484805782636007, 'source_test/f1_macro': 0.7356019417444865, 'source_test/f1_micro': 0.747575839360555, 'target_test/loss': 0.5661792556444804, 'target_test/accuracy': 0.7769857247670492, 'target_test/f1': 0.7774166862169901, 'target_test/f1_macro': 0.7661312023798624, 'target_test/f1_micro': 0.7769857247670492}, 'epoch_saved': {'source_test/loss': 0.6326724688212076, 'source_test/accuracy': 0.751936117808024, 'source_test/f1':

In [8]:
print('dones')

dones


In [9]:
best_val_loss

inf