In [None]:
import wandb
import random
import math

: 

In [None]:
wandb.login(key="d1209ac0e4a568567e407d3afef99099ea87a4b1")

[34m[1mwandb[0m: Currently logged in as: [33mjohntoro[0m ([33mai4sw[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/huypn16/.netrc


True

In [None]:
%env WANDB_PROJECT=text-sentiment-analysis
%env WANDB_ENTITY=SC4001

env: WANDB_PROJECT=text-sentiment-analysis
env: WANDB_ENTITY=SC4001


In [None]:
import os
import torch
import wandb
import argparse

from torchmetrics import Accuracy, Precision, Recall, F1Score, AUROC

from transformers import (
    Trainer,
    TrainingArguments,
    EvalPrediction,
    AutoModelForSequenceClassification,
    AutoTokenizer
)
from datasets import load_dataset, Dataset, DatasetDict

def train_val_test_split(dataset: Dataset | DatasetDict, seed: int = 42):
    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    if "validation" in dataset:
        val_dataset = dataset["validation"]
    else:
        train_dataset, val_dataset = train_dataset.train_test_split(test_size=0.3, seed=seed).values()

    return (train_dataset, val_dataset, test_dataset)

def tokenize(dataset: Dataset | DatasetDict, tokenizer_name: str, input_col_name: str = "text"):
    def _tokenize(examples):
        return tokenizer(examples[input_col_name], padding='max_length', truncation=True, max_length=512)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenized_datasets = dataset.map(_tokenize, batched=True).select_columns(["input_ids", "attention_mask", "label"]).with_format("torch")
    return tokenized_datasets

def subset_dataset(dataset: Dataset | DatasetDict,
                   size: int,
                   seed: int = 42):
    shuffled_dataset = dataset.shuffle(seed=seed)
    new_dataset = shuffled_dataset.select(range(size))
    return new_dataset


# default optimizer: AdamW
training_args = TrainingArguments(
    output_dir='./results', # output directory of results
    num_train_epochs=3, # number of train epochs
    report_to='wandb', # enable logging to W&B
    evaluation_strategy='steps', # check evaluation metrics at each epoch
    logging_steps = 10, # we will log every 10 steps
    eval_steps = 200, # we will perform evaluation every 200 steps
    save_steps = 200, # we will save the model every 200 steps
    save_total_limit = 5, # we only save the last 5 checkpoints (including the best one)
    load_best_model_at_end = True, # we will load the best model at the end of training
    metric_for_best_model = 'accuracy', # metric to see which model is better
    # deepspeed=config, # deep speed integration
    #### effective batch_size = per_device_train_batch_size x gradient_accumulation_steps ####
    #### We set effective batch_size to 32 (8 x 4) ####
    per_device_train_batch_size=int(8 / torch.cuda.device_count()), # batch size per device
    per_device_eval_batch_size=int(8 / torch.cuda.device_count()), # eval batch size per device
    gradient_accumulation_steps=4, # gradient accumulation
)


def compute_metrics(pred: EvalPrediction):
    # Extract labels and predictions
    labels = pred.label_ids
    preds = pred.predictions

    # for t5 model, the predictions is in the form of a tuple with the logits as the only element in the tuple
    if isinstance(preds, tuple):
        preds = preds[0]

    num_classes = preds.shape[1]

    # Convert to torch tensors
    labels = torch.tensor(labels)
    preds = torch.tensor(preds)

    # Initialize metrics
    accuracy = Accuracy(task="multiclass", num_classes=num_classes).to(torch.cuda.current_device())
    precision = Precision(task="multiclass", num_classes=num_classes).to(torch.cuda.current_device())
    recall = Recall(task="multiclass", num_classes=num_classes).to(torch.cuda.current_device())
    f1 = F1Score(task="multiclass", num_classes=num_classes).to(torch.cuda.current_device())
    auroc = AUROC(task="multiclass", num_classes=num_classes).to(torch.cuda.current_device())

    # Calculate metrics (automatically does argmax)
    accuracy_score = accuracy(preds, labels)
    precision_score = precision(preds, labels)
    recall_score = recall(preds, labels)
    f1_score = f1(preds, labels)
    auroc_score = auroc(preds, labels)


    # Convert to CPU for serialization
    return {
        "accuracy": accuracy_score.cpu().item(),
        "precision": precision_score.cpu().item(),
        "recall": recall_score.cpu().item(),
        "f1": f1_score.cpu().item(),
        "auroc": auroc_score.cpu().item(),
    }

class CustomTrainer(Trainer):
    def __init__(self, *args, run_name: str = None, trainer_args: TrainingArguments = None, **kwargs):
        if not trainer_args:
            # set default training arguments if not supplied
            trainer_args = training_args
        if run_name:
            trainer_args.run_name = run_name # specify the run name for wandb logging
        super().__init__(*args, compute_metrics=compute_metrics, args=trainer_args, **kwargs)

    def compute_loss(self, model, inputs, **kwargs):
        """
        Override the default compute_loss.
        Use Cross Entropy Loss for multiclass classification (>= 2).
        """
        labels = inputs.pop("labels")

        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # compute cross entropy loss
        loss_func = torch.nn.CrossEntropyLoss()
        loss = loss_func(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        if kwargs.get("return_outputs") is not None:
          return (loss, outputs)
        else:
          return loss

        # return (loss, outputs) if return_outputs else loss




In [None]:
def main(dataset_name, model):
    # parser = argparse.ArgumentParser(description='Small dataset experiments')
    # parser.add_argument("--dataset", choices=['imdb', 'yelp', 'sst2', 'rotten_tomatoes'], default='imdb', help="Dataset to use")
    # parser.add_argument("--model", choices=['bert', 'gpt2', 't5'], default='bert', help='Model to use')
    subset_yelp = True
    run_name = f"{model}-CompareTransformers-{dataset_name}"

    # if args.subset_yelp:
        # run_name += "_subset"

    # set up dataset
    if dataset_name == 'imdb':
        dataset = load_dataset("imdb")
        num_labels = 2
        input_col_name = "text"
    elif dataset_name =='yelp':
        dataset = load_dataset("yelp_review_full")
        num_labels = 5
        input_col_name = "text"
    elif dataset_name == 'sst2':
        dataset = load_dataset("sst2")
        num_labels = 2
        input_col_name = "sentence"
    elif dataset_name == "rotten_tomatoes":
        dataset = load_dataset("rotten_tomatoes")
        num_labels = 2
        input_col_name = "text"
    else:
        raise NotImplementedError

    # set up model
    if model == 'bert':
        model_name = "google-bert/bert-base-uncased"
    elif model == 'gpt':
        model_name = "gpt2"
    elif model == 't5':
        model_name = "t5-base"
    else:
        raise NotImplementedError
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    if model.config.pad_token_id == None:
        model.config.pad_token_id = model.config.eos_token_id

    tokenized_datasets = tokenize(dataset, model_name, input_col_name=input_col_name)
    train_dataset, val_dataset, test_dataset = train_val_test_split(tokenized_datasets)

    if dataset_name =='yelp':
        train_dataset = subset_dataset(train_dataset, size=25_000, seed=42)
        val_dataset = subset_dataset(val_dataset, size=25_000, seed=42)
        test_dataset = subset_dataset(test_dataset, size=25_000, seed=42)

    trainer = CustomTrainer(
        run_name=run_name,
        model=model,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train()

In [14]:
main(dataset_name="imdb", model="bert")

Path, databilder:  imdb


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

[2024-11-18 03:17:36,699] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/datadrive5/huypn16/anaconda3/envs/ana/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/datadrive5/huypn16/anaconda3/envs/ana/compiler_compat/ld: /usr/local/cuda-12.5/lib64/libcufile.so: undefined reference to `dlvsym'
/datadrive5/huypn16/anaconda3/envs/ana/compiler_compat/ld: /usr/local/cuda-12.5/lib64/libcufile.so: undefined reference to `dlopen'
/datadrive5/huypn16/anaconda3/envs/ana/compiler_compat/ld: /usr/local/cuda-12.5/lib64/libcufile.so: undefined reference to `dlclose'
/datadrive5/huypn16/anaconda3/envs/ana/compiler_compat/ld: /usr/local/cuda-12.5/lib64/libcufile.so: undefined reference to `dlerror'
/datadrive5/huypn16/anaconda3/envs/ana/compiler_compat/ld: /usr/local/cuda-12.5/lib64/libcufile.so: undefined reference to `dlsym'
collect2: error: ld returned 1 exit status


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auroc
200,0.1942,0.287564,0.894933,0.894933,0.894933,0.894933,0.968265
400,0.2046,0.212621,0.9208,0.9208,0.9208,0.9208,0.976423
600,0.0969,0.230426,0.923867,0.923867,0.923867,0.923867,0.978179
800,0.0849,0.257355,0.924,0.924,0.924,0.924,0.978525
1000,0.1546,0.234859,0.9256,0.9256,0.9256,0.9256,0.979773
1200,0.0097,0.278169,0.9268,0.9268,0.9268,0.9268,0.980224
1400,0.03,0.320818,0.927333,0.927333,0.927333,0.927333,0.979482
1600,0.0812,0.293192,0.9316,0.9316,0.9316,0.9316,0.980442


In [None]:
main(dataset_name="imdb", model="gpt")

In [None]:
main(dataset_name="imdb", model="t5")