In [1]:
!pip install -U transformers huggingface_hub
!pip install -U accelerate
!pip install wandb

Collecting transformers
  Downloading transformers-4.42.4-py3-none-any.whl (9.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.2
    Uninstalling transformers-4.41.2:
      Successfully uninstalled transformers-4.41.2
Successfully installed transformers-4.42.4
Collecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 

In [2]:
!pip install omegaconf datasets

Collecting omegaconf
  Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting antlr4-python3-runtime==4.9.* (from omegaconf)
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downl

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from huggingface_hub import HfFolder, notebook_login

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import yaml

def get_configs(path: str):
    params = yaml.safe_load(open(path, "r", encoding="utf-8"))
    return params

# Init HF dataset 💢💢💢

In [5]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from datasets import Dataset as HFDataset
from omegaconf import OmegaConf, DictConfig


class ABSADataset(Dataset):
    def __init__(self, tokenizer=None, conf: DictConfig = None) -> None:
        super().__init__()
        self.conf = OmegaConf.create(conf)
        self.tokenizer = tokenizer  # tokenizer as transform
        self.dataset = self._create_hf_ds(csv_path=self.conf.model.train.train_dir)  # create HF dataset
        self.dataset = self.dataset.train_test_split(test_size=0.2)

    def __getitem__(self, index):
        pass

    def setup_absa_hf_dataset(self):

        def _tokenize(batch):
            return self.tokenizer(batch["review"], padding=True, truncation=True)

        train_dataset = self.dataset["train"]
        dev_dataset = self.dataset["test"].shard(num_shards=2, index=0)
        test_dataset = self.dataset["test"].shard(num_shards=2, index=1)


        # Apply tokenization and label mapping to the datasets
        train_dataset = train_dataset.map(_tokenize, batched=True)
        dev_dataset = dev_dataset.map(_tokenize, batched=True)
        test_dataset = test_dataset.map(_tokenize, batched=True)

        train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
        dev_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
        test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

        return train_dataset, dev_dataset, test_dataset

    @staticmethod
    def _create_hf_ds(csv_path: str):
        train_csv = pd.read_csv(csv_path)
        ds = HFDataset.from_pandas(train_csv)
        return ds

    def __len__(self):
        return len(self.dataset)

# Model 4 finetuning 💢💢💢💢

In [6]:
!pip install evaluate
# https://github.com/huggingface/evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m81.9/84.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.2


In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, \
    pipeline, AutoConfig
from omegaconf import OmegaConf, DictConfig
from transformers import Trainer, TrainingArguments
import torch.nn.functional as F
import evaluate
import numpy as np

class PretrainedModelABSA:
    def __init__(self, conf: DictConfig = None) -> None:
        self.conf = OmegaConf.create(conf)

        # get pretrained model
        self.auto_conf = AutoConfig.from_pretrained(self.conf.model.pretrained.name)
        self.auto_conf.id2label = {i: label for i, label in enumerate(self.conf.model.label_aspects)}
        self.auto_conf.label2id = {label: i for i, label in enumerate(self.conf.model.label_aspects)}
        print(self.auto_conf.label2id)
        self.auto_conf.update({"label2id": self.auto_conf.label2id})
        self.auto_conf.update({"id2label": self.auto_conf.id2label})
        self.auto_conf.num_labels = 11
        self.model, self.tokenizer = self.get_pretrained_model_and_tokenizer()
        self.accucracy_metric, self.precision_metric, self.recall_metric = evaluate.load("accuracy"), evaluate.load("precision"), evaluate.load("recall")


    def get_pretrained_model_and_tokenizer(self):
        # using transformers package to get pretrained model
        model = AutoModelForSequenceClassification.from_pretrained(
            self.conf.model.pretrained.name,
            force_download=True,
            # id2label=self.auto_conf.id2label,
            # label2id=self.auto_conf.label2id,
            config=self.auto_conf
        )  # get pretrained model name

        tokenizer = AutoTokenizer.from_pretrained(self.conf.model.pretrained.name)
        # model.eval()  #

        return model, tokenizer

    def get_model_parameters(self):
        params = sum([p.nelement() for p in self.model.parameters()])
        return params # sum of parameters

    def setup_dataset(self):
        absa = ABSADataset(tokenizer=self.tokenizer, conf=self.conf)
        train_set, dev_set, _ = absa.setup_absa_hf_dataset()

        return train_set, dev_set

    def prepare_trainer4finetuning(self):
        if self.conf.model.pretrained.freeze == True:
            for param in self.model.base_model.parameters():
                param.requires_grad = False


        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)
            acc = self.accucracy_metric.compute(predictions=predictions, references=labels)
            precision = self.precision_metric.compute(predictions=predictions, references=labels, average='macro')
            recall = self.recall_metric.compute(predictions=predictions, references=labels, average='macro')
            return {"accuracy": acc, "precision": precision, "recall": recall}


        # init dataset
        train_set, dev_set = self.setup_dataset()

        train_args = TrainingArguments(
            output_dir=self.conf.model.train.out_dir,
            num_train_epochs=12,
            per_device_train_batch_size=self.conf.model.train.batch_size,
            per_device_eval_batch_size=self.conf.model.train.batch_size,
            evaluation_strategy="epoch",
            logging_dir=self.conf.model.train.log_dir,
            logging_strategy=self.conf.model.train.log_strategy,
            logging_steps=self.conf.model.train.log_steps,
            learning_rate=self.conf.model.train.lr,
            weight_decay=self.conf.model.train.weight_decay,
            warmup_steps=self.conf.model.train.warmup_step,
            report_to=self.conf.model.train.report_to, # wandb
            push_to_hub=self.conf.model.train.push_to_hub,  # hub
            hub_strategy=self.conf.model.train.hub_strategy,  # hub
            hub_model_id=self.conf.model.train.hub_model_id,  # hub
        )


        trainer = Trainer(
            model=self.model,
            tokenizer=self.tokenizer,
            args=train_args,
            train_dataset=train_set,
            eval_dataset=dev_set,
            compute_metrics=compute_metrics
        )

        return trainer


    @staticmethod
    def model_inference(text: str, model_name):
        # model name must be pretrained model
        tokenizer = AutoTokenizer.from_pretrained(model_name, force_download=True)
        pipe = pipeline(
            "text-classification", model=model_name, tokenizer=tokenizer, device=0
        )
        result = pipe(text)
        predicted_label = result[0]["label"]
        print(predicted_label)

In [None]:
!nvidia-smi

In [8]:
config_path = "/content/drive/MyDrive/nguyenanh-projects/internship-2024/absa_model.yaml"
data_path = "/content/drive/MyDrive/nguyenanh-projects/internship-2024/dataset/gen/gen_ds.csv"

In [9]:
out_dir = "/content/drive/MyDrive/nguyenanh-projects/internship-2024/saved_model"
log_dir = "/content/drive/MyDrive/nguyenanh-projects/internship-2024/log"

# repo of mode hf
hub_model_id = "nguyenanh2803/absa-train-service"

conf = get_configs(config_path)
conf["model"]["pretrained"]["name"] = "FacebookAI/xlm-roberta-base"
conf["model"]["train"]["train_dir"] = data_path
conf["model"]["pretrained"]["freeze"] = False
conf['model']['train']['lr'] = 2e-4
conf['model']['train']['out_dir'] = out_dir
conf['model']['train']['log_dir'] = log_dir
conf['model']['train']['hub_model_id'] = hub_model_id

## Init model object

## Init finetuning model ☕☕☕

In [10]:
iabsa_model = PretrainedModelABSA(conf=conf)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

{'Price fairness': 0, 'Data availability': 1, 'Cleanliness': 2, 'Punctuality': 3, 'Facilities': 4, 'Accessibility': 5, 'Satisfactions': 6, 'Staff service': 7, 'Safety': 8, 'Others': 9}


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [None]:
# prompt: get model parameters

params = iabsa_model.get_model_parameters()
print(f"Number of parameters: {params}")


In [None]:
train_set, _ = iabsa_model.setup_dataset()

## Check dataset ☕☕☕☕☕

In [None]:
unique_labels = train_set.unique('label')
unique_labels

## Finetuning XLM Roberta ⚡

In [11]:
trainer = iabsa_model.prepare_trainer4finetuning()

Map:   0%|          | 0/12578 [00:00<?, ? examples/s]

Map:   0%|          | 0/1573 [00:00<?, ? examples/s]

Map:   0%|          | 0/1572 [00:00<?, ? examples/s]



In [12]:
import wandb
import time
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
wandb.init(project=f"absa-{time.time()}")
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mcunho2803032003[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss


# Push to hub ⌛⌛⌛⌛

In [None]:
!huggingface-cli whoami

In [None]:
# model card
trainer.create_model_card()

# hub
trainer.push_to_hub()

# Inference ⚡⚡⚡

In [None]:
text = "Great experience"
iabsa_model.model_inference(text, hub_model_id)