In [2]:
# Install the training stack. transformers and accelerate are pinned to the
# versions this notebook was last executed with (see the saved install log), so
# a fresh "Restart & Run All" resolves the same Trainer / TrainingArguments API.
# `%pip` (instead of `!pip`) installs into the environment of the running kernel.
%pip install -U transformers==4.42.4 huggingface_hub
%pip install -U accelerate==0.32.1
%pip install wandb

Collecting transformers
  Downloading transformers-4.42.4-py3-none-any.whl (9.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.2
    Uninstalling transformers-4.41.2:
      Successfully uninstalled transformers-4.41.2
Successfully installed transformers-4.42.4
Collecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 

In [3]:
# Config and dataset libraries, pinned to the versions recorded in this cell's
# saved output so the notebook keeps reproducing the same environment.
%pip install omegaconf==2.3.0 datasets==2.20.0

Collecting omegaconf
  Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting antlr4-python3-runtime==4.9.* (from omegaconf)
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downl

In [1]:
# Mount Google Drive: the YAML config, the dataset manifest and the output/log
# folders used further down all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from huggingface_hub import HfFolder, notebook_login

In [3]:
# Log in to the Hugging Face Hub through the interactive widget.
# Presumably required for the push-to-hub step near the end of the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import yaml

def get_configs(path: str):
    """Load a YAML configuration file and return its parsed contents.

    Args:
        path: Filesystem path of a UTF-8 encoded YAML file.

    Returns:
        The document as parsed by ``yaml.safe_load``. The rest of this
        notebook indexes the result as a nested dict
        (``conf["model"]["train"][...]``).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # The context manager closes the handle even when parsing fails; the bare
    # ``open(...)`` used before left the file open until garbage collection.
    with open(path, "r", encoding="utf-8") as stream:
        return yaml.safe_load(stream)

# Initialize the Hugging Face dataset 💢💢💢

In [5]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from datasets import Dataset as HFDataset
from omegaconf import OmegaConf, DictConfig


class ABSADataset(Dataset):
    """Builds tokenized train/dev/test splits for aspect classification from a CSV file.

    The CSV referenced by ``conf.model.train.train_dir`` (despite the name it is
    read as a single CSV file) is loaded with pandas, wrapped in a Hugging Face
    ``Dataset`` and split 80/20. ``setup_absa_hf_dataset`` then halves the 20%
    hold-out into a dev and a test split and tokenizes the ``review`` column.
    The CSV must therefore provide at least the columns ``review`` and ``label``.

    Attributes:
        conf: OmegaConf view of the configuration passed in.
        tokenizer: Callable tokenizer applied to batches of reviews.
        seed: Seed used for the train/hold-out split.
        dataset: ``DatasetDict`` with the keys ``"train"`` and ``"test"``.
    """

    # Columns exposed as torch tensors once a split has been tokenized.
    _TORCH_COLUMNS = ("input_ids", "attention_mask", "label")

    def __init__(self, tokenizer=None, conf: DictConfig = None, seed: int = 42) -> None:
        """Load the CSV and create the 80/20 train/hold-out split.

        Args:
            tokenizer: Tokenizer used later by ``setup_absa_hf_dataset``.
            conf: Configuration exposing ``model.train.train_dir`` (CSV path).
            seed: Seed for the random split. A fixed default keeps the split
                identical across instances; this matters because the notebook
                builds the dataset more than once (for inspection and again for
                training). Pass ``None`` to get an unseeded split.
        """
        super().__init__()
        self.conf = OmegaConf.create(conf)
        self.tokenizer = tokenizer  # tokenizer as transform
        self.seed = seed
        full_ds = self._create_hf_ds(csv_path=self.conf.model.train.train_dir)
        self.dataset = full_ds.train_test_split(test_size=0.2, seed=seed)

    def __getitem__(self, index):
        """Item access is not supported; consume the splits returned by ``setup_absa_hf_dataset``.

        Raises:
            NotImplementedError: Always. The previous stub returned ``None`` for
                every index, which fails much later and far less clearly.
        """
        raise NotImplementedError(
            "ABSADataset does not support item access; "
            "use setup_absa_hf_dataset() to obtain the train/dev/test splits"
        )

    def setup_absa_hf_dataset(self):
        """Tokenize the data and return ``(train, dev, test)`` datasets in torch format.

        The hold-out split is sharded in two: shard 0 becomes dev, shard 1 test.

        Returns:
            Tuple of three Hugging Face datasets exposing ``input_ids``,
            ``attention_mask`` and ``label`` as torch tensors.

        Raises:
            ValueError: If no tokenizer was supplied at construction time.
        """
        if self.tokenizer is None:
            raise ValueError("ABSADataset needs a tokenizer to build the tokenized splits")

        def _tokenize(batch):
            # padding=True pads to the longest sequence within each mapped batch.
            return self.tokenizer(batch["review"], padding=True, truncation=True)

        def _prepare(split):
            # Tokenize one split and restrict its output to the model inputs + label.
            tokenized = split.map(_tokenize, batched=True)
            tokenized.set_format("torch", columns=list(self._TORCH_COLUMNS))
            return tokenized

        holdout = self.dataset["test"]
        train_dataset = _prepare(self.dataset["train"])
        dev_dataset = _prepare(holdout.shard(num_shards=2, index=0))
        test_dataset = _prepare(holdout.shard(num_shards=2, index=1))

        return train_dataset, dev_dataset, test_dataset

    @staticmethod
    def _create_hf_ds(csv_path: str):
        """Read ``csv_path`` with pandas and wrap the frame in a Hugging Face ``Dataset``."""
        return HFDataset.from_pandas(pd.read_csv(csv_path))

    def __len__(self):
        """Return the total number of examples across all splits.

        ``self.dataset`` is a ``DatasetDict``; ``len()`` of it is the number of
        splits (always 2), which is what this method used to return.
        """
        return sum(split.num_rows for split in self.dataset.values())

# Model for fine-tuning 💢💢💢💢

In [6]:
# Metrics library: https://github.com/huggingface/evaluate
# Pinned to the version recorded in this cell's saved output.
%pip install evaluate==0.4.2

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.2


In [14]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, \
    pipeline, AutoConfig
from omegaconf import OmegaConf, DictConfig
from transformers import Trainer, TrainingArguments
import torch.nn.functional as F
import evaluate
import numpy as np

class PretrainedModelABSA:
    """Wraps a pretrained sequence-classification model for aspect fine-tuning.

    Everything is driven by the configuration:

    * ``model.pretrained.name``   - checkpoint to load
    * ``model.pretrained.freeze`` - freeze the base encoder during training
    * ``model.label_aspects``     - ordered list of class labels
    * ``model.train.*``           - dataset path and training hyper-parameters

    Attributes:
        conf: OmegaConf view of the configuration passed in.
        auto_conf: Model config carrying the label maps and head size.
        model: The loaded sequence-classification model.
        tokenizer: Tokenizer matching the checkpoint.
        metric: ``evaluate`` accuracy metric used during evaluation.
    """

    def __init__(self, conf: DictConfig = None) -> None:
        """Load config, model, tokenizer and metric.

        Prints the label-to-id mapping as a side effect.

        Raises:
            ValueError: If ``conf.model.label_aspects`` is empty.
        """
        self.conf = OmegaConf.create(conf)

        labels = list(self.conf.model.label_aspects)
        if not labels:
            raise ValueError("conf.model.label_aspects must list at least one label")

        # get pretrained model
        self.auto_conf = AutoConfig.from_pretrained(self.conf.model.pretrained.name)
        self.auto_conf.id2label = dict(enumerate(labels))
        self.auto_conf.label2id = {label: i for i, label in enumerate(labels)}
        print(self.auto_conf.label2id)
        # Size the classification head from the label list rather than a
        # hard-coded 11, so the head and the label maps cannot drift apart when
        # the configured aspects change.
        self.auto_conf.num_labels = len(labels)
        self.model, self.tokenizer = self.get_pretrained_model_and_tokenizer()
        self.metric = evaluate.load("accuracy")

    def get_pretrained_model_and_tokenizer(self):
        """Load the classification model (using ``self.auto_conf``) and its tokenizer.

        Returns:
            Tuple ``(model, tokenizer)`` for ``conf.model.pretrained.name``.
        """
        checkpoint = self.conf.model.pretrained.name
        # The label maps and head size travel inside ``config``.
        model = AutoModelForSequenceClassification.from_pretrained(
            checkpoint,
            config=self.auto_conf,
        )
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)

        return model, tokenizer

    def get_model_parameters(self):
        """Return the total number of parameter elements in the model."""
        return sum(p.numel() for p in self.model.parameters())

    def setup_dataset(self):
        """Build a fresh ``ABSADataset`` and return its ``(train, dev, test)`` splits."""
        absa = ABSADataset(tokenizer=self.tokenizer, conf=self.conf)
        return absa.setup_absa_hf_dataset()

    def prepare_trainer4finetuning(self):
        """Create a ``Trainer`` configured from ``conf.model.train``.

        When ``conf.model.pretrained.freeze`` is truthy, every parameter of the
        base model is frozen in place (the change persists on ``self.model``),
        leaving only the classification head trainable. The datasets are rebuilt
        here; the test split is not handed to the trainer.

        Returns:
            A ``Trainer`` with the train split for training and the dev split
            for per-epoch evaluation, reporting accuracy.
        """
        if self.conf.model.pretrained.freeze:
            for param in self.model.base_model.parameters():
                param.requires_grad = False

        def compute_metrics(eval_pred):
            # eval_pred unpacks to (logits, labels); the prediction is the arg-max class.
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)
            return self.metric.compute(predictions=predictions, references=labels)

        # init dataset
        train_set, dev_set, _ = self.setup_dataset()

        train_cfg = self.conf.model.train
        # NOTE(review): newer transformers releases deprecate `evaluation_strategy`
        # (renamed `eval_strategy`) and the Trainer `tokenizer=` argument; both are
        # kept as written for the version this notebook was run with. Confirm
        # before upgrading.
        train_args = TrainingArguments(
            output_dir=train_cfg.out_dir,
            num_train_epochs=train_cfg.epoch,
            per_device_train_batch_size=train_cfg.batch_size,
            per_device_eval_batch_size=train_cfg.batch_size,
            evaluation_strategy="epoch",
            logging_dir=train_cfg.log_dir,
            logging_strategy=train_cfg.log_strategy,
            logging_steps=train_cfg.log_steps,
            learning_rate=train_cfg.lr,
            weight_decay=train_cfg.weight_decay,
            warmup_steps=train_cfg.warmup_step,
            report_to=train_cfg.report_to,  # wandb
            push_to_hub=train_cfg.push_to_hub,  # hub
            hub_strategy=train_cfg.hub_strategy,  # hub
            hub_model_id=train_cfg.hub_model_id,  # hub
        )

        trainer = Trainer(
            model=self.model,
            tokenizer=self.tokenizer,
            args=train_args,
            train_dataset=train_set,
            eval_dataset=dev_set,
            compute_metrics=compute_metrics
        )

        return trainer

In [15]:
# Show the GPU attached to this runtime. A "command not found" result means the
# session exposes no NVIDIA tooling (most likely a CPU-only runtime).
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [16]:
# Project folder on the mounted Google Drive; both inputs below live inside it.
PROJECT_DIR = "/content/drive/MyDrive/nguyenanh-projects/internship-2024"

# YAML with the model/training defaults, and the CSV manifest of labelled reviews.
config_path = f"{PROJECT_DIR}/absa_model.yaml"
data_path = f"{PROJECT_DIR}/dataset/ate/ate-manifest.csv"

In [17]:
# Output locations on the mounted Drive: model checkpoints and training logs.
out_dir = "/content/drive/MyDrive/nguyenanh-projects/internship-2024/saved_model"
log_dir = "/content/drive/MyDrive/nguyenanh-projects/internship-2024/log"

# Hugging Face Hub repository that receives the fine-tuned model.
hub_model_id = "nguyenanh2803/absa-train-service"

# Start from the YAML defaults, then override the run-specific settings.
conf = get_configs(config_path)
conf["model"]["pretrained"].update(
    {
        "name": "FacebookAI/xlm-roberta-base",
        "freeze": True,
    }
)
conf["model"]["train"].update(
    {
        "train_dir": data_path,
        "lr": 2e-4,
        "out_dir": out_dir,
        "log_dir": log_dir,
        "hub_model_id": hub_model_id,
    }
)

## Init model object

## Init finetuning model ☕☕☕

In [18]:
# Instantiate the wrapper: loads the checkpoint and tokenizer named in `conf`
# and prints the label-to-id mapping.
iabsa_model = PretrainedModelABSA(conf=conf)

{'Price': 0, 'Data availability': 1, 'Cleanliness': 2, 'Punctuality': 3, 'Reliability': 4, 'Accidents': 5, 'Maintenance': 6, 'Handling complaints': 7, 'Satisfactions': 8, 'User-friendly payment system': 9, 'Safety': 10}


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Report the total number of parameters of the loaded model.

params = iabsa_model.get_model_parameters()
print(f"Number of parameters: {params}")


Number of parameters: 278052107


In [20]:
# Build the tokenized splits once for inspection only; dev and test are discarded.
# prepare_trainer4finetuning() calls setup_dataset() again and rebuilds its own
# datasets, so `train_set` here is not the object the trainer trains on.
train_set, _, _ = iabsa_model.setup_dataset()

Map:   0%|          | 0/3145 [00:00<?, ? examples/s]

Map:   0%|          | 0/394 [00:00<?, ? examples/s]

Map:   0%|          | 0/393 [00:00<?, ? examples/s]

## Check dataset ☕☕☕☕☕

In [22]:
# Sanity check: list the distinct label ids present in the training split.
unique_labels = train_set.unique('label')
unique_labels

[1, 8, 10, 7, 6, 2, 3, 9, 0, 4, 5]

## Fine-tuning XLM-RoBERTa ⚡

In [23]:
# Build the Trainer: applies the freeze setting from `conf`, re-creates and
# tokenizes the dataset splits, and wires up per-epoch accuracy evaluation.
trainer = iabsa_model.prepare_trainer4finetuning()

Map:   0%|          | 0/3145 [00:00<?, ? examples/s]

Map:   0%|          | 0/394 [00:00<?, ? examples/s]

Map:   0%|          | 0/393 [00:00<?, ? examples/s]



In [24]:
# Authenticate with Weights & Biases (interactive API-key prompt).
# `time` is imported here for the timestamp used by the training cell below.
import wandb
import time
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [25]:
# Log every execution as a separate *run* inside one stable W&B project.
# Previously the timestamp was part of the project name, which created a new
# project per execution and made runs impossible to compare side by side.
wandb.init(project="absa", name=f"absa-{int(time.time())}")
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mcunho2803032003[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Accuracy
1,2.0621,2.102463,0.215736
2,1.7775,1.81474,0.505076
3,1.5351,1.593505,0.545685
4,1.4729,1.519222,0.520305


TrainOutput(global_step=1576, training_loss=1.7539704995712049, metrics={'train_runtime': 2670.2252, 'train_samples_per_second': 4.711, 'train_steps_per_second': 0.59, 'total_flos': 310039249630926.0, 'train_loss': 1.7539704995712049, 'epoch': 4.0})

# Push to hub ⌛⌛⌛⌛

In [26]:
# Confirm which Hugging Face account is logged in before pushing.
!huggingface-cli whoami

nguyenanh2803


In [27]:
# The tokenizer was handed to the Trainer when it was built, so no separate
# tokenizer save is done here.
# NOTE(review): presumably the Trainer uploads it together with the model;
# confirm the tokenizer files are present in the Hub repository.

# Generate the model card (README) in the trainer's output directory.
trainer.create_model_card()

# Upload the model to the Hub repository configured via `hub_model_id`.
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/nguyenanh2803/absa-train-service/commit/c27111b6bfc4b1a0e763195ed04f4118bc649de3', commit_message='End of training', commit_description='', oid='c27111b6bfc4b1a0e763195ed04f4118bc649de3', pr_url=None, pr_revision=None, pr_num=None)

In [39]:
from transformers import pipeline

# Load the published model back from the Hub as a text-classification pipeline.
# NOTE(review): the name `pip` is easy to confuse with the package manager;
# renaming it requires updating the prediction cell below as well.
pip = pipeline('text-classification', conf['model']['train']['hub_model_id'])

Predicted label: Satisfactions


In [54]:
# Smoke-test the published model on a single short review.
text = "Very clean"
result = pip(text)

# result[0] is the prediction for the single input; show its label.
predicted_label = result[0]["label"]
print(f"Predicted label: {predicted_label}")

Predicted label: Cleanliness
