In [None]:
if True:
    %rm -rf source
    !git clone https://github.com/HaiDang2001VN/albert-imdb.git source
    %pip install -U wandb transformers evaluate huggingface_hub accelerate
    colab = True
else:
    print("Not running on Google Colab, skip this cell!")
    colab = False

Cloning into 'source'...
remote: Enumerating objects: 224, done.[K
remote: Counting objects: 100% (224/224), done.[K
remote: Compressing objects: 100% (108/108), done.[K
remote: Total 224 (delta 116), reused 217 (delta 111), pack-reused 0[K
Receiving objects: 100% (224/224), 2.52 MiB | 8.36 MiB/s, done.
Resolving deltas: 100% (116/116), done.
Collecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Downloading transformers-4.39.3-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface_hub
  Downloading huggingface_hub-0.2

In [None]:
import json
from pprint import pprint

# Load path to pre-processed data in configs folder.
# The config file name depends on the runtime flag `colab` set by the setup cell.
config_file = "vlsp_colab" if colab else "vlsp_local"
# Read the JSON explicitly as UTF-8 rather than relying on the platform's
# locale-dependent default encoding.
with open(f"configs/{config_file}.json", encoding="utf-8") as f:
    configs = json.load(f)

pprint(configs)

In [None]:
import wandb

# Three-part run identifier; later cells index into `run_model` to pick the
# model checkpoint (items 0 and 1) and the pre-processing format (item 2).
run_model = ["phobert", "base", "format0"]
family, size, data_format = run_model
wandb.init(
    project="advanced_ai_imdb_dataset",
    name=f"{family}-{size}-run-vlsp-{data_format}",
)

In [None]:
from datasets import load_dataset

source_path = configs["path"]
format_name = run_model[2]

# All three splits live in the same per-format directory; only the file stem
# differs (the "val" split is stored as dev.jsonl).
data_dir = f"{source_path}/vlsp_preprocessed/{format_name}"
split_files = {
    split: f"{data_dir}/{stem}.jsonl"
    for split, stem in (("train", "train"), ("val", "dev"), ("test", "test"))
}
dataset = load_dataset("json", data_files=split_files)

In [None]:
# Resolve the checkpoint location from the config, keyed by the first two
# entries of `run_model`.
model_family, model_size = run_model[0], run_model[1]
model_path = configs["model_path"][model_family][model_size]
print(model_path)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and classifier are both loaded from the same checkpoint path.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Sequence-classification head with four output classes; hidden states are
# not returned.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=4,
    output_hidden_states=False,
)

In [None]:
def preprocess_token(example):
    """Tokenize the ``sentence`` field of a (batched) example.

    Padding is intentionally not applied here: the ``DataCollatorWithPadding``
    passed to the ``Trainer`` pads each batch when it is collated, so padding
    every example to the maximum length up front only inflates the stored
    dataset and the compute per batch. ``return_tensors`` is omitted as well,
    since ``Dataset.map`` stores plain lists of token ids.

    Args:
        example: Mapping with a ``"sentence"`` entry (a string, or a list of
            strings when ``map`` is called with ``batched=True``).

    Returns:
        The tokenizer output for the sentence(s), truncated to the
        tokenizer's maximum length.
    """
    return tokenizer(example["sentence"], truncation=True)

In [None]:
# Apply the tokenizer to every split, feeding examples to
# `preprocess_token` in batches.
tokenized_dataset = dataset.map(
    preprocess_token,
    batched=True,
)

In [None]:
from transformers import DataCollatorWithPadding

# Collator that uses the tokenizer to pad the features of each batch.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
import evaluate

# Load the four evaluation metrics used by `compute_metrics`, in a fixed order.
accuracy, f1, precision, recall = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute classification metrics for a ``Trainer`` evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either
            an array of per-class scores with the class axis last, or a tuple
            whose first element is that array; ``labels`` holds the reference
            class ids.

    Returns:
        dict: ``accuracy`` plus ``f1``, ``precision`` and ``recall``, each
        under ``macro``, ``micro`` and ``weighted`` averaging, keyed as
        ``"<metric>_<average>"``.
    """
    predictions, labels = eval_pred
    # Unwrap BEFORE touching the contents: indexing ``predictions[1]`` up
    # front (as the old debug prints did) fails for a 1-tuple or for a
    # single-row score array.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.argmax(predictions, axis=-1)

    metrics = {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
    }
    # Each metric module reports its value under its own name.
    for name, metric in (("f1", f1), ("precision", precision), ("recall", recall)):
        for average in ("macro", "micro", "weighted"):
            result = metric.compute(
                predictions=predictions, references=labels, average=average
            )
            metrics[f"{name}_{average}"] = result[name]
    return metrics

In [None]:
# Class names in id order; the position in the list is the class id.
id2label = dict(enumerate(['AFFILIATION', 'PART – WHOLE', 'LOCATED', 'PERSONAL - SOCIAL']))
# Reverse lookup: class name -> class id.
label2id = {name: idx for idx, name in id2label.items()}
print(label2id)

{'AFFILIATION': 0, 'PART – WHOLE': 1, 'LOCATED': 2, 'PERSONAL - SOCIAL': 3}


In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

training_args = TrainingArguments(
    # Output / checkpointing
    output_dir="albert_imdb",
    overwrite_output_dir=True,
    save_steps=0.1,
    save_total_limit=1,
    load_best_model_at_end=True,
    # Optimisation: 16 examples per device x 5 accumulation steps
    # = 80 examples per optimizer update on each device.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=25,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=5,
    # Evaluation and logging. Step values below 1 are treated by
    # TrainingArguments as a fraction of the total number of training steps
    # (see the transformers documentation).
    evaluation_strategy="steps",
    eval_steps=0.1,
    per_device_eval_batch_size=16,
    logging_steps=0.05,
    report_to="wandb",
    # push_to_hub=True,
)

# Stop training once 3 consecutive evaluations bring no improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Sanity check: display the first tokenized training example
# (original fields plus the tokenizer's outputs).
tokenized_dataset["train"][0]

{'sentence': 'Trong ảnh : Nghệ_thuật tạo hoa_văn trên trang_phục truyền_thống của người Mông hoa , tại <location> xã Sa_Lông </location> , <location> huyện Mường_Chà </location> .',
 'label': 2,
 'input_ids': [2,
  13,
  38,
  14271,
  40,
  252,
  13,
  45,
  13,
  2723,
  438,
  1,
  38,
  7325,
  38,
  14341,
  20538,
  1,
  2686,
  26407,
  13,
  38,
  13119,
  1,
  3971,
  6335,
  8600,
  8944,
  1,
  96,
  3279,
  13,
  22936,
  13,
  2723,
  13324,
  49,
  21028,
  20538,
  13,
  15,
  5466,
  13,
  1,
  19032,
  1,
  13,
  6791,
  1929,
  1,
  2701,
  13,
  1,
  118,
  19032,
  1,
  13,
  15,
  13,
  1,
  19032,
  1,
  4429,
  8944,
  2832,
  3279,
  1,
  1651,
  13,
  1,
  118,
  19032,
  1,
  13,
  9,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [None]:
# Fine-tune the model with the configured Trainer; evaluation on the "val"
# split and early stopping happen during this call.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro,F1 Weighted,Precision Macro,Precision Micro,Precision Weighted,Recall Macro,Recall Micro,Recall Weighted
63,1.0975,0.475202,0.841402,0.866019,0.841402,0.843592,0.871212,0.841402,0.851553,0.865719,0.841402,0.841402
126,0.3839,0.432957,0.814691,0.818625,0.814691,0.812231,0.860568,0.814691,0.825216,0.791536,0.814691,0.814691
189,0.2818,0.308675,0.8798,0.896656,0.8798,0.87928,0.902773,0.8798,0.881063,0.892207,0.8798,0.8798
252,0.2004,0.277068,0.898164,0.91224,0.898164,0.897111,0.921658,0.898164,0.899484,0.905633,0.898164,0.898164
315,0.1352,0.30887,0.894825,0.91036,0.894825,0.894403,0.917921,0.894825,0.897033,0.905019,0.894825,0.894825


Type of type(predictions)=<class 'numpy.ndarray'>
Type of type(labels)=<class 'numpy.ndarray'>
Shape of (4,)
Shape of (4,)
Type of type(predictions)=<class 'numpy.ndarray'>
Type of type(labels)=<class 'numpy.ndarray'>
Shape of (4,)
Shape of (4,)
Type of type(predictions)=<class 'numpy.ndarray'>
Type of type(labels)=<class 'numpy.ndarray'>
Shape of (4,)
Shape of (4,)
Type of type(predictions)=<class 'numpy.ndarray'>
Type of type(labels)=<class 'numpy.ndarray'>
Shape of (4,)
Shape of (4,)
Type of type(predictions)=<class 'numpy.ndarray'>
Type of type(labels)=<class 'numpy.ndarray'>
Shape of (4,)
Shape of (4,)


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro,F1 Weighted,Precision Macro,Precision Micro,Precision Weighted,Recall Macro,Recall Micro,Recall Weighted
63,1.0975,0.475202,0.841402,0.866019,0.841402,0.843592,0.871212,0.841402,0.851553,0.865719,0.841402,0.841402
126,0.3839,0.432957,0.814691,0.818625,0.814691,0.812231,0.860568,0.814691,0.825216,0.791536,0.814691,0.814691
189,0.2818,0.308675,0.8798,0.896656,0.8798,0.87928,0.902773,0.8798,0.881063,0.892207,0.8798,0.8798
252,0.2004,0.277068,0.898164,0.91224,0.898164,0.897111,0.921658,0.898164,0.899484,0.905633,0.898164,0.898164
315,0.1352,0.30887,0.894825,0.91036,0.894825,0.894403,0.917921,0.894825,0.897033,0.905019,0.894825,0.894825
378,0.1082,0.332039,0.894825,0.910134,0.894825,0.894178,0.916722,0.894825,0.895615,0.905161,0.894825,0.894825
441,0.0819,0.373421,0.891486,0.907656,0.891486,0.891693,0.91107,0.891486,0.893316,0.905254,0.891486,0.891486


Type of type(predictions)=<class 'numpy.ndarray'>
Type of type(labels)=<class 'numpy.ndarray'>
Shape of (4,)
Shape of (4,)
Type of type(predictions)=<class 'numpy.ndarray'>
Type of type(labels)=<class 'numpy.ndarray'>
Shape of (4,)
Shape of (4,)


TrainOutput(global_step=441, training_loss=0.27899570827311126, metrics={'train_runtime': 3511.8255, 'train_samples_per_second': 14.238, 'train_steps_per_second': 0.178, 'total_flos': 843291718778880.0, 'train_loss': 0.27899570827311126, 'epoch': 17.64})

In [None]:
# Evaluate on the Trainer's eval_dataset (the "val" split); metric keys are
# reported with the default "eval_" prefix.
trainer.evaluate()

Type of type(predictions)=<class 'numpy.ndarray'>
Type of type(labels)=<class 'numpy.ndarray'>
Shape of (4,)
Shape of (4,)


{'eval_loss': 0.27706798911094666,
 'eval_accuracy': 0.8981636060100167,
 'eval_f1_macro': 0.9122397640750749,
 'eval_f1_micro': 0.8981636060100167,
 'eval_f1_weighted': 0.8971114223634937,
 'eval_precision_macro': 0.9216578612895938,
 'eval_precision_micro': 0.8981636060100167,
 'eval_precision_weighted': 0.8994842983190149,
 'eval_recall_macro': 0.9056329585751128,
 'eval_recall_micro': 0.8981636060100167,
 'eval_recall_weighted': 0.8981636060100167,
 'eval_runtime': 22.2396,
 'eval_samples_per_second': 26.934,
 'eval_steps_per_second': 1.709,
 'epoch': 17.64}

In [None]:
# Evaluate on the held-out test split; metrics are reported under a "test_"
# prefix so they stay separate from the validation metrics.
trainer.evaluate(tokenized_dataset["test"], metric_key_prefix="test")

In [None]:
# `PredictionOutput.label_ids` holds the reference labels, not the model's
# output, so using it here would make every "prediction" trivially correct.
# The predicted class is the argmax over the scores in `.predictions`
# (`np` is imported in the metrics cell above).
test_output = trainer.predict(tokenized_dataset["test"])
test_scores = test_output.predictions
if isinstance(test_scores, tuple):
    # Same convention as compute_metrics: the class scores come first.
    test_scores = test_scores[0]
predictions = np.argmax(test_scores, axis=-1)
print(predictions.shape)

In [None]:
# Print the sentence, the stored label id and the predicted label name for
# the first ten test examples.
for i in range(10):
    sample = tokenized_dataset["test"][i]
    predicted_name = id2label[predictions[i]]
    print(sample["sentence"])
    print(sample["label"])
    print(predicted_name)

In [None]:
# Close the Weights & Biases run started by wandb.init() earlier in the notebook.
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▃▁▆███▇█
eval/f1_macro,▅▁▇█████
eval/f1_micro,▃▁▆███▇█
eval/f1_weighted,▄▁▇█████
eval/loss,█▇▂▁▂▃▄▁
eval/precision_macro,▂▁▆██▇▇█
eval/precision_micro,▃▁▆███▇█
eval/precision_weighted,▃▁▆███▇█
eval/recall_macro,▆▁▇█████
eval/recall_micro,▃▁▆███▇█

0,1
eval/accuracy,0.89816
eval/f1_macro,0.91224
eval/f1_micro,0.89816
eval/f1_weighted,0.89711
eval/loss,0.27707
eval/precision_macro,0.92166
eval/precision_micro,0.89816
eval/precision_weighted,0.89948
eval/recall_macro,0.90563
eval/recall_micro,0.89816
