In [1]:
import wandb

wandb.init(project="advanced_ai_imdb_dataset", name="alberta-test-run-dang-01")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjoatmon[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
import json

# Load path to pre-processed data in configs folder
with open("configs/vlsp_local.json") as f:
    configs = json.load(f)

print(configs)

In [1]:
from datasets import load_dataset

data_path = configs["data_path"]
format_name = "format0"
# format_name = "format1"
# train_set = load_dataset(
#     "json", data_files=f"./vlsp_preprocessed/{format_name}/train.jsonl", split="train"
# )
# val_set = load_dataset(
#     "json", data_files=f"./vlsp_preprocessed/{format_name}/dev.jsonl", split="train"
# )
# test_set = load_dataset(
#     "json", data_files=f"./vlsp_preprocessed/{format_name}/test.jsonl", split="train"
# )
dataset = load_dataset(
    "json",
    data_files={
        "train": f"./vlsp_preprocessed/{format_name}/train.jsonl",
        "val": f"./vlsp_preprocessed/{format_name}/dev.jsonl",
        "test": f"./vlsp_preprocessed/{format_name}/test.jsonl",
    },
)

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
model = AutoModelForSequenceClassification.from_pretrained("albert-base-v2", num_labels=4)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
def preprocess_token(example):
    return tokenizer(
        example["sentence"], padding="max_length", truncation=True, return_tensors="pt"
    )

In [4]:
tokenized_dataset = dataset.map(preprocess_token, batched=True)

In [5]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [6]:
import evaluate

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

In [7]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels),
        "f1": f1.compute(predictions=predictions, references=labels),
        "precision": precision.compute(predictions=predictions, references=labels),
        "recall": recall.compute(predictions=predictions, references=labels),
    }

In [8]:
id2label = {0: 'AFFILIATION', 1: 'PART – WHOLE', 2: 'LOCATED', 3: 'PERSONAL - SOCIAL'}
label2id = {'AFFILIATION': 0, 'PART – WHOLE': 1, 'LOCATED': 2, 'PERSONAL - SOCIAL': 3}

In [12]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir="albert_imdb",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    overwrite_output_dir=True,
    eval_steps=500,
    save_total_limit=1,
    # report_to="wandb",
    # push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [10]:
tokenized_dataset["train"][0]

{'sentence': '<location1/> <sep> <location2/> <sep> Trong ảnh : Nghệ_thuật tạo hoa_văn trên trang_phục truyền_thống của người Mông hoa , tại <location2/> , <location1/> .',
 'label': 2,
 'input_ids': [2,
  13,
  1,
  19032,
  8197,
  1,
  13,
  1,
  18,
  3492,
  1,
  13,
  1,
  19032,
  9298,
  1,
  13,
  1,
  18,
  3492,
  1,
  13,
  38,
  14271,
  40,
  252,
  13,
  45,
  13,
  2723,
  438,
  1,
  38,
  7325,
  38,
  14341,
  20538,
  1,
  2686,
  26407,
  13,
  38,
  13119,
  1,
  3971,
  6335,
  8600,
  8944,
  1,
  96,
  3279,
  13,
  22936,
  13,
  2723,
  13324,
  49,
  21028,
  20538,
  13,
  15,
  5466,
  13,
  1,
  19032,
  9298,
  1,
  13,
  15,
  13,
  1,
  19032,
  8197,
  1,
  13,
  9,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [13]:
trainer.train()

  0%|          | 1/1250 [01:55<40:06:41, 115.61s/it]

KeyboardInterrupt: 

In [None]:
wandb.finish()