### Import the required libraries

In [None]:
import os
import sys
import torch
import numpy as np
import logging

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, load_metric

# Lower the root logger's threshold to INFO so informational messages are emitted.
logging.getLogger().setLevel(logging.INFO)

### Load model and tokenizer

In [None]:
# Checkpoint identifier shared by the model and the tokenizer below.
MODEL_NAME = "bert-base-uncased"

# Load the pretrained checkpoint with a sequence-classification head configured
# for 5 labels, plus the tokenizer that belongs to the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=5)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def get_device():
    """Pick the torch device to run on: CUDA first, then Apple MPS, then CPU.

    Returns:
        torch.device: ``cuda`` when CUDA is available, otherwise ``mps`` when
        the MPS backend exists and is available, otherwise ``cpu``.
    """
    if torch.cuda.is_available():
        return torch.device('cuda')

    # ``torch.backends.mps`` is only present on PyTorch builds that ship the
    # MPS backend; look it up defensively so builds without it fall through
    # to the CPU instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device('mps')

    return torch.device('cpu')


# Choose the compute device and move the model onto it.
device = get_device()
model.to(device)

# Report which device was selected.
print(f"Using device: {device}")

### Prepare data for training

In [None]:
# Relative paths of the CSV files holding the training and validation examples.
TRAIN_DATASET_PATH = os.path.join("data", "train.csv")
VAL_DATASET_PATH = os.path.join("data", "val.csv")

# Each CSV is loaded separately; later cells read the "train" entry of each
# result, including the one built from the validation file.
train_dataset = load_dataset('csv', data_files=TRAIN_DATASET_PATH)
val_dataset = load_dataset('csv', data_files=VAL_DATASET_PATH)


In [None]:
def prepare_dataset_for_bert_training(dataset):
    """Turn a raw text/label dataset into the columns the BERT trainer consumes.

    The input must expose a "text" column and a "label" column. Tokenization
    uses the module-level ``tokenizer``.

    Args:
        dataset: An object offering ``rename_column`` and ``map``
            (presumably a ``datasets.Dataset`` -- confirm against the caller).

    Returns:
        The transformed dataset: "label" renamed to "labels", every label
        decreased by one, and the tokenizer's output columns added.
    """
    # Rename the 'label' column to 'labels'
    dataset = dataset.rename_column("label", "labels")

    # Tokenize the text data in the dataset.
    # Pad to a fixed length rather than with padding=True: in a batched map,
    # padding=True pads each map batch only to its own longest example, so
    # rows coming from different map batches can end up with different lengths
    # and cannot be stacked into one tensor once they are mixed in a training
    # batch. A fixed length keeps every row the same size.
    dataset = dataset.map(
        lambda examples: tokenizer(examples["text"], truncation=True, padding="max_length"),
        batched=True,
    )

    # Subtract 1 from each value in the 'labels' column
    # (presumably the raw labels are 1-based and the model expects 0-based
    # class ids -- confirm against the CSV files).
    dataset = dataset.map(
        lambda examples: {"labels": [label - 1 for label in examples["labels"]]},
        batched=True,
    )

    return dataset

In [None]:
# Preprocess both datasets. Only the training data is shuffled; 42 is passed as
# the first positional argument of shuffle (presumably the seed -- confirm
# against the datasets API).
train_dataset_tokenized = prepare_dataset_for_bert_training(train_dataset["train"]).shuffle(42)
val_dataset_tokenized = prepare_dataset_for_bert_training(val_dataset["train"])

# Print the number of examples in each preprocessed dataset.
print(f"# Train dataset size: {len(train_dataset_tokenized)}")
print(f"# Validation dataset size: {len(val_dataset_tokenized)}")

### Prepare metrics

In [None]:
# Metric objects used by the evaluation callback defined below.
# NOTE(review): `load_metric` appears to be deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package -- confirm against the
# installed version before upgrading.
f1 = load_metric("f1")
precision = load_metric("precision")
recall = load_metric("recall")

def compute_metrics(eval_pred):
    """Compute weighted F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        dict: The keys "f1", "precision" and "recall", each mapped to the
        weighted-average score reported by the matching module-level metric.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Each metric reports its score under a key equal to its own name, so one
    # loop replaces three near-identical calls.
    scorers = {"f1": f1, "precision": precision, "recall": recall}
    return {
        name: scorer.compute(predictions=predictions, references=labels, average='weighted')[name]
        for name, scorer in scorers.items()
    }


# Backward-compatible alias: the function was originally published under this
# misspelled name, and existing callers still reference it.
computre_metrics = compute_metrics

### Train model

In [None]:
# Directory handed to the trainer as its output location; the final model and
# tokenizer are also saved here at the end of the notebook.
OUTPUT_DIR = os.path.join("models", "bert_base")
# Project name exposed through the environment (presumably read by the
# Weights & Biases integration selected via report_to below -- confirm).
os.environ["WANDB_PROJECT"] = "bert_base_training"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Optimisation hyper-parameters.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Log every 30 steps; evaluate and save on the same 500-step cadence.
    logging_steps=30,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    # Cap on the number of checkpoints kept.
    save_total_limit=3,
    do_train=True,
    do_eval=True,
    # Reload the best checkpoint when training ends; "f1" matches a key
    # returned by the metrics callback.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Run name and reporting backend.
    run_name="bert_base",
    report_to="wandb",
)

In [None]:
# Wire the model, training configuration, preprocessed datasets and the
# metrics callback into a Trainer.
# NOTE(review): neither a tokenizer nor a data collator is passed, so batching
# relies on the Trainer's default collator -- confirm that all tokenized rows
# have the same length.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=val_dataset_tokenized,
    compute_metrics=computre_metrics
)

# Run the training loop.
trainer.train()

### Save model

In [None]:
# Save the trainer's model and the tokenizer side by side in OUTPUT_DIR.
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

### Push to hub

In [1]:
# MODEL_HUB_PATH = "KarlsonAV/bert-base-uncased-tripadvisor"

# model.push_to_hub(MODEL_HUB_PATH)
# tokenizer.push_to_hub(MODEL_HUB_PATH)