### Import the required libraries

In [None]:
import os
import sys
import torch
import numpy as np
import logging

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, load_metric

# Set the root logger's threshold to INFO.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

### Load model and tokenizer

In [None]:
# Checkpoints to fine-tune; each is loaded with a 2-class classification head.
# NOTE(review): "bert-small-uncased" does not look like a standard Hub
# checkpoint id -- confirm that it actually resolves.
MODEL_NAME = ["bert-small-uncased", "bert-base-uncased", "bert-large-uncased"]

models = {
    checkpoint: AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    for checkpoint in MODEL_NAME
}
tokenizers = {
    checkpoint: AutoTokenizer.from_pretrained(checkpoint)
    for checkpoint in MODEL_NAME
}

# Label values iterated by the later cells; each one becomes the positive
# class of its own binary task in turn.
desired_labels = list(range(1, 6))

In [None]:
def get_device() -> torch.device:
    """Return the preferred torch device: CUDA first, then Apple MPS, then CPU.

    ``torch.backends.mps`` is looked up defensively: PyTorch builds that do
    not ship the MPS backend do not expose that attribute, and on those the
    MPS check is skipped instead of raising ``AttributeError``.
    """
    if torch.cuda.is_available():
        return torch.device('cuda')

    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device('mps')

    return torch.device('cpu')


device = get_device()

# `models` is a dict keyed by checkpoint name, so iterating it directly yields
# the string keys (which have no `.to`). Iterate the values to reach the
# actual model objects.
for model in models.values():
    model.to(device)

# The device is the same for every model, so report it once.
print(f"Using device: {device}")

### Prepare data for training

In [None]:
TRAIN_DATASET_PATH = os.path.join("data", "train.csv")
VAL_DATASET_PATH = os.path.join("data", "val.csv")

# Without `split=`, load_dataset returns a DatasetDict wrapper (a mapping of
# split name -> Dataset) rather than the rows themselves: len() would then
# count splits and Trainer could not index examples. A bare `data_files` path
# is exposed under the split name "train", which is why the validation file
# is also requested as split="train".
train_dataset = load_dataset('csv', data_files=TRAIN_DATASET_PATH, split="train")
val_dataset = load_dataset('csv', data_files=VAL_DATASET_PATH, split="train")

In [None]:
def prepare_dataset_for_bert_training(dataset, model, desired):
    """Tokenize a text dataset and binarize its labels for one target value.

    Args:
        dataset: Dataset exposing a "text" column and a "label" column.
        model: Key into the module-level ``tokenizers`` dict selecting which
            tokenizer to apply.
        desired: Label value treated as the positive class; rows whose label
            equals it get 1, every other row gets 0.

    Returns:
        The dataset with "label" renamed to "labels", the tokenizer's output
        columns added, and "labels" replaced by the 0/1 target.
    """
    tokenizer = tokenizers[model]

    # Rename the 'label' column to 'labels'
    dataset = dataset.rename_column("label", "labels")

    # Tokenize the text. Pad to a fixed length rather than padding=True:
    # with batched=True the tokenizer is called once per map-batch, so
    # "longest" padding yields different lengths in different map-batches and
    # the rows can no longer be stacked into one tensor after shuffling.
    dataset = dataset.map(
        lambda examples: tokenizer(examples["text"], truncation=True, padding="max_length"),
        batched=True,
    )

    # Binarize: 1 where the label equals `desired`, 0 otherwise
    dataset = dataset.map(
        lambda examples: {"labels": [1 if label == desired else 0 for label in examples["labels"]]},
        batched=True,
    )

    return dataset

In [None]:
# Sanity check: run the preprocessing for every (checkpoint, target label)
# pair and report the size of what comes out.
for model_name in MODEL_NAME:
    for desired_label in desired_labels:
        train_dataset_tokenized = prepare_dataset_for_bert_training(
            train_dataset, model_name, desired_label
        )
        train_dataset_tokenized = train_dataset_tokenized.shuffle(seed=1697)
        val_dataset_tokenized = prepare_dataset_for_bert_training(
            val_dataset, model_name, desired_label
        )

        print(f"# Train dataset size for model {model_name} and desired label {desired_label}: {len(train_dataset_tokenized)}")
        print(f"# Validation dataset size for model {model_name} and desired label {desired_label}: {len(val_dataset_tokenized)}")

### Prepare metrics

In [None]:
# Metric objects used by compute_metrics.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package -- confirm the pinned version
# still provides it.
f1, precision, recall = (
    load_metric(metric_id) for metric_id in ("f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Compute weighted F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class for each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        Dict with the keys "f1", "precision" and "recall".
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    scores = {}
    for key, metric in (("f1", f1), ("precision", precision), ("recall", recall)):
        result = metric.compute(
            predictions=predicted_classes, references=labels, average='weighted'
        )
        scores[key] = result[key]
    return scores

### Train model

In [None]:
# Fine-tune one binary classifier per (checkpoint, target label) pair.
for model_name in MODEL_NAME:
    for desired_label in desired_labels:
        train_dataset_tokenized = prepare_dataset_for_bert_training(train_dataset, model_name, desired_label).shuffle(seed=42)
        val_dataset_tokenized = prepare_dataset_for_bert_training(val_dataset, model_name, desired_label)

        OUTPUT_DIR = os.path.join("models", f"{model_name}_desired_{desired_label}")
        os.environ["WANDB_PROJECT"] = f"{model_name}_training"

        # Start every run from the pretrained checkpoint. Reusing the single
        # instance stored in `models` would make the classifier for label k
        # continue from weights already fine-tuned on labels 1..k-1, so the
        # runs would not be independent.
        classifier = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
            num_train_epochs=5,
            learning_rate=2e-5,
            weight_decay=0.01,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            logging_steps=30,
            evaluation_strategy="steps",
            # eval_steps and save_steps are kept equal so that every
            # evaluation has a matching checkpoint for load_best_model_at_end.
            eval_steps=500,
            save_steps=500,
            save_total_limit=3,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            run_name=f"{model_name}_desired_{desired_label}",
            report_to="wandb",
        )

        trainer = Trainer(
            model=classifier,
            args=training_args,
            train_dataset=train_dataset_tokenized,
            eval_dataset=val_dataset_tokenized,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        # Save model and tokenizer
        classifier.save_pretrained(OUTPUT_DIR)
        tokenizers[model_name].save_pretrained(OUTPUT_DIR)

        # Keep `models` pointing at the most recently trained weights for
        # this checkpoint, so later cells that read `models[model_name]`
        # still see a fine-tuned model rather than the untouched one.
        models[model_name] = classifier

### Push to hub

In [1]:
MODEL_HUB_PATH = "Deptage/binaryBertTripAdvisor"
for model_name in MODEL_NAME:
    # Give every checkpoint its own repository. Pushing all of them to the
    # bare MODEL_HUB_PATH made each upload overwrite the previous one, so
    # only the last checkpoint in MODEL_NAME survived on the Hub.
    # NOTE(review): the "<base>-<checkpoint>" naming is a suggestion --
    # confirm the intended repo layout.
    repo_id = f"{MODEL_HUB_PATH}-{model_name}"
    models[model_name].push_to_hub(repo_id)
    tokenizers[model_name].push_to_hub(repo_id)