In [None]:
# Notebook dependencies, installed through shell commands.
# Nothing is version-pinned, and transformers/peft are built from the GitHub
# main branch, so the resolved versions can differ from one run to the next.
# NOTE(review): nlpaug and nltk are imported but not called, and bitsandbytes/peft
# are not referenced at all, in the cells visible here — confirm they are still needed.
!pip install nlpaug nltk
!pip install -q -U bitsandbytes

!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
import torch
import gc
import os
import numpy as np
import random
import nlpaug.augmenter.word as naw
import nltk
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive (requires a Google Colab runtime).
# NOTE(review): DATA_DIR and MODELS_DIR in the config cell are relative paths, so
# they resolve against the kernel's working directory rather than this mount
# point — confirm the working directory is set accordingly.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG, PyTorch's CPU
    RNG and the RNG of every CUDA device, and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators.

    NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting
    it here presumably only affects processes spawned afterwards — confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

In [None]:
# Data
# Relative locations of the CSV splits and of the directory where the
# fine-tuned models (and the final report) are written.
DATA_DIR = "../Dataset"
MODELS_DIR = "../Modelli_BERT"

# Model
# Checkpoint name handed to from_pretrained() and the training hyper-parameters
# forwarded to TrainingArguments in the training loop.
MODEL = 'bert-large-uncased'
EPOCHS = 10
BATCH_SIZE = 8
LEARNING_RATE = 2e-5

# Reproducibility
# Seed the Python, NumPy and PyTorch RNGs once, before data or models are loaded.
SEED = 42
set_seed(SEED)

In [None]:
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(f"Device selected: {device}")

In [None]:
# Load dataset
# Both splits are read from DATA_DIR, train first. Note that `df_test` is
# populated from valid.csv, i.e. the "test" frame used later is the validation split.
df_train, df_test = (
    pd.read_csv(f"{DATA_DIR}/{split_file}")
    for split_file in ("train.csv", "valid.csv")
)


In [None]:
def tokenize_function(examples, max_length=128):
    """Tokenize the ``"text"`` field of a batch with the notebook-level ``tokenizer``.

    Intended as the callable passed to ``Dataset.map(..., batched=True)``. The
    tokenizer is looked up as a global at call time, so whichever tokenizer is
    currently bound to ``tokenizer`` is the one used.

    Args:
        examples: Mapping with a ``"text"`` entry holding the texts to encode.
        max_length: Length every sequence is truncated and padded to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [None]:
# Discard rows missing the text, the label or any of the three grouping keys;
# the same column list is applied to both splits.
# NOTE(review): if 'label' contained missing values, pandas may have read it as
# float — confirm the dtype is still integer after this step.
required_columns = ['text', 'label', 'variety', 'source', 'task']
df_train = df_train.dropna(subset=required_columns)
df_test = df_test.dropna(subset=required_columns)

### Fine-tuning the models

In [None]:
# Tokenizer shared by every training run; tokenize_function reads it as a global.
# No model is loaded in this cell: the training loop and the evaluation loop each
# build their own model before using it, so loading bert-large here would only
# spend download time and memory on an object that is overwritten unused.
tokenizer = BertTokenizer.from_pretrained(MODEL)

In [None]:
# NOTE(review): the training loop below never appends to this list, and it is
# re-initialised in the "Test the models" section before the evaluation loop fills it.
report_data = []

In [None]:
# TRAINING CYCLE
# Fine-tune and save one model per (variety, source, task) combination found in
# the training split. Each model is written to MODELS_DIR/<run_id>.
grouped_train = df_train.groupby(['variety', 'source', 'task'])

# Number of output classes, taken from the whole training split so every run
# gets the same head size. It does not depend on the group, so compute it once.
num_labels = df_train['label'].nunique()

print(f"Start training on {len(grouped_train)} combinations...")

for (variety, source, task), df_group in grouped_train:
    run_id = f"{variety}_{source}_{task}".replace(" ", "_")
    save_path = os.path.join(MODELS_DIR, run_id)

    print(f"\nTraining combination: {run_id} (Samples: {len(df_group)})")

    # Dataset setup
    train_ds = Dataset.from_pandas(df_group.reset_index(drop=True))
    tokenized_train = train_ds.map(tokenize_function, batched=True)

    # Model setup
    # Re-seed at the start of every run so a combination's result does not
    # depend on which runs were executed before it.
    set_seed(SEED)
    model = BertForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels)

    # Setup trainer
    training_args = TrainingArguments(
        output_dir=f"./checkpoints_temp/{run_id}",
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        optim="adamw_torch",
        save_strategy="no",
        eval_strategy="no",  # no validation
        report_to="none",
        seed=SEED,  # keep the Trainer's own seeding tied to the notebook config
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train
    )

    trainer.train()

    print(f"Saving in: {save_path}")
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    # Memory cleaning: drop the references, run the garbage collector, and only
    # then release cached CUDA blocks, so memory freed by the collection is
    # returned as well.
    del model, trainer, tokenized_train, train_ds
    gc.collect()
    torch.cuda.empty_cache()

print("\nTraining completed")

### Test the models

In [None]:
# One dict per evaluated (variety, source, task) combination; filled by the
# evaluation loop and exported to CSV at the end.
report_data = []

In [None]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy / F1 / precision / recall.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        Precision, recall and F1 use binary averaging with class 1 as positive.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    prf = precision_recall_fscore_support(gold, predicted, average="binary", pos_label=1)
    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [None]:
# EVALUATION CYCLE
# For every (variety, source, task) combination in the evaluation split, load the
# model saved under MODELS_DIR/<run_id>, score it, and collect one report row.
grouped_val = df_test.groupby(['variety', 'source', 'task'])

print(f"Start validation of {len(grouped_val)} combinations...")

# The inference settings are identical for every combination, so build them once.
args = TrainingArguments(
    output_dir="tmp",
    report_to="none",
    logging_strategy="no"
)

for (variety, source, task), df_group in grouped_val:
    run_id = f"{variety}_{source}_{task}".replace(" ", "_")
    model_path = os.path.join(MODELS_DIR, run_id)

    print(f"\nTesting combination: {run_id} (Samples: {len(df_group)})")

    # Loading the right model; combinations without a saved model are recorded
    # in the report and skipped.
    if not os.path.exists(model_path):
        print(f"Model not found {model_path}")
        report_data.append({
            "variety": variety, "source": source, "task": task,
            "status": "Model Missing"
        })
        continue

    val_ds = Dataset.from_pandas(df_group.reset_index(drop=True))

    # tokenize_function reads the global `tokenizer`, so rebind it to the one
    # saved alongside this model before mapping.
    tokenizer = BertTokenizer.from_pretrained(model_path)
    tokenized_val = val_ds.map(tokenize_function, batched=True)

    model = BertForSequenceClassification.from_pretrained(model_path)

    trainer = Trainer(
        model=model,
        args=args,
        compute_metrics=compute_metrics
    )

    results = trainer.predict(tokenized_val)
    metrics = results.metrics

    print(f"Accuracy: {metrics['test_accuracy']:.4%} | F1: {metrics['test_f1']:.4f}")

    # Saving results report
    report_data.append({
        "variety": variety,
        "source": source,
        "task": task,
        "status": "Success",
        "accuracy": metrics['test_accuracy'],
        "f1": metrics['test_f1'],
        "precision": metrics['test_precision'],
        "recall": metrics['test_recall'],
        "num_samples": len(df_group)
    })

    # Cleaning, in the same order as the training loop: drop references, collect
    # garbage, then release cached CUDA blocks.
    del model, trainer, tokenized_val, val_ds
    gc.collect()
    torch.cuda.empty_cache()

# Export final report
# Create the target directory first: if no model was found, MODELS_DIR may not
# exist yet and to_csv would fail, losing the "Model Missing" rows.
os.makedirs(MODELS_DIR, exist_ok=True)
report_path = os.path.join(MODELS_DIR, "report_performance_baseline.csv")
df_report = pd.DataFrame(report_data)
df_report.to_csv(report_path, index=False)
print("\nTesting completed")