#BERT and RoBERTa training and testing

# Imports

In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments,TrainerCallback,AutoModel,AutoModelForSequenceClassification,BertConfig as BertConfig
import torch
import wandb
import evaluate
import gc

accuracy_metric = evaluate.load('accuracy')
precision_metric = evaluate.load('precision')
recall_metric = evaluate.load('recall')

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

# Dataset


In [6]:
def import_paths_and_nlabels(dataset_name,model_name,paragraph_selection_strategy):
    if dataset_name not in ['asylex-outcome','asylex-norp','sentiment1','sentiment2']:
        raise ValueError('dataset name not found')
    if model_name not in ['bert','roberta']:
        raise ValueError('model name not found')
    if paragraph_selection_strategy not in ['first','last','rand','cas','']:
        raise ValueError('long_text technique not found')

    base_dataset_path = 'datasets/' + model_name + '/'
    base_model_path = 'Models/'  + model_name + '/'

    dataset_filename = {'asylex-norp':'norp_','asylex-outcome':'outcome_','sentiment1':'sentiment1','sentiment2':'sentiment2'}
    model_filename = {'bert' : 'BERT512-', 'roberta' : 'RoBERTa512-'}
    model_filename2 = {'asylex-norp':'norp_','asylex-outcome':'out_','sentiment1':'sentiment1','sentiment2':'sentiment2'}

    if dataset_name == 'asylex-norp':
        num_labels = 7
    else:
        num_labels = 2

    dataset_path = base_dataset_path+ dataset_name+ '/' + dataset_filename[dataset_name] + paragraph_selection_strategy + '_'
    model_path = base_model_path + dataset_name + '/' + model_filename[model_name] + model_filename2[dataset_name] +paragraph_selection_strategy
    return dataset_path, model_path, num_labels

In [7]:
dataset_name = 'asylex-outcome'
model_name = 'bert'
paragraph_selection_strategy = 'first'

dataset_path, trained_model_path, n_labels  = import_paths_and_nlabels(dataset_name,model_name,paragraph_selection_strategy)
train_set = load_from_disk(dataset_path+'train_set')
test_set = load_from_disk(dataset_path+'test_set')
validation_set = load_from_disk(dataset_path+'validation_set')

# Training

In [None]:
tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
model = AutoModelForSequenceClassification.from_pretrained(trained_model_path, num_labels=n_labels)

In [8]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=n_labels)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [None]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=n_labels)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.49.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file vocab.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b/vocab.json
loading file

In [9]:
my_lr = 5e-6
my_epochs = 1

In [None]:
wandb.init(
    project="Testing pc version",
    name= dataset_name+'_'+model_name+'_'+paragraph_selection_strategy,
    config={
        "learining_rate":my_lr,
        "batch_size":16,
        "epochs":my_epochs,
        "note":""
    }
)

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels)
    recall = recall_metric.compute(predictions=predictions, references=labels)
    results = {**accuracy,**precision,**recall}
    return results

training_args = TrainingArguments(
    output_dir="./results",          # Cartella per salvare i modelli
    eval_strategy="epoch",    # Valutare dopo ogni epoca
    learning_rate=my_lr,             # Learning rate
    per_device_train_batch_size=4, # Dimensione batch per training
    per_device_eval_batch_size=4,  # Dimensione batch per valutazione
    num_train_epochs=my_epochs,             # Numero di epoche
    weight_decay=0.01,              # Regularizzazione
    logging_dir="./logs",           # Cartella per i log
    #logging_steps=3,
    logging_strategy="epoch",
    logging_first_step = True,
    log_level = "info",
    report_to = "wandb",
    save_strategy="epoch",
    save_safetensors= False# Salvataggio del modello dopo ogni epoca
)

# Definire il trainer

trainer = Trainer(
    model=model,                         # Modello BERT
    args=training_args,                  # Argomenti di training
    train_dataset=train_set,         # Dataset di training  #CHANGE IT
    eval_dataset=validation_set,          # Dataset di valutazione
    compute_metrics = compute_metrics
)

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices


In [None]:
trainer.train()

In [None]:
wandb.finish()

# Testing

In [None]:
trainer.evaluate(eval_dataset=test_set)

The following columns in the Evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: decision_outcome, complete_input_ids, first_sentence, all_sentences, id, info, tokenized_determinations, text. If decision_outcome, complete_input_ids, first_sentence, all_sentences, id, info, tokenized_determinations, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 2260
  Batch size = 16


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


[0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0]
[[ 2.394293   -2.0822196 ]
 [ 2.922452   -2.5275872 ]
 [ 2.8120449  -2.4954085 ]
 [ 2.9993873  -2.6536186 ]
 [ 2.3034241  -1.6276193 ]
 [ 2.0530868  -1.6227229 ]
 [ 2.446073   -1.8333882 ]
 [ 2.7447019  -2.4370067 ]
 [ 1.7351594  -1.205076  ]
 [ 2.745955   -2.4742198 ]
 [ 1.0995097  -0.6347982 ]
 [ 1.5846359  -1.1870792 ]
 [-1.9587151   1.1466919 ]
 [ 2.6337495  -2.217937  ]
 [-0.0993906  -0.37179032]
 [ 1.4243127  -1.0530494 ]
 [ 3.0001009  -2.6378834 ]
 [ 2.681757   -2.3700876 ]
 [-1.8172855   0.92359275]
 [ 2.2436287  -1.6903404 ]]
[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0]
0.7775891341256367


{'eval_loss': 0.2977994978427887,
 'eval_model_preparation_time': 0.0032,
 'eval_accuracy': 0.8743362831858407,
 'eval_precision': 0.7775891341256367,
 'eval_recall': 0.7495908346972177,
 'eval_runtime': 69.9178,
 'eval_samples_per_second': 32.324,
 'eval_steps_per_second': 2.031}

In [None]:
save_path = "drive/MyDrive/magistrale/Models/roberta/asylex-outcome/RoBERTa512-out_cas"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

Configuration saved in drive/MyDrive/magistrale/Models/roberta/asylex-outcome/RoBERTa512-out_cas/config.json
Model weights saved in drive/MyDrive/magistrale/Models/roberta/asylex-outcome/RoBERTa512-out_cas/model.safetensors
tokenizer config file saved in drive/MyDrive/magistrale/Models/roberta/asylex-outcome/RoBERTa512-out_cas/tokenizer_config.json
Special tokens file saved in drive/MyDrive/magistrale/Models/roberta/asylex-outcome/RoBERTa512-out_cas/special_tokens_map.json


('drive/MyDrive/magistrale/Models/roberta/asylex-outcome/RoBERTa512-out_cas/tokenizer_config.json',
 'drive/MyDrive/magistrale/Models/roberta/asylex-outcome/RoBERTa512-out_cas/special_tokens_map.json',
 'drive/MyDrive/magistrale/Models/roberta/asylex-outcome/RoBERTa512-out_cas/vocab.json',
 'drive/MyDrive/magistrale/Models/roberta/asylex-outcome/RoBERTa512-out_cas/merges.txt',
 'drive/MyDrive/magistrale/Models/roberta/asylex-outcome/RoBERTa512-out_cas/added_tokens.json',
 'drive/MyDrive/magistrale/Models/roberta/asylex-outcome/RoBERTa512-out_cas/tokenizer.json')

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='macro')
    recall = recall_metric.compute(predictions=predictions, references=labels,average='macro')
    results = {**accuracy,**precision,**recall}
    return results

def evaluate_combination(dataset_name,model_name,paragraph_selection_strategy):
    dataset_path, trained_model_path, n_labels  = import_paths_and_nlabels(dataset_name,model_name,paragraph_selection_strategy)
    train_set = load_from_disk(dataset_path+'train_set')
    test_set = load_from_disk(dataset_path+'test_set')
    validation_set = load_from_disk(dataset_path+'validation_set')

    tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
    model = AutoModelForSequenceClassification.from_pretrained(trained_model_path, num_labels=n_labels)

    my_lr = 5e-6
    my_epochs = 3

    wandb.init(
        project="Metrics on test set evaluation3",
        name= dataset_name+'_'+model_name+'_'+paragraph_selection_strategy,
        config={
            "learining_rate":my_lr,
            "batch_size":16,
            "epochs":my_epochs,
            "note":""
        }
    )

    training_args = TrainingArguments(
        output_dir="./results",          # Cartella per salvare i modelli
        eval_strategy="epoch",    # Valutare dopo ogni epoca
        learning_rate=my_lr,             # Learning rate
        per_device_train_batch_size=16, # Dimensione batch per training
        per_device_eval_batch_size=16,  # Dimensione batch per valutazione
        num_train_epochs=my_epochs,             # Numero di epoche
        weight_decay=0.01,              # Regularizzazione
        logging_dir="./logs",           # Cartella per i log
        #logging_steps=3,
        logging_strategy="epoch",
        logging_first_step = True,
        log_level = "info",
        report_to = "wandb",
        save_strategy="epoch",
        save_safetensors= False# Salvataggio del modello dopo ogni epoca
    )
    trainer = Trainer(
        model=model,                         # Modello BERT
        args=training_args,                  # Argomenti di training
        train_dataset=train_set,         # Dataset di training  #CHANGE IT
        eval_dataset=validation_set,          # Dataset di valutazione
        compute_metrics = compute_metrics
    )
    trainer.evaluate(eval_dataset=test_set)
    wandb.finish()

def new_metrics_creator():
    for dataset_name in ['asylex-norp','asylex-outcome','sentiment1','sentiment2']:
        for model_name in ['bert','roberta']:
            if dataset_name == 'asylex-outcome' or dataset_name == 'asylex-norp':
                for paragraph_selection_strategy in ['first','last','cas','rand']:
                    print(dataset_name + '_' + model_name +'_'+paragraph_selection_strategy)
                    evaluate_combination(dataset_name,model_name,paragraph_selection_strategy)
            else:
                paragraph_selection_strategy = ''
                print(dataset_name + '_' + model_name +'_'+paragraph_selection_strategy)
                evaluate_combination(dataset_name,model_name,paragraph_selection_strategy)

new_metrics_creator()