In [1]:
!pip install transformers
!pip install accelerate -U
!pip install datasets
!pip install torch-summary
!pip install graphviz
!pip install torchview
!pip install contractions
!pip install pyspellchecker
!pip install sigopt

!mkdir ./checkpoints
!mkdir ./datasets

import sys, os

user = "HLT-Ghisolfi-Leuzzi-Testa"
repo = "WASSA-2023"
branch = "irene"

if os.path.isdir(repo):
  !rm -rf {repo}

!git clone -b {branch} https://github.com/{user}/{repo}

sys.path.insert(1, repo)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m116.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torchsummary import summary
from torchview import draw_graph
import graphviz
graphviz.set_jupyter_format('png')
from torch.optim import AdamW
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
import json
from utils import EMODataset, EmotionsLabelEncoder, compute_metrics

In [7]:
TRAIN_DATA = f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/datasets/WASSA23_essay_level_train_preproc.tsv"
VAL_DATA = f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/datasets/WASSA23_essay_level_val_preproc.tsv"
DEV_DATA = f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/datasets/WASSA23_essay_level_dev_preproc.tsv"

train_df = pd.read_csv(TRAIN_DATA, sep='\t')
val_df = pd.read_csv(VAL_DATA, sep='\t')
dev_df = pd.read_csv(DEV_DATA, sep='\t')

TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
OPT_STEP_SIZE = 1
OPT_GAMMA = 0.9
EPOCHS = 1
RANDOM_STATE = 42
MODEL_NAME = 'bert-base-cased'
NUM_LABELS = 8

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, truncation=True)

LabelEncoder = EmotionsLabelEncoder()
LabelEncoder.fit(train_df.emotion)
y_train = LabelEncoder.encode(train_df.emotion)
y_val = LabelEncoder.encode(val_df.emotion)
y_dev = LabelEncoder.encode(dev_df.emotion)

############################ SUB-SAMPLE ############################
train_df = train_df[:20]
val_df = val_df[:10]
dev_df = dev_df[:10]

training_set = EMODataset(tokenizer=tokenizer, essay=train_df.essay, targets=y_train)
val_set = EMODataset(tokenizer=tokenizer, essay=val_df.essay, targets=y_val)
dev_set = EMODataset(tokenizer=tokenizer, essay=dev_df.essay, targets=y_dev)

def model_init():
  return BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS, problem_type="multi_label_classification")

# set CUDA if available

if torch.cuda.is_available():
    device = torch.device('cuda')
    print("======= CUDA Available =======")
else:
    device = torch.device('cpu')
    print("======= CUDA NOT Available, run on CPU =======")
model.to(device)



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [8]:
# al trainer si possono passar modelli customized?
# https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f
# https://www.tensorflow.org/text/tutorials/classify_text_with_bert

# LOSS CURVE TODO
# trainer.state.log_history
#plot_loss_curve(training_loss, validatin_loss, path, title)


from transformers import TrainingArguments, Trainer
from transformers import TrainerCallback, EarlyStoppingCallback
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import datasets

# --- TODO: come farlo meglio? ---
def preprocess_function(examples):
  return tokenizer(examples['text'], truncation=True)
# --------------------------------

train_arguments = TrainingArguments(
    output_dir="./",
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    num_train_epochs=EPOCHS,
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch", # saves a checkpoint at the end of each epoch
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    load_best_model_at_end=True,
    seed=RANDOM_STATE
) # optim and many others


trainer = Trainer(
    model_init=model_init,
    args=train_arguments,
    train_dataset=training_set,
    eval_dataset=val_set,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

class LoggingCallback(TrainerCallback):
    def __init__(self, log_path):
        self.log_path = log_path

    def on_log(self, args, state, control, logs=None, **kwargs):
        _ = logs.pop("total_flos", None)
        if state.is_local_process_zero: # whether this process is the main one in a distributed setting
            with open(self.log_path, "a") as f:
                f.write(json.dumps(logs) + "\n")

trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=1, early_stopping_threshold=0.0))
trainer.add_callback(LoggingCallback("log.jsonl"))

trainer.train()
# results = trainer.evaluate()     # just gets evaluation metrics
results = trainer.predict(dev_set) # also gives you predictions

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initi

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.70031,0.377358,0.660735,0.0


In [12]:
trainer.state.log_history

[{'eval_loss': 0.7003101110458374,
  'eval_f1': 0.37735849056603776,
  'eval_roc_auc': 0.6607347876004592,
  'eval_accuracy': 0.0,
  'eval_runtime': 0.3795,
  'eval_samples_per_second': 26.348,
  'eval_steps_per_second': 7.904,
  'epoch': 1.0,
  'step': 5},
 {'train_runtime': 13.8411,
  'train_samples_per_second': 1.445,
  'train_steps_per_second': 0.361,
  'total_flos': 5262504591360.0,
  'train_loss': 0.7483242988586426,
  'epoch': 1.0,
  'step': 5}]

In [15]:
# LOSS CURVE TODO
# trainer.state.log_history
#plot_loss_curve(training_loss, validatin_loss, path, title)
import matplotlib.pyplot as plt

# Extract loss values
training_loss = [entry['train_loss'] for entry in trainer.state.log_history if 'train_loss' in entry]
validatin_loss = [entry['eval_loss'] for entry in trainer.state.log_history if 'train_loss' in entry]
path=""
title=""

plot_loss_curve(training_loss, validatin_loss, path, title)


In [10]:
# ---- GRID SEARCH ----
# https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb
# https://huggingface.co/docs/transformers/hpo_train

def sigopt_hp_space(trial):
    return [
        {"bounds": {"min": 1e-6, "max": 1e-4}, "name": "learning_rate", "type": "double"},
        {
            "categorical_values": ["16", "32"],
            "name": "per_device_train_batch_size",
            "type": "categorical",
        },
        {
            "categorical_values": ["16"],
            "name": "per_device_eval_batch_size",
            "type": "categorical",
        },
        {
            "categorical_values": ["1"],
            "name": "num_train_epochs",
            "type": "categorical",
        },
        {
            "categorical_values": ["epoch"],
            "name": "evaluation_strategy",
            "type": "categorical",
        },
        {
            "categorical_values": ["epoch"],
            "name": "save_strategy",
            "type": "categorical",
        },
        {
            "categorical_values": ["2e-5"],
            "name": "learning_rate",
            "type": "double",
        },
        {
            "categorical_values": ["0.01"],
            "name": "weight_decay",
            "type": "double",
        },
        {
            "categorical_values": ["True"],
            "name": "load_best_model_at_end",
            "type": "bool",
        },
        {
            "categorical_values": ["42"],
            "name": "seed",
            "type": "INT",
        },
    ]

best_trial = trainer.hyperparameter_search(
                            direction="maximize",
                            backend="sigopt",
                            hp_space=sigopt_hp_space,
                            n_trials=1,)
# ValueError: Must provide client_token or set environment variable SIGOPT_API_TOKEN

In [None]:
from utils import write_EMO_prediction, plot_loss_curve, compute_EMO_metrics, write_dict_to_json

predictions = encoded2string(predictions, mlb) # TODO: se predice tutti 0 --> Neutral
write_EMO_prediction(predictions, "predictions_EMO.tsv")
plot_loss_curve(training_stats["Training Loss"], training_stats["Validation Loss"], "loss_curve.png")
#scores = compute_EMO_metrics(...)
#write_dict_to_json(scores, "scores.json")