In [1]:
#!pip install nlp==0.4.0
#!pip install transformers==4.3.0
#!pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio===0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
#!pip install -U scikit-learn

In [2]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import numpy as np
import pandas as pd

In [3]:
from datasets import load_dataset
from transformers import LongformerForSequenceClassification, LongformerTokenizer

In [4]:
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096', max_length = 1024)

In [5]:
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096', gradient_checkpointing=True,
                                                           attention_window = 512)


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', '

## Esempio di prova

In [6]:
#inputs = tokenizer("Hello, my dog is cute", return_tensors = "pt")
#labels = torch.tensor([1]).unsqueeze(0)

In [7]:
#outputs = model(**inputs, labels = labels)
#loss = outputs.loss
#logits = outputs.logits

In [8]:
#print("loss: {}".format(loss))
#print("logits: {}".format(logits))
#outputs

## Prova su IMDB

In [9]:
dataset = load_dataset("imdb")

Reusing dataset imdb (/home/matteo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [11]:
type(dataset)

datasets.dataset_dict.DatasetDict

In [12]:
train_set = dataset['train']
test_set = dataset['test']

In [13]:
type(train_set)

datasets.arrow_dataset.Dataset

In [14]:
train_set = train_set.train_test_split(test_size=0.01)['test']

Loading cached split indices for dataset at /home/matteo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-158394280caf312f.arrow and /home/matteo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-78f07d55be971221.arrow


In [15]:
test_set = test_set.train_test_split(test_size=0.007)['test']

In [16]:
#test_set = test_set[0:300]
train_set 

Dataset({
    features: ['text', 'label'],
    num_rows: 250
})

In [17]:
test_set

Dataset({
    features: ['text', 'label'],
    num_rows: 175
})

In [18]:
type(test_set)

datasets.arrow_dataset.Dataset

In [19]:
def preProcessing(data):
    return tokenizer(data['text'],  padding = 'max_length', truncation=True, max_length = 1024)

In [20]:
# define accuracy metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # argmax(pred.predictions, axis=1)
    #pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [21]:
train_set = train_set.map(preProcessing, batched=True, batch_size=len(train_set))
test_set = test_set.map(preProcessing, batched=True, batch_size=len(test_set))

Loading cached processed dataset at /home/matteo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3/cache-5863a2a3f43d0749.arrow


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [22]:
type(train_set)

datasets.arrow_dataset.Dataset

In [23]:
train_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [24]:
type(train_set)


datasets.arrow_dataset.Dataset

In [25]:
from transformers import Trainer, TrainingArguments

In [26]:
# define the training arguments
training_args = TrainingArguments(
    output_dir = "./output",
    num_train_epochs = 5,
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 8,
    per_device_eval_batch_size= 16,
    evaluation_strategy = "epoch",
    disable_tqdm = False,
    load_best_model_at_end=True,
    warmup_steps=200,
    weight_decay=0.01,
    #logging_steps = 4,
    fp16 = True,
    logging_dir="./logs",
    dataloader_num_workers = 0,
    run_name = 'longformer-classification'
)

In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_set,
    eval_dataset=test_set
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [28]:
trainer.train()

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=20, training_loss=0.6806199073791503, metrics={'train_runtime': 595.4618, 'train_samples_per_second': 0.034, 'total_flos': 1141716433920000, 'epoch': 5.0})

In [29]:

history_val = trainer.evaluate()

In [30]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [31]:

import json

with open('./output/checkpoint-20/trainer_state.json') as f:
  data = json.load(f)

print(data['log_history'])

[{'epoch': 1.0, 'eval_accuracy': 0.45714285714285713, 'eval_f1': 0.6274509803921569, 'eval_loss': 0.7117172479629517, 'eval_precision': 0.45714285714285713, 'eval_recall': 1.0, 'eval_runtime': 15.9262, 'eval_samples_per_second': 10.988, 'step': 4}, {'epoch': 2.0, 'eval_accuracy': 0.45714285714285713, 'eval_f1': 0.6274509803921569, 'eval_loss': 0.7119091153144836, 'eval_precision': 0.45714285714285713, 'eval_recall': 1.0, 'eval_runtime': 15.898, 'eval_samples_per_second': 11.008, 'step': 8}, {'epoch': 3.0, 'eval_accuracy': 0.45714285714285713, 'eval_f1': 0.6274509803921569, 'eval_loss': 0.7111316919326782, 'eval_precision': 0.45714285714285713, 'eval_recall': 1.0, 'eval_runtime': 16.5634, 'eval_samples_per_second': 10.565, 'step': 12}, {'epoch': 4.0, 'eval_accuracy': 0.45714285714285713, 'eval_f1': 0.6274509803921569, 'eval_loss': 0.7109026312828064, 'eval_precision': 0.45714285714285713, 'eval_recall': 1.0, 'eval_runtime': 17.4033, 'eval_samples_per_second': 10.056, 'step': 16}, {'epoc

In [32]:
data = pd.DataFrame(data=data['log_history'])

In [33]:
data_loss = data[['epoch', 'loss']]

KeyError: "['loss'] not in index"

In [None]:
data_loss.dropna(inplace=True)

In [None]:
data.drop(index=np.arange(0,data.shape[0],2), inplace=True)

In [None]:
data.drop(columns=['loss', 'learning_rate'], inplace=True)

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(12,4.5), dpi = 100)
axs[0].plot(data.epoch, data.eval_accuracy,  marker='o', linestyle='-', label = "Accuracy")
axs[0].set_title("Accuracy-longformer-4096-IMDB", size = 22)
axs[0].set_xlabel('Epoche', size = 18)
axs[0].set_ylabel('Accuracy', size = 18)
axs[0].grid()
axs[0].legend()
axs[1].plot(data.epoch, data.eval_loss,  marker='x', linestyle='-', label = 'Loss-val')
axs[1].plot(data_loss.epoch, data_loss.loss,  marker='s', linestyle='-', label = 'Loss-train')
axs[1].set_title('Loss-longformer-4096-IMDB', size = 22)
axs[1].set_xlabel('Epoche', size = 18)
axs[1].set_ylabel('Loss', size = 18)
axs[1].grid()
axs[1].legend()
fig.tight_layout()
fig.savefig('../plots/longformer-classification/IMDB/esperimento_1.png')