In [4]:
import transformers
from transformers import TrainingArguments
# transformers.logging.set_verbosity_error()
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer
from sklearn.metrics import confusion_matrix
from datasets import DatasetDict, load_dataset, load_metric
import numpy as np

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

def read_and_tokenize_data(filename):
    d = load_dataset('pandas', data_files=filename, split='train')
    d.remove_columns_(column_names = ['__index_level_0__'])
    d = d.map(tokenize_function, batched=True)
    return d

loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at /scratch/lustre/home/vaga6213/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at /scratch/lustre/home/vaga6213/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer.json from cache at /scratch/lustre/home/vaga6213/.cache/hugging

#### Reading data

In [6]:
train = read_and_tokenize_data("data/imdb_train.pkl")
validation = read_and_tokenize_data("data/imdb_validation.pkl")
test = read_and_tokenize_data("data/imdb_test.pkl")



  0%|          | 0/3 [00:00<?, ?ba/s]



  0%|          | 0/1 [00:00<?, ?ba/s]



  0%|          | 0/1 [00:00<?, ?ba/s]

#### Training

In [7]:
args = TrainingArguments(evaluation_strategy = "steps", eval_steps = 50, output_dir="./results")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels = 2)

trainer = Trainer(
    args= args,
    model=model,
    train_dataset=train,
    eval_dataset=validation,
    tokenizer=tokenizer
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /scratch/lustre/home/vaga6213/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta

Step,Training Loss,Validation Loss
50,No log,0.542691
100,No log,0.35052
150,No log,0.309033
200,No log,0.28724
250,No log,0.506796
300,No log,0.34611
350,No log,0.382917
400,No log,0.337368
450,No log,0.467879
500,0.382600,0.384786


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSeque

TrainOutput(global_step=1014, training_loss=0.2737807894246818, metrics={'train_runtime': 311.0483, 'train_samples_per_second': 26.041, 'train_steps_per_second': 3.26, 'total_flos': 2131199548416000.0, 'train_loss': 0.2737807894246818, 'epoch': 3.0})

#### Saving model

In [10]:
trainer.save_model("models/sentiment_classification/model")
tokenizer.save_pretrained("models/sentiment_classification/tokenizer")

Saving model checkpoint to models/sentiment_classification/model
Configuration saved in models/sentiment_classification/model/config.json
Model weights saved in models/sentiment_classification/model/pytorch_model.bin
tokenizer config file saved in models/sentiment_classification/model/tokenizer_config.json
Special tokens file saved in models/sentiment_classification/model/special_tokens_map.json
tokenizer config file saved in models/sentiment_classification/tokenizer/tokenizer_config.json
Special tokens file saved in models/sentiment_classification/tokenizer/special_tokens_map.json


('models/sentiment_classification/tokenizer/tokenizer_config.json',
 'models/sentiment_classification/tokenizer/special_tokens_map.json',
 'models/sentiment_classification/tokenizer/vocab.json',
 'models/sentiment_classification/tokenizer/merges.txt',
 'models/sentiment_classification/tokenizer/added_tokens.json')

#### Metrics

In [11]:
predictions = trainer.predict(test)
y_true = test["label"]
y_pred = np.argmax(predictions.predictions, axis=-1)
confusion_matrix(y_true, y_pred)

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 150
  Batch size = 8


array([[70,  7],
       [ 6, 67]])

In [12]:
metric = load_metric("accuracy")
metric.compute(predictions=y_pred, references=y_true)

{'accuracy': 0.9133333333333333}

#### Loading this model

In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [14]:
model = AutoModelForSequenceClassification.from_pretrained("models/sentiment_classification/model", num_labels = 2)
tokenizer = AutoTokenizer.from_pretrained("models/sentiment_classification/tokenizer")
trainer = Trainer(model = model)
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

loading configuration file models/sentiment_classification/model/config.json
Model config RobertaConfig {
  "_name_or_path": "models/sentiment_classification/model",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.15.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file models/sentiment_classification/model/pytorch_model.bin
All model checkpoint weights were used when initializing R

In [15]:
test_data = read_and_tokenize_data("data/imdb_test.pkl")
predictions = trainer.predict(test_data)
y_pred = np.argmax(predictions.predictions, axis=-1)
confusion_matrix(test_data["label"], y_pred)



  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 150
  Batch size = 8


array([[70,  7],
       [ 6, 67]])