In [1]:
import numpy as np
import transformers
# transformers.logging.set_verbosity_error()
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, Dataset, load_metric
from sklearn.metrics import confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

def read_and_tokenize_data(filename):
    d = load_dataset('pandas', data_files=filename, split='train')
    d.remove_columns_(column_names = ['__index_level_0__'])
    d = d.map(tokenize_function, batched=True)
    return d

#### Reading data

In [3]:
train = read_and_tokenize_data("data/authors_train.pkl")
validation = read_and_tokenize_data("data/authors_validation.pkl")
test = read_and_tokenize_data("data/authors_test.pkl")

Using custom data configuration default-b6d93bc900946bdf
Reusing dataset pandas (/scratch/lustre/home/vaga6213/.cache/huggingface/datasets/pandas/default-b6d93bc900946bdf/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade)
  d.remove_columns_(column_names = ['__index_level_0__'])
Loading cached processed dataset at /scratch/lustre/home/vaga6213/.cache/huggingface/datasets/pandas/default-b6d93bc900946bdf/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade/cache-13d44f20f0f0ad5d.arrow
Using custom data configuration default-f6ca686ffdd3f89e
Reusing dataset pandas (/scratch/lustre/home/vaga6213/.cache/huggingface/datasets/pandas/default-f6ca686ffdd3f89e/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade)


  0%|          | 0/1 [00:00<?, ?ba/s]

Using custom data configuration default-437f1f4ed24749a3
Reusing dataset pandas (/scratch/lustre/home/vaga6213/.cache/huggingface/datasets/pandas/default-437f1f4ed24749a3/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade)


  0%|          | 0/1 [00:00<?, ?ba/s]

#### Training model

In [4]:
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels = 4)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [5]:
trainer = Trainer(
    model=model,
    train_dataset=train,
    eval_dataset=validation,
    tokenizer=tokenizer
)

In [6]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 2691
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1011


Step,Training Loss
500,0.9476
1000,0.5858


Saving model checkpoint to tmp_trainer/checkpoint-500
Configuration saved in tmp_trainer/checkpoint-500/config.json
Model weights saved in tmp_trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in tmp_trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in tmp_trainer/checkpoint-500/special_tokens_map.json
Saving model checkpoint to tmp_trainer/checkpoint-1000
Configuration saved in tmp_trainer/checkpoint-1000/config.json
Model weights saved in tmp_trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in tmp_trainer/checkpoint-1000/tokenizer_config.json
Special tokens file saved in tmp_trainer/checkpoint-1000/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1011, training_loss=0.7628828476963364, metrics={'train_runtime': 280.4773, 'train_samples_per_second': 28.783, 'train_steps_per_second': 3.605, 'total_flos': 2124133692715008.0, 'train_loss': 0.7628828476963364, 'epoch': 3.0})

#### Prediction and metrics

In [7]:
predictions = trainer.predict(test)
y_true = test["label"]
y_pred = np.argmax(predictions.predictions, axis=-1)
confusion_matrix(y_true, y_pred)

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 149
  Batch size = 8


array([[46,  1,  1,  0],
       [ 1,  9,  1, 10],
       [ 1,  4,  1,  0],
       [ 2,  3,  1, 68]])

In [8]:
metric = load_metric("accuracy")
metric.compute(predictions=y_pred, references=y_true)

{'accuracy': 0.8322147651006712}

#### Saving model

In [9]:
trainer.save_model("models/author_classification/model")
tokenizer.save_pretrained("models/author_classification/tokenizer")

Saving model checkpoint to models/author_classification/model
Configuration saved in models/author_classification/model/config.json
Model weights saved in models/author_classification/model/pytorch_model.bin
tokenizer config file saved in models/author_classification/model/tokenizer_config.json
Special tokens file saved in models/author_classification/model/special_tokens_map.json
tokenizer config file saved in models/author_classification/tokenizer/tokenizer_config.json
Special tokens file saved in models/author_classification/tokenizer/special_tokens_map.json


('models/author_classification/tokenizer/tokenizer_config.json',
 'models/author_classification/tokenizer/special_tokens_map.json',
 'models/author_classification/tokenizer/vocab.json',
 'models/author_classification/tokenizer/merges.txt',
 'models/author_classification/tokenizer/added_tokens.json')

#### Loading this model

In [10]:
model = AutoModelForSequenceClassification.from_pretrained("models/author_classification/model", num_labels = 4)
tokenizer = AutoTokenizer.from_pretrained("models/author_classification/tokenizer")
trainer = Trainer(model = model)
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

loading configuration file models/author_classification/model/config.json
Model config RobertaConfig {
  "_name_or_path": "models/author_classification/model",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.15.0",
  "type_vocab_size"

In [11]:
test_data = read_and_tokenize_data("data/authors_test.pkl")
predictions = trainer.predict(test_data)
y_pred = np.argmax(predictions.predictions, axis=-1)
confusion_matrix(test_data["label"], y_pred)

Using custom data configuration default-437f1f4ed24749a3
Reusing dataset pandas (/scratch/lustre/home/vaga6213/.cache/huggingface/datasets/pandas/default-437f1f4ed24749a3/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade)


  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 149
  Batch size = 8


array([[46,  1,  1,  0],
       [ 1,  9,  1, 10],
       [ 1,  4,  1,  0],
       [ 2,  3,  1, 68]])

In [12]:
!zip -r models.zip modelforNLP/model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/bin/bash: zip: command not found
