In [34]:
import pandas as pd
from transformers import RobertaTokenizer
from datasets import load_dataset
from transformers import Trainer
from transformers import RobertaForSequenceClassification
from datasets import DatasetDict, Dataset
from transformers import TrainingArguments, Trainer
import numpy as np
from datasets import load_metric
from sklearn.metrics import confusion_matrix

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook-level ``tokenizer``.

    Args:
        examples: A mapping with a ``"text"`` key (a batch of examples when
            used with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever ``tokenizer`` returns for those texts, called with
        truncation enabled and ``padding="max_length"``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


In [3]:
# Load the pretrained tokenizer for the "roberta-base" checkpoint.
tokenizer = RobertaTokenizer.from_pretrained(
    "roberta-base",
)

#### Reading data

In [6]:
# Load the pickled pandas DataFrame as a single "train" split.
# NOTE(review): the input is a pickle file; only load pickles from a trusted source.
dataset = load_dataset('pandas', data_files="data4authors.pkl", split="train")
# Drop the '__index_level_0__' column (presumably the serialized DataFrame index — confirm).
# `remove_columns` returns a new dataset; the in-place `remove_columns_` is deprecated
# and unavailable in newer `datasets` releases.
dataset = dataset.remove_columns(['__index_level_0__'])

Using custom data configuration default-2a80170608bb83ad
Reusing dataset pandas (/scratch/lustre/home/maga6272/.cache/huggingface/datasets/pandas/default-2a80170608bb83ad/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade)
  dataset.remove_columns_(column_names = ['__index_level_0__'])


In [8]:
# Number of distinct values found in the "label" column.
num_labels = len({*dataset["label"]})
print("Number of classes:", num_labels)

Number of classes: 4


#### Train - 90%, Validation - 5%, Test - 5%

In [9]:
# Shuffle once with a fixed seed before splitting.
dataset = dataset.shuffle(seed=404)
# Hold out 10% of the rows; the split is seeded so the partition is reproducible
# after a kernel restart (train_test_split shuffles again by default).
train_testvalid = dataset.train_test_split(test_size=0.1, seed=404)
# Halve the held-out 10% into the validation and test sets (about 5% each).
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=404)
all_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
all_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2691
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 150
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 149
    })
})

#### Tokenizing train, validation, and test datasets

In [10]:
# Run tokenize_function over every split of all_dataset, one batch of examples per call.
tokenized_datasets = all_dataset.map(
    tokenize_function,
    batched=True,
)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

#### Training model

In [12]:
# Start from the "roberta-base" checkpoint with a classification head of num_labels outputs.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=num_labels,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [13]:
# Only the output directory is set; every other training option keeps the library default.
training_args = TrainingArguments(output_dir='./results')

# Wire the model, tokenizer, and the tokenized train/validation splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)

In [14]:
# Fine-tune the model on the training split; the returned training summary
# (step count, loss, runtime metrics) is displayed as the cell's output.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 2691
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1011


Step,Training Loss
500,0.5923
1000,0.1817


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1011, training_loss=0.38497441458772835, metrics={'train_runtime': 3912.1298, 'train_samples_per_second': 2.064, 'train_steps_per_second': 0.258, 'total_flos': 2124133692715008.0, 'train_loss': 0.38497441458772835, 'epoch': 3.0})

#### Saving model

In [15]:
# Write the fine-tuned model into the "trained_on_4_authors_model" directory.
trainer.save_model("trained_on_4_authors_model")

Saving model checkpoint to trained_on_4_authors_model
Configuration saved in trained_on_4_authors_model/config.json
Model weights saved in trained_on_4_authors_model/pytorch_model.bin
tokenizer config file saved in trained_on_4_authors_model/tokenizer_config.json
Special tokens file saved in trained_on_4_authors_model/special_tokens_map.json


#### Prediction and metrics

In [25]:
# Run inference on the held-out test split.
predictions = trainer.predict(tokenized_datasets["test"])
# Show the shape of the raw model outputs next to the shape of the reference labels.
print(*(arr.shape for arr in (predictions.predictions, predictions.label_ids)))

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 150
  Batch size = 8


(150, 4) (150,)


In [40]:
# Reference labels of the test split.
y_true = tokenized_datasets["test"]["label"]
# Predicted class = index of the largest score along the last axis.
y_pred = predictions.predictions.argmax(axis=-1)
# Passed as (y_true, y_pred).
confusion_matrix(y_true, y_pred)

array([[46,  1,  2,  3],
       [ 0, 16,  0,  2],
       [ 0,  0, 10,  1],
       [ 0,  1,  0, 68]])

In [44]:
# Load the "accuracy" metric and score the predicted classes against the references.
metric = load_metric("accuracy")
metric.compute(
    references=y_true,
    predictions=y_pred,
)

{'accuracy': 0.9333333333333333}