In [1]:
import numpy as np
import transformers
from transformers import Trainer
transformers.logging.set_verbosity_error()
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, Dataset, load_metric
from sklearn.metrics import confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

In [42]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

def read_and_tokenize_data(train,test,evaluate):
    d = load_dataset('pandas', data_files={"train":train, "test":test,"evaluate":evaluate})
    for i in d:
        d[i]= d[i].remove_columns(column_names = ['__index_level_0__'])
    d = d.map(tokenize_function, batched=True)
    return d

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at C:\Users\dovyd/.cache\huggingface\transformers\d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at C:\Users\dovyd/.cache\huggingface\transformers\cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer.json from cache at C:\Users\dovyd/.cache\huggingface\transformers\d53fc0fa09b8342651efd4073d7

#### Reading data

In [43]:
data_dict = read_and_tokenize_data("data/authors_train.pkl",\
                               "data/authors_test.pkl",\
                                   "data/authors_validation.pkl")

data_dict_2 = read_and_tokenize_data("data/imdb_train.pkl",\
                               "data/imdb_test.pkl",\
                                   "data/imdb_validation.pkl")

Using custom data configuration default-5d8442b82fe9acdc


Downloading and preparing dataset pandas/default to C:\Users\dovyd\.cache\huggingface\datasets\pandas\default-5d8442b82fe9acdc\0.0.0\6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset pandas downloaded and prepared to C:\Users\dovyd\.cache\huggingface\datasets\pandas\default-5d8442b82fe9acdc\0.0.0\6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Using custom data configuration default-d0f7ab1575e159e4


Downloading and preparing dataset pandas/default to C:\Users\dovyd\.cache\huggingface\datasets\pandas\default-d0f7ab1575e159e4\0.0.0\6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset pandas downloaded and prepared to C:\Users\dovyd\.cache\huggingface\datasets\pandas\default-d0f7ab1575e159e4\0.0.0\6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

#### Training model

In [4]:
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels = 4)

model_2 = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels = 2)

In [5]:
training_args = TrainingArguments(
    output_dir='results',
    learning_rate=2e-5,
    num_train_epochs=20,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps = 250,
    save_steps = 250,
    load_best_model_at_end = True
)


metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


trainer = Trainer(
    model=model,
    args = training_args,
    train_dataset=data_dict["train"],
    eval_dataset=data_dict["evaluate"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


trainer_2 = Trainer(
    model=model_2,
    args = training_args,
    train_dataset=data_dict_2["train"],
    eval_dataset=data_dict_2["evaluate"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [6]:
trainer_2.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 2700
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6760
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
Saving model checkpoint to results/checkpoint-250
Configuration saved in results/checkpoint-250/config.json


{'eval_loss': 0.42121168971061707, 'eval_accuracy': 0.88, 'eval_runtime': 1.4986, 'eval_samples_per_second': 100.092, 'eval_steps_per_second': 12.678, 'epoch': 0.74}


Model weights saved in results/checkpoint-250/pytorch_model.bin
tokenizer config file saved in results/checkpoint-250/tokenizer_config.json
Special tokens file saved in results/checkpoint-250/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8


{'loss': 0.3439, 'learning_rate': 1.85207100591716e-05, 'epoch': 1.48}


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json


{'eval_loss': 0.47201740741729736, 'eval_accuracy': 0.9, 'eval_runtime': 1.4311, 'eval_samples_per_second': 104.816, 'eval_steps_per_second': 13.277, 'epoch': 1.48}


Model weights saved in results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in results/checkpoint-500/tokenizer_config.json
Special tokens file saved in results/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
Saving model checkpoint to results/checkpoint-750
Configuration saved in results/checkpoint-750/config.json


{'eval_loss': 0.445279598236084, 'eval_accuracy': 0.9266666666666666, 'eval_runtime': 1.4328, 'eval_samples_per_second': 104.693, 'eval_steps_per_second': 13.261, 'epoch': 2.22}


Model weights saved in results/checkpoint-750/pytorch_model.bin
tokenizer config file saved in results/checkpoint-750/tokenizer_config.json
Special tokens file saved in results/checkpoint-750/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8


{'loss': 0.1618, 'learning_rate': 1.70414201183432e-05, 'epoch': 2.96}


Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json


{'eval_loss': 0.44325175881385803, 'eval_accuracy': 0.9133333333333333, 'eval_runtime': 1.4341, 'eval_samples_per_second': 104.598, 'eval_steps_per_second': 13.249, 'epoch': 2.96}


Model weights saved in results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in results/checkpoint-1000/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
Saving model checkpoint to results/checkpoint-1250
Configuration saved in results/checkpoint-1250/config.json


{'eval_loss': 0.4769594967365265, 'eval_accuracy': 0.92, 'eval_runtime': 1.4325, 'eval_samples_per_second': 104.711, 'eval_steps_per_second': 13.263, 'epoch': 3.7}


Model weights saved in results/checkpoint-1250/pytorch_model.bin
tokenizer config file saved in results/checkpoint-1250/tokenizer_config.json
Special tokens file saved in results/checkpoint-1250/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8


{'loss': 0.06, 'learning_rate': 1.5562130177514792e-05, 'epoch': 4.44}


Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json


{'eval_loss': 0.394641250371933, 'eval_accuracy': 0.9266666666666666, 'eval_runtime': 1.4295, 'eval_samples_per_second': 104.932, 'eval_steps_per_second': 13.291, 'epoch': 4.44}


Model weights saved in results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in results/checkpoint-1500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
Saving model checkpoint to results/checkpoint-1750
Configuration saved in results/checkpoint-1750/config.json


{'eval_loss': 0.7077649831771851, 'eval_accuracy': 0.9066666666666666, 'eval_runtime': 1.4294, 'eval_samples_per_second': 104.938, 'eval_steps_per_second': 13.292, 'epoch': 5.18}


Model weights saved in results/checkpoint-1750/pytorch_model.bin
tokenizer config file saved in results/checkpoint-1750/tokenizer_config.json
Special tokens file saved in results/checkpoint-1750/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8


{'loss': 0.0274, 'learning_rate': 1.4082840236686392e-05, 'epoch': 5.92}


Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000/config.json


{'eval_loss': 0.6947169899940491, 'eval_accuracy': 0.9066666666666666, 'eval_runtime': 1.4331, 'eval_samples_per_second': 104.665, 'eval_steps_per_second': 13.258, 'epoch': 5.92}


Model weights saved in results/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in results/checkpoint-2000/tokenizer_config.json
Special tokens file saved in results/checkpoint-2000/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
Saving model checkpoint to results/checkpoint-2250
Configuration saved in results/checkpoint-2250/config.json


{'eval_loss': 0.8670269846916199, 'eval_accuracy': 0.8866666666666667, 'eval_runtime': 1.4325, 'eval_samples_per_second': 104.715, 'eval_steps_per_second': 13.264, 'epoch': 6.66}


Model weights saved in results/checkpoint-2250/pytorch_model.bin
tokenizer config file saved in results/checkpoint-2250/tokenizer_config.json
Special tokens file saved in results/checkpoint-2250/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8


{'loss': 0.0369, 'learning_rate': 1.2603550295857989e-05, 'epoch': 7.4}


Saving model checkpoint to results/checkpoint-2500
Configuration saved in results/checkpoint-2500/config.json


{'eval_loss': 0.7064142227172852, 'eval_accuracy': 0.9, 'eval_runtime': 1.4329, 'eval_samples_per_second': 104.684, 'eval_steps_per_second': 13.26, 'epoch': 7.4}


Model weights saved in results/checkpoint-2500/pytorch_model.bin
tokenizer config file saved in results/checkpoint-2500/tokenizer_config.json
Special tokens file saved in results/checkpoint-2500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
Saving model checkpoint to results/checkpoint-2750
Configuration saved in results/checkpoint-2750/config.json


{'eval_loss': 0.7833418846130371, 'eval_accuracy': 0.92, 'eval_runtime': 1.4336, 'eval_samples_per_second': 104.635, 'eval_steps_per_second': 13.254, 'epoch': 8.14}


Model weights saved in results/checkpoint-2750/pytorch_model.bin
tokenizer config file saved in results/checkpoint-2750/tokenizer_config.json
Special tokens file saved in results/checkpoint-2750/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8


{'loss': 0.0096, 'learning_rate': 1.1124260355029586e-05, 'epoch': 8.88}


Saving model checkpoint to results/checkpoint-3000
Configuration saved in results/checkpoint-3000/config.json


{'eval_loss': 0.7665197849273682, 'eval_accuracy': 0.9133333333333333, 'eval_runtime': 1.4314, 'eval_samples_per_second': 104.791, 'eval_steps_per_second': 13.274, 'epoch': 8.88}


Model weights saved in results/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in results/checkpoint-3000/tokenizer_config.json
Special tokens file saved in results/checkpoint-3000/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
Saving model checkpoint to results/checkpoint-3250
Configuration saved in results/checkpoint-3250/config.json


{'eval_loss': 0.7281720042228699, 'eval_accuracy': 0.9266666666666666, 'eval_runtime': 1.4519, 'eval_samples_per_second': 103.314, 'eval_steps_per_second': 13.086, 'epoch': 9.62}


Model weights saved in results/checkpoint-3250/pytorch_model.bin
tokenizer config file saved in results/checkpoint-3250/tokenizer_config.json
Special tokens file saved in results/checkpoint-3250/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8


{'loss': 0.0172, 'learning_rate': 9.644970414201184e-06, 'epoch': 10.36}


Saving model checkpoint to results/checkpoint-3500
Configuration saved in results/checkpoint-3500/config.json


{'eval_loss': 0.8325244188308716, 'eval_accuracy': 0.9066666666666666, 'eval_runtime': 1.4317, 'eval_samples_per_second': 104.77, 'eval_steps_per_second': 13.271, 'epoch': 10.36}


Model weights saved in results/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in results/checkpoint-3500/tokenizer_config.json
Special tokens file saved in results/checkpoint-3500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
Saving model checkpoint to results/checkpoint-3750
Configuration saved in results/checkpoint-3750/config.json


{'eval_loss': 0.8748603463172913, 'eval_accuracy': 0.8933333333333333, 'eval_runtime': 1.4383, 'eval_samples_per_second': 104.289, 'eval_steps_per_second': 13.21, 'epoch': 11.09}


Model weights saved in results/checkpoint-3750/pytorch_model.bin
tokenizer config file saved in results/checkpoint-3750/tokenizer_config.json
Special tokens file saved in results/checkpoint-3750/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8


{'loss': 0.0125, 'learning_rate': 8.165680473372781e-06, 'epoch': 11.83}


Saving model checkpoint to results/checkpoint-4000
Configuration saved in results/checkpoint-4000/config.json


{'eval_loss': 0.7383723855018616, 'eval_accuracy': 0.92, 'eval_runtime': 1.4346, 'eval_samples_per_second': 104.56, 'eval_steps_per_second': 13.244, 'epoch': 11.83}


Model weights saved in results/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in results/checkpoint-4000/tokenizer_config.json
Special tokens file saved in results/checkpoint-4000/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
Saving model checkpoint to results/checkpoint-4250
Configuration saved in results/checkpoint-4250/config.json


{'eval_loss': 0.7885671854019165, 'eval_accuracy': 0.92, 'eval_runtime': 1.4379, 'eval_samples_per_second': 104.318, 'eval_steps_per_second': 13.214, 'epoch': 12.57}


Model weights saved in results/checkpoint-4250/pytorch_model.bin
tokenizer config file saved in results/checkpoint-4250/tokenizer_config.json
Special tokens file saved in results/checkpoint-4250/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8


{'loss': 0.0015, 'learning_rate': 6.686390532544379e-06, 'epoch': 13.31}


Saving model checkpoint to results/checkpoint-4500
Configuration saved in results/checkpoint-4500/config.json


{'eval_loss': 0.8066335320472717, 'eval_accuracy': 0.92, 'eval_runtime': 1.4326, 'eval_samples_per_second': 104.708, 'eval_steps_per_second': 13.263, 'epoch': 13.31}


Model weights saved in results/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in results/checkpoint-4500/tokenizer_config.json
Special tokens file saved in results/checkpoint-4500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
Saving model checkpoint to results/checkpoint-4750
Configuration saved in results/checkpoint-4750/config.json


{'eval_loss': 0.794413685798645, 'eval_accuracy': 0.9266666666666666, 'eval_runtime': 1.4352, 'eval_samples_per_second': 104.517, 'eval_steps_per_second': 13.239, 'epoch': 14.05}


Model weights saved in results/checkpoint-4750/pytorch_model.bin
tokenizer config file saved in results/checkpoint-4750/tokenizer_config.json
Special tokens file saved in results/checkpoint-4750/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8


{'loss': 0.0012, 'learning_rate': 5.207100591715976e-06, 'epoch': 14.79}


Saving model checkpoint to results/checkpoint-5000
Configuration saved in results/checkpoint-5000/config.json


{'eval_loss': 0.8257233500480652, 'eval_accuracy': 0.92, 'eval_runtime': 1.4376, 'eval_samples_per_second': 104.342, 'eval_steps_per_second': 13.217, 'epoch': 14.79}


Model weights saved in results/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in results/checkpoint-5000/tokenizer_config.json
Special tokens file saved in results/checkpoint-5000/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
Saving model checkpoint to results/checkpoint-5250
Configuration saved in results/checkpoint-5250/config.json


{'eval_loss': 0.9668459892272949, 'eval_accuracy': 0.9066666666666666, 'eval_runtime': 1.4353, 'eval_samples_per_second': 104.506, 'eval_steps_per_second': 13.237, 'epoch': 15.53}


Model weights saved in results/checkpoint-5250/pytorch_model.bin
tokenizer config file saved in results/checkpoint-5250/tokenizer_config.json
Special tokens file saved in results/checkpoint-5250/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8


{'loss': 0.0064, 'learning_rate': 3.7278106508875745e-06, 'epoch': 16.27}


Saving model checkpoint to results/checkpoint-5500
Configuration saved in results/checkpoint-5500/config.json


{'eval_loss': 0.8655685186386108, 'eval_accuracy': 0.9133333333333333, 'eval_runtime': 1.4376, 'eval_samples_per_second': 104.339, 'eval_steps_per_second': 13.216, 'epoch': 16.27}


Model weights saved in results/checkpoint-5500/pytorch_model.bin
tokenizer config file saved in results/checkpoint-5500/tokenizer_config.json
Special tokens file saved in results/checkpoint-5500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
Saving model checkpoint to results/checkpoint-5750
Configuration saved in results/checkpoint-5750/config.json


{'eval_loss': 0.8817095160484314, 'eval_accuracy': 0.9133333333333333, 'eval_runtime': 1.4366, 'eval_samples_per_second': 104.413, 'eval_steps_per_second': 13.226, 'epoch': 17.01}


Model weights saved in results/checkpoint-5750/pytorch_model.bin
tokenizer config file saved in results/checkpoint-5750/tokenizer_config.json
Special tokens file saved in results/checkpoint-5750/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8


{'loss': 0.0, 'learning_rate': 2.2485207100591717e-06, 'epoch': 17.75}


Saving model checkpoint to results/checkpoint-6000
Configuration saved in results/checkpoint-6000/config.json


{'eval_loss': 0.8837414383888245, 'eval_accuracy': 0.9133333333333333, 'eval_runtime': 1.4363, 'eval_samples_per_second': 104.438, 'eval_steps_per_second': 13.229, 'epoch': 17.75}


Model weights saved in results/checkpoint-6000/pytorch_model.bin
tokenizer config file saved in results/checkpoint-6000/tokenizer_config.json
Special tokens file saved in results/checkpoint-6000/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
Saving model checkpoint to results/checkpoint-6250
Configuration saved in results/checkpoint-6250/config.json


{'eval_loss': 0.8647990822792053, 'eval_accuracy': 0.92, 'eval_runtime': 1.4368, 'eval_samples_per_second': 104.401, 'eval_steps_per_second': 13.224, 'epoch': 18.49}


Model weights saved in results/checkpoint-6250/pytorch_model.bin
tokenizer config file saved in results/checkpoint-6250/tokenizer_config.json
Special tokens file saved in results/checkpoint-6250/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8


{'loss': 0.0017, 'learning_rate': 7.692307692307694e-07, 'epoch': 19.23}


Saving model checkpoint to results/checkpoint-6500
Configuration saved in results/checkpoint-6500/config.json


{'eval_loss': 0.8549203276634216, 'eval_accuracy': 0.92, 'eval_runtime': 1.4383, 'eval_samples_per_second': 104.292, 'eval_steps_per_second': 13.21, 'epoch': 19.23}


Model weights saved in results/checkpoint-6500/pytorch_model.bin
tokenizer config file saved in results/checkpoint-6500/tokenizer_config.json
Special tokens file saved in results/checkpoint-6500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 150
  Batch size = 8
Saving model checkpoint to results/checkpoint-6750
Configuration saved in results/checkpoint-6750/config.json


{'eval_loss': 0.8569173216819763, 'eval_accuracy': 0.92, 'eval_runtime': 1.4369, 'eval_samples_per_second': 104.39, 'eval_steps_per_second': 13.223, 'epoch': 19.97}


Model weights saved in results/checkpoint-6750/pytorch_model.bin
tokenizer config file saved in results/checkpoint-6750/tokenizer_config.json
Special tokens file saved in results/checkpoint-6750/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from results/checkpoint-1500 (score: 0.394641250371933).


{'train_runtime': 1842.6312, 'train_samples_per_second': 29.306, 'train_steps_per_second': 3.669, 'train_loss': 0.05029036337585525, 'epoch': 20.0}


TrainOutput(global_step=6760, training_loss=0.05029036337585525, metrics={'train_runtime': 1842.6312, 'train_samples_per_second': 29.306, 'train_steps_per_second': 3.669, 'train_loss': 0.05029036337585525, 'epoch': 20.0})

#### Prediction and metrics

In [48]:
def make_predictions(trainer,test):
    predictions = trainer.predict(test)
    y_true = test["label"]
    y_pred = np.argmax(predictions.predictions, axis=-1)
    confusion_matrix(y_true, y_pred)
    metric = load_metric("accuracy")
    metric.compute(predictions=y_pred, references=y_true)

#### Saving model

In [7]:
def save_model(trainer,tokenizer,trainer_path,tokenizer_path):
    trainer.save_model(trainer_path)
    tokenizer.save_pretrained(tokenizer_path)
    
save_model(trainer,tokenizer,"models/author_classification/model","models/author_classification/tokenizer")
save_model(trainer_2,tokenizer,"models/sentiment_classification/model","models/sentiment_classification/tokenizer")

Saving model checkpoint to models/sentiment_classification/model
Configuration saved in models/sentiment_classification/model/config.json
Model weights saved in models/sentiment_classification/model/pytorch_model.bin
tokenizer config file saved in models/sentiment_classification/model/tokenizer_config.json
Special tokens file saved in models/sentiment_classification/model/special_tokens_map.json
tokenizer config file saved in models/sentiment_classification/tokenizer/tokenizer_config.json
Special tokens file saved in models/sentiment_classification/tokenizer/special_tokens_map.json


#### Loading this model

In [6]:
def load_model(model_path,tokenizer_path,num_labels):
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels = num_labels)
    tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)
    trainer = Trainer(model = model)
    return (model, trainer, tokenizer)
    

model, trainer, tokenizer = load_model("models/author_classification/model","models/author_classification/tokenizer",4)

model_2, trainer_2, _ = load_model("models/sentiment_classification/model","models/sentiment_classification/tokenizer",2)

loading configuration file models/sentiment_classification/model\config.json
Model config RobertaConfig {
  "_name_or_path": "models/sentiment_classification/model",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.15.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file models/sentiment_classification/model\pytorch_model.bin
All model checkpoint weights were used when initializing R

In [74]:
model.config.finetuning_task = "Classify 19th century authors"
model.config.id2label = {0:"Author one", 1:"Author two", 2:"Author three", 3:"Author four"}


model_2.config.finetuning_task = "Classify sentiment from imdb reviews"
model_2.config.id2label = {0:"Negative", 1:"Positive"}

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


def read_and_tokenize_data(data):
    d = load_dataset('pandas', data_files=data,split="train")
    d = d.map(tokenize_function, batched=True)
    return d


def make_predictions(trainer,data_path):
    test_data = read_and_tokenize_data(data_path)
    predictions = trainer.predict(test_data)
    y_true = test_data["label"]
    y_pred = np.argmax(predictions.predictions, axis=-1)
    print(confusion_matrix(test_data["label"], y_pred))
    metric = load_metric("accuracy")
    print(metric.compute(predictions=y_pred, references=y_true))



make_predictions(trainer,"data/authors_test.pkl")

make_predictions(trainer_2,"data/imdb_test.pkl")

Using custom data configuration default-ca238c53a2a39e26


Downloading and preparing dataset pandas/default to C:\Users\dovyd\.cache\huggingface\datasets\pandas\default-ca238c53a2a39e26\0.0.0\6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset pandas downloaded and prepared to C:\Users\dovyd\.cache\huggingface\datasets\pandas\default-ca238c53a2a39e26\0.0.0\6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, text.
***** Running Prediction *****
  Num examples = 149
  Batch size = 8


[[48  0  0  0]
 [ 0 21  0  0]
 [ 1  0  5  0]
 [ 2  1  0 71]]
{'accuracy': 0.9731543624161074}


Using custom data configuration default-58145b16daa5f1bf


Downloading and preparing dataset pandas/default to C:\Users\dovyd\.cache\huggingface\datasets\pandas\default-58145b16daa5f1bf\0.0.0\6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset pandas downloaded and prepared to C:\Users\dovyd\.cache\huggingface\datasets\pandas\default-58145b16daa5f1bf\0.0.0\6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, text.
***** Running Prediction *****
  Num examples = 150
  Batch size = 8


[[73  4]
 [ 9 64]]
{'accuracy': 0.9133333333333333}


In [11]:
from transformers import TextClassificationPipeline

In [95]:
pipeline = TextClassificationPipeline(model=model,tokenizer=tokenizer)

pipeline("was a great step to take this step of marriage there was a responsibility about it that was when one approached it so nearly supposing things should go wrong what if the market should take a turn to the bad and everything go all at once to the there he would be with a double expense on his hands for in those days he ceased to believe the stories of the extraordinary with which a married couple could live five days more and it would be too late to his steps thoughts of closing his store and running away came into his head and held sway there for hours at a time it would be but would it not in the end prove the part of wisdom ah yes he had been right when he said a man should have fifty thousand dollars and be forty years of age before he such a folly yo fa tt s i he told and her mother that evening that he had a violent headache as an excuse for going to his room and staying there neither of them believed this statement covered the whole truth and they commented upon the matter when he had left them it is not unusual for men to have queer at such a time as this said the wise elder lady i have heard of them being so ill that the wedding had to be postponed we must deal very gently with mr in the matter for it would be awkward to have anything happen now awkward echoed it would be outrageous don t you think if i went to his room and bathed his head it would be a good thing the mother did not agree with the idea she said it was best to leave him entirely alone for the present it would not do to act as if anything was suspected she did not know that at the very minute she was saying this was seriously")

[{'label': 'Arthur Conan Doyle', 'score': 0.9999176263809204}]

In [82]:
pipeline_2 = TextClassificationPipeline(model=model_2,tokenizer=tokenizer)

pipeline_2("This sucks!")

pipeline_2("An all time classic") # the result + percentage of confidence

[{'label': 'Positive', 'score': 0.998927891254425}]