In [36]:
import transformers
# Restrict the transformers library logger to ERROR-level messages.
transformers.logging.set_verbosity_error()
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer
from sklearn.metrics import confusion_matrix
from datasets import DatasetDict, load_dataset, load_metric
import numpy as np

# Module-level tokenizer for the "roberta-base" checkpoint; tokenize_function
# below reads this global when it is called.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the global ``tokenizer``.

    Sequences are padded to the tokenizer's maximum length and truncated when
    they exceed it, so every encoded example has the same length.

    Args:
        examples: Mapping that provides the raw text under the ``"text"`` key.

    Returns:
        Whatever the tokenizer returns for the given text.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

def read_and_tokenize_data(filename):
    """Load a pickled pandas DataFrame as a dataset and tokenize its text column.

    Args:
        filename: Path to a pickle file understood by the ``'pandas'`` dataset
            loader. The data must provide a ``"text"`` column, which is what
            ``tokenize_function`` reads.

    Returns:
        The loaded dataset with the tokenizer outputs added by
        ``tokenize_function``.
    """
    dataset = load_dataset('pandas', data_files=filename, split='train')

    # '__index_level_0__' is presumably the DataFrame index carried over by the
    # loader; it is only present for some pickles, so drop it only when it
    # exists instead of failing on files without it.
    index_column = '__index_level_0__'
    if index_column in dataset.column_names:
        # remove_columns returns a new dataset; the in-place remove_columns_
        # variant used previously is deprecated and gone from newer releases.
        dataset = dataset.remove_columns([index_column])

    # batched=True hands tokenize_function a batch of rows per call.
    return dataset.map(tokenize_function, batched=True)

#### Reading data

In [41]:
# Load and tokenize the three splits in a fixed order: train, validation, test.
# Unpacking the map forces all three calls to run right here, in that order.
train, validation, test = map(
    read_and_tokenize_data,
    (
        "data/imdb_small_train.pkl",
        "data/imdb_small_validation.pkl",
        "data/imdb_small_test.pkl",
    ),
)

Using custom data configuration default-172d097f51e98c89


Downloading and preparing dataset pandas/default to /Users/mgaulia/.cache/huggingface/datasets/pandas/default-172d097f51e98c89/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset pandas downloaded and prepared to /Users/mgaulia/.cache/huggingface/datasets/pandas/default-172d097f51e98c89/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?ba/s]

Using custom data configuration default-80a400844247d995


Downloading and preparing dataset pandas/default to /Users/mgaulia/.cache/huggingface/datasets/pandas/default-80a400844247d995/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset pandas downloaded and prepared to /Users/mgaulia/.cache/huggingface/datasets/pandas/default-80a400844247d995/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?ba/s]

Using custom data configuration default-cc4c7e97e54c8339


Downloading and preparing dataset pandas/default to /Users/mgaulia/.cache/huggingface/datasets/pandas/default-cc4c7e97e54c8339/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset pandas downloaded and prepared to /Users/mgaulia/.cache/huggingface/datasets/pandas/default-cc4c7e97e54c8339/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?ba/s]

#### Training

In [43]:
# Seed the Python, NumPy and PyTorch random number generators *before* the
# model is built. The "roberta-base" checkpoint is a masked-LM checkpoint, so
# the sequence-classification head is freshly initialised here; without a seed
# set up front, every run starts from different head weights and the reported
# metrics cannot be reproduced.
transformers.set_seed(42)

# Two output labels -> binary sentiment classification.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# No TrainingArguments are supplied, so Trainer runs with its library defaults
# (the logged run below finished after 3 epochs).
trainer = Trainer(
    model=model,
    train_dataset=train,
    eval_dataset=validation,
    tokenizer=tokenizer,
)
trainer.train()

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /Users/mgaulia/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file https://huggingface.co/roberta-base/resolve/main/pytorch_m

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=36, training_loss=0.6975358857048882, metrics={'train_runtime': 1381.7433, 'train_samples_per_second': 0.195, 'train_steps_per_second': 0.026, 'total_flos': 71039984947200.0, 'train_loss': 0.6975358857048882, 'epoch': 3.0})

#### Saving model

In [44]:
# Output locations for the fine-tuned artefacts.
MODEL_DIR = "models/sentiment_classification/model"
TOKENIZER_DIR = "models/sentiment_classification/tokenizer"

# save_model writes the config and weights (and, per the log below, tokenizer
# files as well, since the Trainer was given the tokenizer) into MODEL_DIR.
trainer.save_model(MODEL_DIR)
# The tokenizer is additionally stored in its own directory; the returned tuple
# of written file paths is the cell's displayed result.
tokenizer.save_pretrained(TOKENIZER_DIR)

Saving model checkpoint to models/sentiment_classification/model
Configuration saved in models/sentiment_classification/model/config.json
Model weights saved in models/sentiment_classification/model/pytorch_model.bin
tokenizer config file saved in models/sentiment_classification/model/tokenizer_config.json
Special tokens file saved in models/sentiment_classification/model/special_tokens_map.json
tokenizer config file saved in models/sentiment_classification/tokenizer/tokenizer_config.json
Special tokens file saved in models/sentiment_classification/tokenizer/special_tokens_map.json


('models/sentiment_classification/tokenizer/tokenizer_config.json',
 'models/sentiment_classification/tokenizer/special_tokens_map.json',
 'models/sentiment_classification/tokenizer/vocab.json',
 'models/sentiment_classification/tokenizer/merges.txt',
 'models/sentiment_classification/tokenizer/added_tokens.json')

#### Metrics

In [45]:
# Run the fine-tuned model over the held-out test split.
predictions = trainer.predict(test)

# Predicted class = index of the largest score along the last axis.
y_pred = np.argmax(predictions.predictions, axis=-1)
y_true = test["label"]

# Rows are true labels, columns are predicted labels.
confusion_matrix(y_true, y_pred)

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 5
  Batch size = 8


array([[1, 0],
       [4, 0]])

In [46]:
# Load the "accuracy" metric and score the predicted labels (y_pred) against
# the reference labels (y_true) computed in the previous cell.
metric = load_metric("accuracy")
metric.compute(predictions=y_pred, references=y_true)

{'accuracy': 0.2}

#### Loading this model

In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [23]:
# Restore the fine-tuned classifier from the directory written by the
# "Saving model" step.
model = AutoModelForSequenceClassification.from_pretrained(
    "models/sentiment_classification/model",
    num_labels=2,
)
# Restore the tokenizer from its own directory; this rebinds the global
# `tokenizer` that tokenize_function reads.
tokenizer = AutoTokenizer.from_pretrained(
    "models/sentiment_classification/tokenizer"
)
# Trainer is used here only as a prediction wrapper around the reloaded model,
# so no datasets are attached.
trainer = Trainer(model=model)
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the global ``tokenizer``.

    Sequences are padded to the tokenizer's maximum length and truncated when
    they exceed it.

    NOTE(review): this repeats the definition from the first cell of the
    notebook; presumably it is restated so this section reads on its own with
    the reloaded tokenizer -- confirm whether the duplicate is still needed.

    Args:
        examples: Mapping that provides the raw text under the ``"text"`` key.

    Returns:
        Whatever the tokenizer returns for the given text.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

loading configuration file models/sentiment_classification/model/config.json
Model config RobertaConfig {
  "_name_or_path": "models/sentiment_classification/model",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.15.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file models/sentiment_classification/model/pytorch_model.bin
All model checkpoint weights were used when initializ

In [50]:
# Re-read the test split and score it with the reloaded model.
test_data = read_and_tokenize_data("data/imdb_small_test.pkl")
predictions = trainer.predict(test_data)

# Predicted class = index of the largest score along the last axis.
y_pred = np.argmax(predictions.predictions, axis=-1)

# Rows are true labels, columns are predicted labels.
confusion_matrix(test_data["label"], y_pred)

Using custom data configuration default-cc4c7e97e54c8339
Reusing dataset pandas (/Users/mgaulia/.cache/huggingface/datasets/pandas/default-cc4c7e97e54c8339/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade)
Loading cached processed dataset at /Users/mgaulia/.cache/huggingface/datasets/pandas/default-cc4c7e97e54c8339/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade/cache-99d5516dd4812d39.arrow
The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 5
  Batch size = 8


array([[1, 0],
       [4, 0]])