# To Try
- Try adjust hyperparams i.e. learning rate
- Clean data

In [8]:
from datasets import load_dataset
imdb = load_dataset("imdb")

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["text"], max_length=512, truncation=True)  # max_length = window size (tensor-512)
tokenized_imdb = imdb.map(preprocess_function, batched=True)

from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print(data_collator)

import evaluate
accuracy = evaluate.load("accuracy")

import numpy as np
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    output_dir="/home/azureuser/cloudfiles/code/Users/Michael.Sowter/Deep_Learning_Training/Text Classifier/Models/my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# trainer.push_to_hub()



text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."

from transformers import pipeline
classifier = pipeline("sentiment-analysis", model="stevhliu/my_awesome_model")  # window_size = 512
print(classifier(text))

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensor

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2229,0.218539,0.91868
2,0.1418,0.236004,0.9318


Bad pipe message: %s [b'\x90\x81%\x1d\xdb\x17_\xf5\xec\x93\xb6\xf9uHu\xe0T_ \xd5]U\x0b\xcb\x81\x985Uh\xa0\x83\x16\x9d?\xe1@YQ\x8bO\x96\xd8o\x03\xf0H\xe8\xeb\xa6\x00`\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05']
Bad pipe message: %s [b'\x04\x01\x05\x01\x06\x01\x00']
Bad pipe message: %s [b'\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 IQ\xc6\xf9\x9c\xe1\xe4\xa9=\xac\x11\xa2\x96\xe6\xa2\x96\xaa\xba\\\xd0\xef1']
Bad pipe message: %s [b'Z\x19-\xa7\x8f\xe2\x88\xbc\xab\xf9\x11<t1\xc0%\xcbN \xa9\x8fO#\x04\x05\x84\xd0\x16\x11k\xde\xe7\xfc\xa8\xdbs<\x1fKv\x7f\xd2\xe2f\xc6\x81\x85N\\\xa0i\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x

[{'label': 'LABEL_1', 'score': 0.9994940757751465}]


In [9]:
# Get predicted labels for the test set
predictions = trainer.predict(tokenized_imdb["test"])

# Extract predicted labels from the predictions
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Print the first few predicted labels
print("Predicted labels:", predicted_labels[:5])

Predicted labels: [0 0 0 0 1]


In [10]:
predictions

PredictionOutput(predictions=array([[ 2.3222034e+00, -2.8073330e+00],
       [ 1.2798553e+00, -1.8725814e+00],
       [ 1.8573821e+00, -2.3344655e+00],
       ...,
       [ 5.1075470e-04, -3.5943294e-01],
       [ 4.5296308e-01, -8.8273329e-01],
       [-1.2667052e+00,  8.9116150e-01]], dtype=float32), label_ids=array([0, 0, 0, ..., 1, 1, 1]), metrics={'test_loss': 0.21853885054588318, 'test_accuracy': 0.91868, 'test_runtime': 130.9367, 'test_samples_per_second': 190.932, 'test_steps_per_second': 11.937})

In [31]:
path = "/home/azureuser/cloudfiles/code/Users/Michael.Sowter/Deep_Learning_Training/Text Classifier/Models/hf_bert2"
classifier.save_pretrained(path)  # save pipeline NOT model

In [5]:
from transformers import pipeline
path = "/home/azureuser/cloudfiles/code/Users/Michael.Sowter/Deep_Learning_Training/Text Classifier/Models/hf_bert2"
text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."
classifier = pipeline("sentiment-analysis", model=path)  # window_size = 512
print(classifier(text))

  from .autonotebook import tqdm as notebook_tqdm
2024-05-07 13:33:53.445024: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[{'label': 'LABEL_1', 'score': 0.9994940757751465}]
