# Load Model and Test it Works

In [2]:
from transformers import pipeline
path = "/home/azureuser/cloudfiles/code/Users/Michael.Sowter/Deep_Learning_Training/Text Classifier/Models/hf_bert2"
text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."
classifier = pipeline("sentiment-analysis", model=path)  # window_size = 512
print(classifier(text))

[{'label': 'LABEL_1', 'score': 0.9994940757751465}]


# Load Data

In [3]:
from datasets import load_dataset
data = load_dataset("imdb", split="test")

# Tokenize Data

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
def preprocess_function(examples):
    return tokenizer(examples["text"], max_length=512, truncation=True, padding=True)  # max_length = window size (tensor-512)

tokenized_imdb = data.map(preprocess_function, batched=True)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25000/25000 [00:08<00:00, 3035.28 examples/s]


# Evaluate Model According to Metrics
- Takes 60 mins to run so try on a smaller test set

In [5]:
import evaluate
from transformers import pipeline

path = "/home/azureuser/cloudfiles/code/Users/Michael.Sowter/Deep_Learning_Training/Text Classifier/Models/hf_bert2"
pipe = pipeline("text-classification", model=path)

task_evaluator = evaluate.evaluator("text-classification")

eval_results = task_evaluator.compute(
    model_or_pipeline=pipe,
    data=data,
    label_mapping={"LABEL_0": 0, "LABEL_1": 1},
    tokenizer=tokenizer,
    metric=evaluate.combine(["accuracy", "recall", "precision", "f1"]),
)

print(eval_results)

{'accuracy': 0.93148, 'recall': 0.93432, 'precision': 0.9290430355580304, 'f1': 0.9316740457101831, 'total_time_in_seconds': 3606.3575489830005, 'samples_per_second': 6.932202273468433, 'latency_in_seconds': 0.14425430195932}


# Evaluate Model According to Metrics on a Smaller Test Set

In [None]:
from datasets import load_dataset
data = load_dataset("imdb", split="test").shuffle(seed=42).select(range(1000))  # smaller test set size

# ------------------------------------- #

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
def preprocess_function(examples):
    return tokenizer(examples["text"], max_length=512, truncation=True, padding=True)  # max_length = window size (tensor-512)

tokenized_imdb = data.map(preprocess_function, batched=True)

# ------------------------------------- #

import evaluate
from transformers import pipeline

path = "/home/azureuser/cloudfiles/code/Users/Michael.Sowter/Deep_Learning_Training/Text Classifier/Models/hf_bert2"
pipe = pipeline("text-classification", model=path)

task_evaluator = evaluate.evaluator("text-classification")

eval_results = task_evaluator.compute(
    model_or_pipeline=pipe,
    data=data,
    label_mapping={"LABEL_0": 0, "LABEL_1": 1},
    tokenizer=tokenizer,
    metric=evaluate.combine(["accuracy", "recall", "precision", "f1"]),
)

print(eval_results)

# Get Pred Labels
N.B: Relies on having access to the trainer function

In [17]:
# Get predicted labels for the test set
predictions = trainer.predict(tokenized_imdb["test"])

# Extract predicted labels from the predictions
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Print the first few predicted labels
print("Predicted labels:", predicted_labels[:5])

NameError: name 'trainer' is not defined

Bad pipe message: %s [b'\xe0\xc6s\xeb"i=\x83\xa6\x94\xc1e\r\x17o(\xd2\xd2 .\x1d$\xb6\x96\x1ew\xb9\xd4\xfd\x8b5\x98nw\xf1\xb7 $\x9c\x8fxw\xb0\xde\xf5\xd9 ]\x00\xacK\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00']
Bad pipe message: %s [b"\x06\xc8o@}\xbd\x05E_\xd6\x07\x02\xc6\xae\xf4\xa7}z\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

# Define the path to the locally trained model
path = "/home/azureuser/cloudfiles/code/Users/Michael.Sowter/Deep_Learning_Training/Text Classifier/Models/hf_bert2"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForSequenceClassification.from_pretrained(path)

# Load the IMDb test set
imdb_test = load_dataset("imdb", split="test")

# Extract the texts from the IMDb test set
test_texts = imdb_test['text']

# Tokenize the test data
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Perform inference to get predictions
import torch
with torch.no_grad():
    input_ids = torch.tensor(test_encodings['input_ids'])
    attention_mask = torch.tensor(test_encodings['attention_mask'])
    outputs = model(input_ids, attention_mask=attention_mask)

# Get predicted labels from the logits
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

# Convert labels to their corresponding class names
id2label = model.config.id2label
predicted_class_names = [id2label[label_id] for label_id in predicted_labels]

print("Predicted labels:", predicted_class_names)