In [8]:
import numpy as np
import evaluate
from datasets import load_from_disk
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
from datasets import load_dataset
test_dataset = load_dataset(
    'csv',
    data_files={
        'test': "dataset/test.csv"
    })
test_dataset

DatasetDict({
    test: Dataset({
        features: ['text', 'category'],
        num_rows: 3080
    })
})

In [9]:
model_path = "banking77_final_model/checkpoint-4350"

In [10]:
test_dataset['test']['category']

Column(['card_arrival', 'card_arrival', 'card_arrival', 'card_arrival', 'card_arrival'])

In [12]:
labels = test_dataset['test'].unique("category")
labels.sort()
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
def preproccesing_func(examples):
    examples["label"] = [label2id[c] for c in examples["category"]]
    tokenized = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    return tokenized

In [30]:
test_data_encoded = test_dataset['test'].map(preproccesing_func, batched=True)
test_data_encoded.set_format('torch', columns=["input_ids", "attention_mask", "label"])

In [31]:
model = AutoModelForSequenceClassification.from_pretrained(model_path)
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)
    
import os
os.environ["WANDB_DISABLED"] = "true"
trainer = Trainer(
    model=model,
    eval_dataset=test_data_encoded,
    compute_metrics=compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [32]:
results = trainer.evaluate()
print(f"Test Accuracy: %{results['eval_accuracy']*100:.2f}")
print(f"Test Loss : {results['eval_loss']:.4f}")

Test Accuracy: %92.63
Test Loss : 0.3124


In [33]:
test_sentences = [
    "I lost my card, please help!",              
    "Why is my transfer declined?",             
    "What is the exchange rate for Euro?",       
    "I want to change my pin code.",             
    "Can I get a virtual card?",                 
    "Where is the nearest ATM?"                  
]



for text in test_sentences:
    result = classifier(text)[0] # first result
    label = result['label']
    score = result['score']

    print(f"Text:  {text}")
    print(f"Prediction: {label}")
    print(f"Trust %{score*100:.2f}")
    print("-" * 30)

Text:  I lost my card, please help!
Prediction: lost_or_stolen_card
Trust %99.37
------------------------------
Text:  Why is my transfer declined?
Prediction: declined_transfer
Trust %99.52
------------------------------
Text:  What is the exchange rate for Euro?
Prediction: exchange_rate
Trust %97.12
------------------------------
Text:  I want to change my pin code.
Prediction: change_pin
Trust %99.35
------------------------------
Text:  Can I get a virtual card?
Prediction: getting_virtual_card
Trust %81.01
------------------------------
Text:  Where is the nearest ATM?
Prediction: atm_support
Trust %99.44
------------------------------
