In [1]:
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
import os
import numpy as np
import evaluate
import torch
from transformers import EarlyStoppingCallback
from datasets import load_from_disk
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding
)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import os
os.chdir("/home/gokhan/hugging_face/bert_classification")
print(os.getcwd())

/home/gokhan/hugging_face/bert_classification


In [3]:
checkpoint = "distilbert-base-uncased" #or roberta-base
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
BATCH_SIZE = 16
EPOCHS = 15
GRAD_ACCUMULATION = 2 # 16 x 2 = 32 (Effective batch size as perceived by the model)
OUTPUT_DIR = "banking77_final_model"

In [5]:
dataset = load_dataset(
    'csv',
    data_files={
        "train": "final_train_dataset/banking77_train_final.csv", #after augmented
        "test": "final_train_dataset/banking77_test_final.csv"
    }
)

In [6]:
labels_list = dataset['train'].unique("category")
labels_list.sort()
num_labels = len(labels_list)
label2id = {label: i for i, label in enumerate(labels_list)}
id2label = {i: label for i, label in enumerate(labels_list)}
print(f"label2id: {dict(list(label2id.items())[:5])}")
print(f"id2label: {dict(list(id2label.items())[:5])}")

label2id: {'Refund_not_showing_up': 0, 'activate_my_card': 1, 'age_limit': 2, 'apple_pay_or_google_pay': 3, 'atm_support': 4}
id2label: {0: 'Refund_not_showing_up', 1: 'activate_my_card', 2: 'age_limit', 3: 'apple_pay_or_google_pay', 4: 'atm_support'}


In [7]:
def encode_labels(batch):
    batch['labels'] = [label2id[c] for c in batch['category']]
    return batch

In [8]:
dataset = dataset.map(encode_labels, batched=True)

In [9]:
def tokenization(batch):
    return tokenizer(batch['text'],truncation=True, padding='max_length', max_length=128)

In [10]:
tokenized_datasets = dataset.map(tokenization, batched=True)

In [11]:
tokenized_datasets = tokenized_datasets.remove_columns(["text","category",])
tokenized_datasets.set_format('torch')

In [12]:
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels = num_labels,
    id2label=id2label,
    label2id=label2id
    )

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
accuracy_metric = evaluate.load("accuracy")

In [14]:
def compute_metrics(eval_preds):
    logits,labels = eval_preds
    predictions = np.argmax(logits,axis = 1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [15]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy= "epoch",
    save_strategy='epoch',
    learning_rate= 2e-5,
    per_device_train_batch_size=BATCH_SIZE, # 16
    gradient_accumulation_steps=GRAD_ACCUMULATION,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    fp16=True,
    dataloader_num_workers=4,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    logging_steps=50,
    report_to="none"
)

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

  trainer = Trainer(


In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,3.1437,2.596092,0.671329
2,1.631,1.300686,0.814186
3,0.8336,0.716656,0.883117
4,0.4595,0.486287,0.892108
5,0.2974,0.40538,0.898102
6,0.2,0.377806,0.903097
7,0.148,0.351828,0.908092
8,0.0942,0.355153,0.912088
9,0.0593,0.357103,0.911089
10,0.0465,0.356783,0.913087


TrainOutput(global_step=4350, training_loss=0.5522301031529219, metrics={'train_runtime': 730.4129, 'train_samples_per_second': 190.166, 'train_steps_per_second': 5.956, 'total_flos': 4606082910489600.0, 'train_loss': 0.5522301031529219, 'epoch': 15.0})

In [18]:
results = trainer.evaluate()

In [19]:
results

{'eval_loss': 0.37180572748184204,
 'eval_accuracy': 0.916083916083916,
 'eval_runtime': 1.6405,
 'eval_samples_per_second': 610.183,
 'eval_steps_per_second': 19.506,
 'epoch': 15.0}