### Here I show another way to fine-tune BERT model for text classification tasks by using Trainer class.

In [1]:
import pandas as pd
import re
import spacy
import nltk
import torch
import string
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from torch.nn import CrossEntropyLoss
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [2]:
df = pd.read_csv("../HomeWork1/nyt.csv")
print(df.shape)
df.head()

(11519, 2)


Unnamed: 0,text,label
0,(reuters) - carlos tevez sealed his move to ju...,sports
1,if professional pride and strong defiance can ...,sports
2,"palermo, sicily — roberta vinci beat top-seede...",sports
3,spain's big two soccer teams face a pair of it...,sports
4,the argentine soccer club san lorenzo complete...,sports


In [3]:
def label_to_number(label):
    mapping = {"business": 0, "politics": 1, "sports": 2}
    return mapping.get(label, -1)

df["label"] = df["label"].apply(label_to_number)

In [4]:
!python -m spacy download en_core_web_sm
sp = spacy.load("en_core_web_sm")

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")

nltk_st = stopwords.words("english")

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


[nltk_data] Downloading package stopwords to /home/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
def clean(text, http=True, punc=True, lem=True, stop_w=True):
    if http == True:
        text = re.sub("https?:\/\/t.co\/[A-Za-z0-9]*", "", text)
    if stop_w == True:
        text = [word for word in word_tokenize(text) if not word.lower() in nltk_st]
        text = " ".join(text)
    if lem == True:
        lemmatized = [word.lemma_ for word in sp(text)]
        text = " ".join(lemmatized)
    if punc == True:
        text = text.translate(str.maketrans("", "", string.punctuation))
        
    text = text.lower()
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"im", "i am", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'scuse", " excuse", text)
    text = re.sub("\W", " ", text)
    text = re.sub("\s+", " ", text)
    text = text.strip()
    
    return text

In [6]:
%time
DO_PREPROCESS = True
if DO_PREPROCESS:
    df["cleaned_text"] = df["text"].apply(lambda text: clean(text, http=True, punc=False, lem=False, stop_w=False))
else:
    df["cleaned_text"] = df["text"]
df.drop(columns=["text"], axis=1, inplace=True)
df.head()

CPU times: user 4 μs, sys: 0 ns, total: 4 μs
Wall time: 8.34 μs


Unnamed: 0,label,cleaned_text
0,2,reuters carlos tevez sealed his move to juvent...
1,2,if professional pride and strong defiance can ...
2,2,palermo sicily roberta vinci beat top seeded f...
3,2,spain big two soccer teams face a pair of ital...
4,2,the argentine soccer club san lorenzo complete...


In [7]:
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=42)

In [8]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(batch):
    return tokenizer(batch["cleaned_text"], truncation=True, padding="max_length", max_length=64)

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

print(train_dataset)



Map:   0%|          | 0/9215 [00:00<?, ? examples/s]

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

Map:   0%|          | 0/1152 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'cleaned_text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 9215
})


In [9]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
label_weights = {0: 2.0, 1: 2.0, 2: 1.0}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def compute_metrics(p):
    preds, labels = p
    preds = np.argmax(preds, axis=1)
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average="macro")
    micro_f1 = f1_score(labels, preds, average="micro")
    return {"accuracy": acc, "macro_f1": macro_f1, "micro_f1": micro_f1}

In [11]:
class DataCollator:
    def __call__(self, features):
        model_inputs = [
            {
                "input_ids": feature["input_ids"],
                "attention_mask": feature["attention_mask"],
                "labels": feature["label"]
            } for feature in features
        ]
        batch = tokenizer.pad(
            model_inputs,
            padding="max_length",
            max_length=64,
            return_tensors="pt",
            pad_to_multiple_of=4
        )
        return batch

In [12]:
class CustomTrainer(Trainer):
    def __init__(self, *args, label_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.label_weights = label_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = CrossEntropyLoss(weight=torch.tensor([self.label_weights[i] for i in range(len(self.label_weights))], device=model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [13]:
training_args = TrainingArguments(
    output_dir="output",
    bf16=True if torch.cuda.is_bf16_supported() else False,
    fp16=False if torch.cuda.is_bf16_supported() else True,
    learning_rate=4e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    remove_unused_columns=True,
    warmup_ratio=0.1,
    num_train_epochs=3,
    weight_decay=0.001,
    do_eval=True,
    eval_strategy="steps",
    eval_steps=100,
    save_total_limit=1,
    save_strategy="steps",
    save_steps=100,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_only_model=True,
    lr_scheduler_type="cosine",
    report_to="none"
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollator(),
    compute_metrics=compute_metrics,
    label_weights=label_weights
)

trainer.train()
trainer.evaluate()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Macro F1,Micro F1
100,1.0425,0.858955,0.750868,0.303258,0.750868
200,0.6306,0.396766,0.934028,0.86156,0.934028
300,0.2901,0.214662,0.961806,0.919505,0.961806
400,0.2022,0.162336,0.967014,0.928706,0.967014
500,0.1646,0.148316,0.965278,0.924192,0.965278
600,0.1401,0.143861,0.965278,0.924334,0.965278
700,0.1012,0.134551,0.96875,0.933357,0.96875
800,0.1035,0.128973,0.967882,0.932657,0.967882
900,0.104,0.129396,0.970486,0.937798,0.970486
1000,0.111,0.132859,0.96875,0.933123,0.96875


{'eval_loss': 0.13127782940864563,
 'eval_accuracy': 0.9713541666666666,
 'eval_macro_f1': 0.9409958113630976,
 'eval_micro_f1': 0.9713541666666666,
 'eval_runtime': 0.6785,
 'eval_samples_per_second': 1697.902,
 'eval_steps_per_second': 106.119,
 'epoch': 3.0}

In [14]:
predictions = trainer.predict(test_dataset).metrics
print(f"Accuracy Score: {predictions['test_accuracy']: .3f}")
print(f"Macro F1-score: {predictions['test_macro_f1']: .3f}")
print(f"Micro F1-score: {predictions['test_micro_f1']: .3f}")

Accuracy Score:  0.980
Macro F1-score:  0.954
Micro F1-score:  0.980


### We can find that the results got by using Trainer are slightly better than self-implement one.