# Pretrained Model (BERT)

In [6]:
from datasets import load_dataset

!pip install -q -U transformers datasets accelerate evaluate

ds = load_dataset('thainq107/ntc-scv')

In [7]:
from transformers import AutoTokenizer

# Checkpoint name reused for the tokenizer here and for the config/model in a
# later cell ("bert-base-uncased" is noted as an alternative).
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Use at most 100 tokens per example, or the tokenizer's own limit if that
# happens to be smaller.
max_seq_length = min(100, tokenizer.model_max_length)

def preprocess_function(examples):
    """Tokenize a batch of examples and carry the labels along.

    Args:
        examples: A batch mapping that provides the "preprocessed_sentence"
            and "label" columns.

    Returns:
        The tokenizer output for the sentences (truncated and padded to
        ``max_seq_length``) with the batch's labels stored under "label".
    """
    encoded = tokenizer(
        examples["preprocessed_sentence"],
        truncation=True,
        max_length=max_seq_length,
        padding="max_length",
    )
    encoded["label"] = examples["label"]
    return encoded

# Apply the tokenization step to the whole dataset, processing examples in
# batches and labelling the progress bar.
processed_dataset = ds.map(
    preprocess_function,
    desc="Running tokenizer on dataset",
    batched=True,
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Running tokenizer on dataset: 100%|██████████| 30000/30000 [00:02<00:00, 11948.42 examples/s]
Running tokenizer on dataset: 100%|██████████| 10000/10000 [00:00<00:00, 12344.64 examples/s]
Running tokenizer on dataset: 100%|██████████| 10000/10000 [00:00<00:00, 12053.53 examples/s]


In [8]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# Number of target classes for the classification head.
num_labels = 2

# Start from the checkpoint's configuration, overriding the label count and
# tagging the fine-tuning task.
config = AutoConfig.from_pretrained(
    model_name,
    finetuning_task="text-classification",
    num_labels=num_labels,
)

# Load the pretrained weights into a sequence-classification model that
# follows the configuration above.
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
import numpy as np
import evaluate

# Accuracy metric, loaded once and reused for every evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a round of predictions with the accuracy metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predicted class of
            each row is taken as the argmax of ``predictions`` along axis 1.

    Returns:
        Whatever ``metric.compute`` returns for the predicted classes
        against the reference labels.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=labels)

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 4.49MB/s]


In [None]:
import inspect

from transformers import TrainingArguments, Trainer

# Fine-tuning setup: evaluate and checkpoint once per epoch, then reload the
# best checkpoint when training ends.
# NOTE: `metric_for_best_model` is not set, so per the TrainingArguments docs
# the "best" checkpoint is chosen by evaluation loss, not by the accuracy
# reported through `compute_metrics`.
training_args = TrainingArguments(
    output_dir="save_model",
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# `Trainer(tokenizer=...)` is deprecated since transformers 4.46 in favour of
# `processing_class=...`. The transformers version is not pinned in this
# notebook, so pass the tokenizer under whichever keyword the installed
# Trainer accepts instead of hard-coding one of them.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = (
    "processing_class" if "processing_class" in trainer_params else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["valid"],
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

trainer.train()