In [3]:
import os
# Send both HTTP and HTTPS requests through the same local proxy endpoint.
os.environ.update(
    dict.fromkeys(("http_proxy", "https_proxy"), "http://127.0.0.1:8889")
)

## Step-1 Import

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


## Step-2 Load dataset

In [5]:
from datasets import load_dataset
# Read the review CSV as a single split, then drop every row whose review
# text is missing so the tokenizer never receives None.
dataset = load_dataset(
    "csv",
    data_files="./datasets/ChnSentiCorp_htl_all.csv",
    split="train[:100%]",
).filter(lambda row: row["review"] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## Step-3 Split dataset

In [6]:
# Hold out 10% of the rows for evaluation. The fixed seed keeps the shuffle,
# and therefore the train/test membership and every downstream metric,
# reproducible across kernel restarts.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## Step-4 Data preprocessing

In [7]:
import torch

# Load the tokenizer for the "hfl/rbt3" checkpoint; it is used by
# process_function and by the padding collator handed to the Trainer.
# NOTE(review): the model is loaded from "./download_models/hfl/rbt3" while the
# tokenizer comes from "hfl/rbt3" - confirm both refer to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def process_function(examples):
    """Tokenize a batch of reviews and carry their labels along.

    Args:
        examples: A batch mapping with a "review" entry (the texts) and a
            "label" entry (the targets), as supplied by ``map(batched=True)``.

    Returns:
        The tokenizer output for the reviews, truncated to at most 128 tokens,
        with the batch labels stored under "label".
    """
    encoded = tokenizer(examples["review"], max_length=128, truncation=True)
    encoded["label"] = examples["label"]
    return encoded

# Tokenize both splits in batches. The original columns are removed so that
# only the fields returned by process_function remain.
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
tokenized_datasets

Map: 100%|██████████| 6988/6988 [00:00<00:00, 21894.26 examples/s]
Map: 100%|██████████| 777/777 [00:00<00:00, 22938.25 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 777
    })
})

## Step-5 Create Model

In [38]:
from torch.optim import AdamW 

# Load the checkpoint from the local download directory with a
# sequence-classification head. The classifier weights are newly initialized
# (see the warning printed below), so the model must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained("./download_models/hfl/rbt3")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./download_models/hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step-6 Create evaluation function

In [39]:
import evaluate

# Load the two metrics used during evaluation: accuracy first, then F1.
acc_metrics, f1_metrics = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "f1")
)

In [40]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: A pair ``(predictions, labels)``; the predicted class is
            taken as the argmax over the last axis of ``predictions``.

    Returns:
        The accuracy result dict, extended in place with the F1 result.
    """
    scores, labels = eval_predict
    predicted_ids = scores.argmax(axis=-1)
    results = acc_metrics.compute(predictions=predicted_ids, references=labels)
    results.update(f1_metrics.compute(predictions=predicted_ids, references=labels))
    return results

## Step-7 Create TrainingArguments

In [41]:
train_args = TrainingArguments(
    # Where checkpoints are written, and how many are kept on disk.
    output_dir="./checkpoints",
    save_strategy="epoch",
    save_total_limit=3,
    # Evaluate once per epoch; log every 10 steps.
    evaluation_strategy="epoch",
    logging_steps=10,
    # Batch sizes per device.
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    # Optimization schedule.
    num_train_epochs=5,
    learning_rate=1e-5,
    warmup_ratio=0.2,
    weight_decay=0.01,
    # After training, reload the checkpoint that scored best on "f1".
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)
train_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=IntervalStrategy.EPOCH,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=True,
group_by_length=False,
half_precision_backend=auto,
hub_always_pu

## Step-8 Create Trainer

In [42]:
from transformers import DataCollatorWithPadding

# Wire the model, training configuration, data splits, padding collator and
# metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=eval_metric,
)
                  

## Step-9 Train


In [43]:
# Run fine-tuning as configured by train_args; the table below reports the
# training loss, validation loss, accuracy and F1 for each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4167,0.396978,0.827542,0.885274
2,0.2878,0.31108,0.8713,0.913194
3,0.2629,0.268339,0.882883,0.917348
4,0.2493,0.256159,0.889318,0.921676
5,0.2259,0.255836,0.891892,0.923775


TrainOutput(global_step=550, training_loss=0.32337785980918193, metrics={'train_runtime': 50.8347, 'train_samples_per_second': 687.326, 'train_steps_per_second': 10.819, 'total_flos': 586516556605440.0, 'train_loss': 0.32337785980918193, 'epoch': 5.0})

## Step-10 Model evaluation

In [44]:
# Score the trained model on the held-out test split; the result includes the
# loss plus the accuracy and F1 values produced by eval_metric.
trainer.evaluate(tokenized_datasets["test"])

{'eval_loss': 0.2558358907699585,
 'eval_accuracy': 0.8918918918918919,
 'eval_f1': 0.9237749546279491,
 'eval_runtime': 0.4024,
 'eval_samples_per_second': 1930.823,
 'eval_steps_per_second': 17.395,
 'epoch': 5.0}

## Step-11 Model prediction

In [20]:
# Get the raw per-class scores and the label ids for the test split.
# NOTE(review): this displays the full arrays in the cell output; consider
# showing only a slice to keep the notebook readable.
trainer.predict(tokenized_datasets["test"])

PredictionOutput(predictions=array([[-3.6362956 ,  4.388077  ],
       [-3.6758442 ,  4.1893888 ],
       [ 2.877014  , -2.6338015 ],
       ...,
       [ 0.18371171, -0.5618945 ],
       [ 2.5922453 , -2.2553546 ],
       [-3.6442547 ,  4.274722  ]], dtype=float32), label_ids=array([1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0

## Step-12 Save and load

In [13]:
# Persist the fine-tuned weights together with the tokenizer files so that the
# saved directory is self-contained and can be loaded for inference on its own.
model.save_pretrained("./save_models/rbt3")
tokenizer.save_pretrained("./save_models/rbt3")

# Reload from disk to confirm that the saved checkpoint is usable.
model = AutoModelForSequenceClassification.from_pretrained("./save_models/rbt3")