In [1]:
import os
# Point the http_proxy / https_proxy environment variables of this process at
# the proxy address 127.0.0.1:8889.
os.environ.update(
    {
        "http_proxy": "http://127.0.0.1:8889",
        "https_proxy": "http://127.0.0.1:8889",
    }
)

## Step-1 Import

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


## Step-2 Load dataset

In [3]:
from datasets import load_dataset

# Read the full "train" split of the CSV, then keep only the rows whose
# 'review' field is present.
dataset = load_dataset(
    "csv",
    data_files="./datasets/ChnSentiCorp_htl_all.csv",
    split="train[:100%]",
).filter(lambda row: row['review'] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## Step-3 Split dataset

In [4]:
# Hold out 10% of the rows as the test split. The fixed seed pins the shuffle
# so the same rows are held out on every run and the reported metrics stay
# comparable between runs.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## Step-4 Create DataLoader

In [5]:
import torch

# Load the tokenizer for the "hfl/rbt3" checkpoint name.
# NOTE(review): the model in Step-5 is loaded from "./download_models/hfl/rbt3";
# confirm that both names refer to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def process_function(examples):
    """Tokenize a batch of reviews and carry their labels through.

    Args:
        examples: Batch mapping with a "review" entry (the texts to tokenize)
            and a "label" entry.

    Returns:
        The tokenizer output for the reviews, truncated to at most 128 tokens,
        with the batch's labels stored under "label".
    """
    encoded = tokenizer(examples["review"], max_length=128, truncation=True)
    encoded["label"] = examples["label"]
    return encoded

# Apply process_function to both splits in batches. The columns of the raw
# train split are removed, so only the fields returned by process_function
# remain.
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
tokenized_datasets

Map: 100%|██████████| 6988/6988 [00:00<00:00, 23311.75 examples/s]
Map: 100%|██████████| 777/777 [00:00<00:00, 21493.65 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 777
    })
})

In [6]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

train_set = tokenized_datasets["train"]
valid_set = tokenized_datasets["test"]

# Training batches (32 examples) are shuffled; validation batches (64 examples)
# keep the dataset order. Each loader collates its batches with a padding
# collator built from the shared tokenizer.
train_loader = DataLoader(
    train_set,
    batch_size=32,
    shuffle=True,
    collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
)
valide_loader = DataLoader(
    valid_set,
    batch_size=64,
    shuffle=False,
    collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
)

In [7]:
# Peek at the first collated batch produced by the training loader.
next(iter(train_loader))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[ 101,  124, 3299,  ...,    0,    0,    0],
        [ 101, 2523, 3209,  ..., 2145, 8024,  102],
        [ 101, 2769, 3221,  ..., 7478, 2382,  102],
        ...,
        [ 101, 2945, 6432,  ..., 7946, 5682,  102],
        [ 101, 2347, 5307,  ...,  763,  511,  102],
        [ 101, 7357, 3191,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
        0, 1, 1, 1, 1, 0, 1, 0])}

## Step-5 Create Model and Optimizer

In [8]:
from torch.optim import AdamW

# Load the classification model from the local checkpoint directory and move
# it to the GPU when CUDA is available; otherwise it is left where it was loaded.
model = AutoModelForSequenceClassification.from_pretrained("./download_models/hfl/rbt3")
model = model.cuda() if torch.cuda.is_available() else model

optimizer = AdamW(params=model.parameters(), lr=2e-5)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./download_models/hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step-6 Train and Evaluate

In [9]:
# Import the metrics library under an alias. The evaluation helper defined in
# the next cell is also called `evaluate`; with a plain `import evaluate` the
# two share one global name, so re-running this cell after that one would
# rebind `evaluate` to the module and make train() fail when it calls it.
import evaluate as hf_evaluate

clf_metrics = hf_evaluate.combine(["accuracy", "f1"])

In [10]:
def evaluate():
    """Score the global model on the validation loader.

    Switches the model to eval mode, runs it over every batch of
    ``valide_loader`` under ``torch.inference_mode()``, and feeds the argmax
    predictions together with the reference labels into ``clf_metrics``.

    Returns:
        The result of ``clf_metrics.compute()`` over the accumulated batches.
    """
    model.eval()
    # Follow the model's actual placement instead of CUDA availability, so a
    # model that sits on the CPU of a CUDA-capable machine still receives
    # inputs on its own device.
    device = next(model.parameters()).device
    with torch.inference_mode():
        for batch in valide_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            output = model(**batch)
            pred = torch.argmax(output.logits, dim=1)
            clf_metrics.add_batch(predictions=pred.long(), references=batch["labels"].long())
    return clf_metrics.compute()
        
def train(epoch=5, log_step=100):
    """Train the global model and evaluate it after every epoch.

    Args:
        epoch: Number of passes over ``train_loader``.
        log_step: The training loss is printed whenever the global step count
            is a multiple of this value (including step 0).

    Each batch goes through zero_grad -> forward -> backward -> optimizer step.
    After the last batch of an epoch, ``evaluate()`` is called and its result
    is printed.
    """
    # Follow the model's actual placement instead of CUDA availability, so the
    # batches always land on the same device as the weights.
    device = next(model.parameters()).device
    global_step = 0
    for ep in range(epoch):
        # evaluate() leaves the model in eval mode, so re-enable training mode
        # at the start of every epoch.
        model.train()
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()
            if global_step % log_step == 0:
                print(f"epoch {ep} global step {global_step} loss {output.loss.item()}")
            global_step += 1

        clf = evaluate()
        print(f"epoch {ep}, {clf}")

## Step-7 Train

In [11]:
train()

epoch 0 global step 0 loss 0.8221923112869263
epoch 0 global step 100 loss 0.31136125326156616
epoch 0 global step 200 loss 0.3812849819660187
epoch 0, {'accuracy': 0.8815958815958816, 'f1': 0.9089108910891089}
epoch 1 global step 300 loss 0.20851585268974304
epoch 1 global step 400 loss 0.3075603246688843
epoch 1, {'accuracy': 0.8854568854568855, 'f1': 0.9107321965897694}
epoch 2 global step 500 loss 0.14233465492725372
epoch 2 global step 600 loss 0.15816275775432587
epoch 2, {'accuracy': 0.8944658944658944, 'f1': 0.9214559386973179}
epoch 3 global step 700 loss 0.15969210863113403
epoch 3 global step 800 loss 0.06431762874126434
epoch 3, {'accuracy': 0.8893178893178894, 'f1': 0.9166666666666666}
epoch 4 global step 900 loss 0.07480435073375702
epoch 4 global step 1000 loss 0.016924355179071426
epoch 4, {'accuracy': 0.8931788931788932, 'f1': 0.9198067632850243}



## Step-8 Test

In [12]:
sen = "这家酒店的环境非常好，价格也便宜，值得推荐"

id2label = {0: "neg", 1: "pos"}

# Set eval mode explicitly so the prediction does not depend on whichever mode
# an earlier cell left the model in (e.g. training mode with dropout active).
model.eval()
# Send the inputs to wherever the model's weights live, rather than assuming
# the model is on the GPU whenever CUDA is available.
model_device = next(model.parameters()).device

with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    inputs = {k: v.to(model_device) for k, v in inputs.items()}
    output = model(**inputs)
    pred = torch.argmax(output.logits, dim=-1)
    print(id2label[pred.item()])
    

pos


In [14]:
from transformers import pipeline

# Update both label maps on the config together. Setting only id2label would
# leave label2id with its previous entries, and the two maps would then
# disagree in the pipeline and in any checkpoint saved from this model.
model.config.id2label = id2label
model.config.label2id = {label: idx for idx, label in id2label.items()}
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

pipe(sen)

[{'label': 'pos', 'score': 0.9993993043899536}]

## Step-9 Save and load

In [13]:
# Save the tokenizer files next to the model weights so the output directory
# can be loaded on its own, without going back to the original checkpoint for
# the tokenizer.
model.save_pretrained("./save_models/rbt3")
tokenizer.save_pretrained("./save_models/rbt3")

model = AutoModelForSequenceClassification.from_pretrained("./save_models/rbt3")