# 初始化环境

In [1]:
import os

os.environ['http_proxy'] = ''
os.environ['https_proxy'] = ''
os.environ['HF_HOME'] = '/root/onethingai-fs/models'
os.environ['HF_HUB_CACHE'] = '/root/onethingai-fs/models/hub'
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

In [2]:
import random
import pandas as pd
import datasets
from IPython.display import display, HTML

# 数据集抽样
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [3]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [4]:
# 训练过程指标评估
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# 文本分类模型训练

## 下载数据集

In [12]:
from datasets import load_dataset

dataset = load_dataset("yelp_review_full")

In [None]:
dataset

In [None]:
show_random_elements(dataset["train"])

## 预处理数据

In [14]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
show_random_elements(tokenized_datasets["train"], num_examples = 3)

In [None]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

## 加载模型

In [6]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 配置训练参数

In [8]:
from transformers import TrainingArguments

model_dir = "/root/onethingai-fs/train/models/bert-base-cased-finetune-yelp"

# logging_steps 默认值为500，根据我们的训练数据和步长，将其设置为100
training_args = TrainingArguments(output_dir=model_dir,
                                  evaluation_strategy="epoch", 
                                  per_device_train_batch_size=16,
                                  num_train_epochs=5,
                                  logging_steps=100)


In [9]:
print(training_args)

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_

## 开始训练

### 实例化Trainer

In [20]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir=model_dir,
                                  evaluation_strategy="epoch", # epoch结束时，汇报评估指标
                                  per_device_train_batch_size=16,
                                  num_train_epochs=3,
                                  logging_steps=30)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

### 小数据集，训练3轮

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.3372,1.130435,0.529
2,0.9325,0.963451,0.577
3,0.6701,0.959051,0.615


TrainOutput(global_step=189, training_loss=1.0179922517645297, metrics={'train_runtime': 58.2841, 'train_samples_per_second': 51.472, 'train_steps_per_second': 3.243, 'total_flos': 789354427392000.0, 'train_loss': 1.0179922517645297, 'epoch': 3.0})

In [23]:
trainer.evaluate(small_eval_dataset)

{'eval_loss': 0.9590511322021484,
 'eval_accuracy': 0.615,
 'eval_runtime': 5.078,
 'eval_samples_per_second': 196.929,
 'eval_steps_per_second': 24.616,
 'epoch': 3.0}

### 小数据集，训练5轮

In [24]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir=model_dir,
                                  evaluation_strategy="epoch", # epoch结束时，汇报评估指标
                                  per_device_train_batch_size=16,
                                  num_train_epochs=5,
                                  logging_steps=100)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.evaluate(small_eval_dataset)

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.189069,0.558
2,0.618400,1.115942,0.597
3,0.618400,1.461524,0.579
4,0.297400,1.546146,0.608
5,0.098200,1.717108,0.607


{'eval_loss': 1.7171076536178589,
 'eval_accuracy': 0.607,
 'eval_runtime': 5.0955,
 'eval_samples_per_second': 196.25,
 'eval_steps_per_second': 24.531,
 'epoch': 5.0}

### 全量数据训练

In [20]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir=model_dir,
                                  evaluation_strategy="epoch", # epoch结束时，汇报评估指标
                                  per_device_train_batch_size=16,
                                  num_train_epochs=5,
                                  logging_steps=100,
                                  # save_strategy = 'epoch',
                                  save_total_limit = 5,
                                  # load_best_model_at_end = True,
                                  resume_from_checkpoint = True)

full_train_dataset = tokenized_datasets["train"].shuffle(seed=50).select(range(600000))
full_test_dataset = tokenized_datasets["train"].shuffle(seed=50).select(range(600000, 650000))

full_eval_dataset = tokenized_datasets["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_eval_dataset,
    compute_metrics=compute_metrics,
)


In [21]:
trainer.train()

trainer.evaluate(full_eval_dataset)

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7931,0.79575,0.65336
2,0.7482,0.754192,0.67406
3,0.66,0.761339,0.67854
4,0.5864,0.782943,0.68092
5,0.5327,0.853714,0.67784


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter serve

{'eval_loss': 0.8537139296531677,
 'eval_accuracy': 0.67784,
 'eval_runtime': 252.2139,
 'eval_samples_per_second': 198.244,
 'eval_steps_per_second': 24.781,
 'epoch': 5.0}

In [35]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [36]:
full_train_dataset

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 600000
})

In [37]:
full_test_dataset

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 50000
})

In [38]:
full_eval_dataset

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 50000
})