In [1]:
# 设置Jupyter Notebook代理
import subprocess
import os

# Source /etc/network_turbo in a bash subprocess and copy the proxy-related
# variables it prints into this kernel's environment.
# `source` is a bash builtin, so bash is invoked directly with an argv list
# rather than being nested inside a second shell via shell=True.
result = subprocess.run(
    ["bash", "-c", "source /etc/network_turbo && env | grep proxy"],
    capture_output=True,
    text=True,
)
if result.returncode != 0:
    # Non-zero exit: the script could not be sourced, or grep matched nothing.
    # Report it instead of silently continuing without a proxy.
    print(f"Proxy not configured (exit code {result.returncode}): {result.stderr.strip()}")
output = result.stdout
for line in output.splitlines():
    # Split on the first '=' only, so values that contain '=' stay intact.
    var, sep, value = line.partition('=')
    if sep:
        os.environ[var] = value


# NOTE(review): HF_HOME is normally the Hugging Face cache root, with the hub
# cache in its `hub/` subfolder. Pointing it at ".../hub/" presumably places
# model downloads under ".../hub/hub" - confirm before changing, since any
# existing cache would have to be moved.
os.environ['HF_HOME'] = '/root/autodl-tmp/cache/huggingface/hub/'

# 数据处理

## 使用Datasets下载开源数据集

In [2]:
from datasets import load_dataset

# Load the "yelp_review_full" dataset, using cache_dir as the local cache location.
dataset = load_dataset("yelp_review_full", cache_dir="/root/autodl-tmp/cache/huggingface/hub")

In [3]:
# Display the DatasetDict summary: available splits, features, and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [4]:
# Inspect one raw training record (index 111) to see its label and text fields.
dataset['train'][111]

{'label': 2,
 'text': "As far as Starbucks go, this is a pretty nice one.  The baristas are friendly and while I was here, a lot of regulars must have come in, because they bantered away with almost everyone.  The bathroom was clean and well maintained and the trash wasn't overflowing in the canisters around the store.  The pastries looked fresh, but I didn't partake.  The noise level was also at a nice working level - not too loud, music just barely audible.\\n\\nI do wish there was more seating.  It is nice that this location has a counter at the end of the bar for sole workers, but it doesn't replace more tables.  I'm sure this isn't as much of a problem in the summer when there's the space outside.\\n\\nThere was a treat receipt promo going on, but the barista didn't tell me about it, which I found odd.  Usually when they have promos like that going on, they ask everyone if they want their receipt to come back later in the day to claim whatever the offer is.  Today it was one of th

In [5]:
import random
import pandas as pd
import datasets
from IPython.display import display, HTML

In [6]:
def show_random_elements(dataset, num_examples=10):
    """Render a random selection of distinct rows from ``dataset`` as an HTML table.

    Columns whose feature type is ``datasets.ClassLabel`` are shown with their
    class names instead of integer ids.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to display. Must not exceed
            ``len(dataset)``.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
    """
    total = len(dataset)
    assert num_examples <= total, "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices directly. The previous retry-on-duplicate
    # loop needed more and more attempts as num_examples approached len(dataset).
    # A non-positive count still selects nothing, as before.
    picks = random.sample(range(total), num_examples) if num_examples > 0 else []

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Swap integer class ids for their human-readable names.
            names = typ.names
            df[column] = df[column].map(lambda idx: names[idx])
    display(HTML(df.to_html()))

In [7]:
# Preview two random training reviews, with labels shown as class names.
show_random_elements(dataset["train"], 2)

Unnamed: 0,label,text
0,1 star,"Service was horrific. Don't expect courteous bar tenders. They must not be working for tips because they couldn't give an ef less about any thing other than playing their own video poker or having a smoke break. You're more of an inconvenience to them than a customer. I'm referring specifically to the older woman and her daughter. Once again, horrific service! Blatantly bad."
1,1 star,"You get what you pay for is the best way to describe this place. We made the mistake of waiting until Memorial Day Weekend to get our mani/pedis and EVERYTHING was booked. We tried to push them to their creative limits which were just not what we were looking for unfortunately. The women working were plenty nice I just prefer a little more.....quirk. \n\nIt's been a week and my nails are chipping and have the little \""crackle\"" on the tips. I do the gel mani's so they typically last me for anywhere from 3-4 weeks! I'm disappointed, but not surprised. I will forever travel to Pink Polish in Chandler for my favorite girls to do my mani/pedis! :)"


## 预处理数据

In [8]:
from transformers import AutoTokenizer

# Load the tokenizer for the same "bert-base-cased" checkpoint that is fine-tuned later.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the module-level tokenizer.

    The tokenizer is asked to pad to ``"max_length"`` and to truncate, and its
    result is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Apply tokenize_function to every split; batched=True passes batches of examples per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [9]:
# Display the tokenized DatasetDict to check the columns added by tokenization.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [10]:
# Preview one random tokenized training example, including the tokenizer output columns.
show_random_elements(tokenized_datasets["train"], num_examples=1)

Unnamed: 0,label,text,input_ids,token_type_ids,attention_mask
0,1 star,"Horrible!! Brought my iPad 2 in with a broken screen. I called ahead of time and was told it would be $100 but when I came in I was told it would be $130. I should have walked out the door then but I didn't. I came the next day to pick it up and it had a dent in the corner where they pryed the screen off. When I went to take the screen protector off the entire glass lifted from the device. So, I left it there and came back two days later. They tried to fix the dent but the iPad no longer fits in my case. The menu button was turned sideways and there are gaps all around the edges between the glass and device. And worst of all, they expected me to pay full price. I asked for a discount and was given 10%. Unbelievable, I hate making a scene so I just paid it. But I will be posting this review anywhere I can. For the same price you can get MUCH better service elsewhere. STAY AWAY!!!!","[101, 9800, 27788, 106, 106, 139, 14929, 1204, 1139, 178, 24300, 123, 1107, 1114, 170, 3088, 3251, 119, 146, 1270, 3075, 1104, 1159, 1105, 1108, 1500, 1122, 1156, 1129, 109, 1620, 1133, 1165, 146, 1338, 1107, 146, 1108, 1500, 1122, 1156, 1129, 109, 7029, 119, 146, 1431, 1138, 2045, 1149, 1103, 1442, 1173, 1133, 146, 1238, 112, 189, 119, 146, 1338, 1103, 1397, 1285, 1106, 3368, 1122, 1146, 1105, 1122, 1125, 170, 10552, 1204, 1107, 1103, 2655, 1187, 1152, 185, 1616, 1174, 1103, 3251, 1228, 119, 1332, 146, 1355, 1106, 1321, 1103, 3251, 23476, 1228, 1103, 2072, 2525, 3358, 1121, ...]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...]"


## 数据抽样

In [11]:
# Fixed-seed 1,000-example subsets of each split, for quick experiments.
small_train_dataset = (
    tokenized_datasets["train"]
    .shuffle(seed=42)
    .select(range(1000))
)
small_eval_dataset = (
    tokenized_datasets["test"]
    .shuffle(seed=42)
    .select(range(1000))
)

# The complete train and test splits.
total_train_dataset, total_eval_dataset = (
    tokenized_datasets["train"],
    tokenized_datasets["test"],
)

# 微调训练配置

## 加载 BERT 模型

In [12]:
from transformers import AutoModelForSequenceClassification

# Load "bert-base-cased" for sequence classification with 5 output labels.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 训练超参数

In [13]:
from transformers import TrainingArguments

model_dir = "/root/autodl-tmp/models/bert-base-cased-finetune-yelp"

# logging_steps defaults to 500; it is set to 100 here to suit the size of
# the training data and the number of steps.
training_args = TrainingArguments(
    output_dir=model_dir,
    per_device_train_batch_size=16,
    num_train_epochs=5,
    logging_steps=100,
)

In [14]:
# Print the complete hyperparameter configuration.
print(training_args)

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=

## 训练过程中的指标评估

In [15]:
import numpy as np
import evaluate

# Load the "accuracy" metric; compute_metrics calls it during evaluation.
metric = evaluate.load("accuracy")

In [16]:
def compute_metrics(eval_pred):
    """Score predictions with the module-level ``metric``.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class for
    each row is the argmax of the logits over the last axis, and the result of
    ``metric.compute`` is returned unchanged.
    """
    scores, gold = eval_pred
    return metric.compute(
        references=gold,
        predictions=np.argmax(scores, axis=-1),
    )

## 训练过程指标监控

In [17]:
from transformers import TrainingArguments, Trainer

# Rebuild the training arguments with evaluation at the end of every epoch.
# This replaces the earlier training_args, here with 3 epochs and logging
# every 30 steps.
training_args = TrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    logging_steps=30,
)

# 开始训练

In [18]:
# Build the Trainer on the full train/test splits, with accuracy reported via compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=total_train_dataset, # alternative for a quick run: small_train_dataset
    eval_dataset=total_eval_dataset, # alternative for a quick run: small_eval_dataset
    compute_metrics=compute_metrics
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [19]:
# Resume from the newest checkpoint in the output directory when one exists,
# otherwise start training from scratch. Passing resume_from_checkpoint=True
# unconditionally raises an error on a first run, when no checkpoint exists yet.
from transformers.trainer_utils import get_last_checkpoint

checkpoint_dir = trainer.args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)

Epoch,Training Loss,Validation Loss,Accuracy
2,0.6672,0.76156,0.6827
3,0.618,0.731767,0.69208


TrainOutput(global_step=121875, training_loss=0.20778199127979768, metrics={'train_runtime': 17201.3966, 'train_samples_per_second': 113.363, 'train_steps_per_second': 7.085, 'total_flos': 5.130803778048e+17, 'train_loss': 0.20778199127979768, 'epoch': 3.0})

In [20]:
# A fixed-seed 100-example subset of the test split, and the full test split.
small_test_dataset = (
    tokenized_datasets["test"]
    .shuffle(seed=64)
    .select(range(100))
)
total_test_dataset = tokenized_datasets["test"]

In [21]:
# Evaluate on the full test split; small_test_dataset is the quick alternative.
# NOTE(review): total_test_dataset is the same split passed as eval_dataset during training.
trainer.evaluate(total_test_dataset) # small_test_dataset

{'eval_loss': 0.7317665815353394,
 'eval_accuracy': 0.69208,
 'eval_runtime': 467.1644,
 'eval_samples_per_second': 107.029,
 'eval_steps_per_second': 13.379,
 'epoch': 3.0}

## 保存模型和训练状态

In [22]:
# Save the fine-tuned model to model_dir.
trainer.save_model(model_dir)

In [23]:
# Save the Trainer's state (presumably into the output directory - confirm against the Trainer docs).
trainer.save_state()