In [5]:
import transformers
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification, AdamW
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
print(torch.__version__)
print(transformers.__version__)

# Hyperparameters
dropout_hidden, nums_label = 0.2, 2
lr, weight_decay = 1e-5, 1e-2
epochs, batch_size = 2, 16

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

# File paths (all relative to the sentiment data directory)
data_path = "./data/sentiment/"
vocab_file = f"{data_path}vocab.txt"
train_data_path = f"{data_path}sentiment.train.data"  # training split
valid_data_path = f"{data_path}sentiment.valid.data"  # validation split
print(train_data_path)

1.2.0
3.4.0
cuda:0
./data/sentiment/sentiment.train.data


In [6]:
# Read both splits as tab-separated files; the column names are supplied explicitly.
train_dataset, valid_dataset = (
    pd.read_csv(split_path, sep='\t', names=["text", "label"])
    for split_path in (train_data_path, valid_data_path)
)
# Show the first training sentence, then preview the first rows as the cell output.
print(train_dataset.loc[0, 'text'])
train_dataset.head()
# print(valid_dataset.head())

贝贝好爱干净 每天出门都要洗澡 还喜欢喝蒙牛 不喜欢蹲地方 喜欢坐凳子上还喜欢和我坐在一起~


Unnamed: 0,text,label
0,贝贝好爱干净 每天出门都要洗澡 还喜欢喝蒙牛 不喜欢蹲地方 喜欢坐凳子上还喜欢和我坐在一起~,1
1,感觉好像是文科生看一本《高等数学》的教材一样，流水账一般，只是背景很好罢了，选择在这样一个竞...,0
2,"很安静,隔音设施不错.服务员态度很好,下次还会选这里",1
3,1 感觉外观还可以，符合我的要求，体积虽不算小，但比它大的翻盖手机还是很多的。2 比一张IC...,1
4,收到后，包装完好。笔记本封条完好。 性价比很高，DVD驱动盘包含VISTA所有必备的驱动，方便。,1


In [7]:
# Dataset that serves (text, label) samples from a tab-separated file.
class SentimentDataset(Dataset):
    """Map-style dataset backed by a DataFrame with "text" and "label" columns."""

    def __init__(self, path):
        """Read `path` as a tab-separated file whose two columns are named text and label."""
        self.dataset = pd.read_csv(path, sep='\t', names=["text", "label"])

    def __len__(self):
        """Return the number of rows read from the file."""
        return self.dataset.shape[0]

    def __getitem__(self, idx):
        """Return row `idx` as a dict with the keys "text" and "label"."""
        return {field: self.dataset.loc[idx, field] for field in ("text", "label")}

# Wrap the training and validation splits in DataLoaders.
sentiment_train_dataset = SentimentDataset(train_data_path)
sentiment_train_dataloader = DataLoader(sentiment_train_dataset, batch_size=batch_size, shuffle=True)

# The validation set must be read from `valid_data_path`; loading the training file here
# would make the reported validation metrics a second measurement of the training data.
# Shuffling is unnecessary for evaluation, so the order is kept deterministic.
sentiment_valid_dataset = SentimentDataset(valid_data_path)
sentiment_valid_dataloader = DataLoader(sentiment_valid_dataset, batch_size=batch_size, shuffle=False)

# 测试数据的封装
# for i_batch, batch_data in enumerate(sentiment_train_dataloader):
#     print(i_batch)  # 打印batch编号
#     print(batch_data["text"])  # 打印该batch里面src
#     print(batch_data["label"])  # 打印该batch里面trg


In [8]:
# Load the tokenizer, config and sequence classifier from one local bert-base-chinese checkpoint.
# NOTE(review): this is an absolute, machine-specific path; consider moving it into the
# configuration cell next to `data_path`.
pretrained_path = "D://Desktop//机器学习//PTMs_finetuning//bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(pretrained_path)
# BertConfig's dropout field is `hidden_dropout_prob`. `dropout_hidden` is not a BertConfig
# field, so passing it under that name never reaches the model's dropout layers.
# `num_labels` sizes the classification head from the `nums_label` hyperparameter.
config = BertConfig.from_pretrained(
    pretrained_path,
    hidden_dropout_prob=dropout_hidden,
    num_labels=nums_label,
)
print(tokenizer)
model = BertForSequenceClassification.from_pretrained(pretrained_path, config=config)
model.to(device)
print(model.config.id2label)


# Optimizer and loss function.
# Parameters whose name contains one of these substrings are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.weight']


def _skips_weight_decay(param_name):
    """Return True when `param_name` contains any of the `no_decay` substrings."""
    return any(marker in param_name for marker in no_decay)


optimizer_grouped_parameters = [
    {
        'params': [param for name, param in model.named_parameters() if not _skips_weight_decay(name)],
        'weight_decay': weight_decay,
    },
    {
        'params': [param for name, param in model.named_parameters() if _skips_weight_decay(name)],
        'weight_decay': 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
criterion = nn.CrossEntropyLoss()

PreTrainedTokenizer(name_or_path='D://Desktop//机器学习//PTMs_finetuning//bert-base-chinese', vocab_size=21128, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


Some weights of the model checkpoint at D://Desktop//机器学习//PTMs_finetuning//bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not init

{0: 'LABEL_0', 1: 'LABEL_1'}


In [9]:
# 定义训练函数
import time

def train(model, dataloader, optimizer, criterion, device, log_every=10):
    """Run one training epoch over `dataloader` and return its mean loss and accuracy.

    Every batch must be a dict with a "text" entry (a list of strings) and a "label"
    entry (a tensor of class indices). Texts are encoded with the module-level
    `tokenizer`, truncated/padded to at most 100 tokens.

    Args:
        model: classifier called as ``model(**encoded, labels=label)``; index 1 of its
            output is read as the logits.
        dataloader: iterable of batches to train on.
        optimizer: optimizer stepped once per batch.
        criterion: loss function applied to ``(logits, labels)``.
        device: device the encoded inputs and the labels are moved to.
        log_every: print running statistics every this many batches (0/None disables).

    Returns:
        ``(mean_loss, accuracy)``: the loss averaged over batches and the fraction of
        correctly classified samples, both over the batches actually processed.
        ``(0.0, 0.0)`` when `dataloader` yields no batches.
    """
    model.train()
    epoch_loss = 0.0
    epoch_correct = 0
    samples_seen = 0
    batches_seen = 0
    window_start = time.time()
    # Iterate the loader that was passed in, not a module-level one.
    for i, batch in enumerate(dataloader):
        # Labels must live on the same device as the model and the encoded inputs.
        label = batch['label'].to(device)
        text = batch['text']
        tokenizer_text = tokenizer(text, max_length=100, truncation=True, padding=True, return_tensors="pt")
        tokenizer_text = tokenizer_text.to(device)

        optimizer.zero_grad()
        # With labels supplied the output is ordered (loss, logits, ...).
        output = model(**tokenizer_text, labels=label)
        pred_logit = output[1]
        pred_label = pred_logit.argmax(dim=1)

        # Take the class count from the logits instead of hard-coding it.
        loss = criterion(pred_logit.view(-1, pred_logit.size(-1)), label.view(-1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_correct += (pred_label == label.view(-1)).sum().item()
        # Count real samples so a smaller final batch does not skew the accuracy.
        samples_seen += label.numel()
        batches_seen = i + 1
        if log_every and batches_seen % log_every == 0:
            elapsed = time.time() - window_start
            print(f"batch:{i}", " | ", "current loss:", epoch_loss / batches_seen, " | ", "current acc:", epoch_correct / samples_seen, " | ", "sec/batch:", elapsed / log_every)
            window_start = time.time()
    if batches_seen == 0:
        return 0.0, 0.0
    return epoch_loss / batches_seen, epoch_correct / samples_seen

def evaludate(model, iterator, device):
    """Evaluate `model` on every batch of `iterator` and return mean loss and accuracy.

    Every batch must be a dict with a "text" entry (a list of strings) and a "label"
    entry (a tensor of class indices). Texts are encoded with the module-level
    `tokenizer`, truncated/padded to at most 100 tokens. Gradients are disabled.

    Args:
        model: classifier called as ``model(**encoded, labels=label)``; index 0 of its
            output is read as the loss and index 1 as the logits.
        iterator: iterable of batches to evaluate.
        device: device the encoded inputs and the labels are moved to.

    Returns:
        ``(mean_loss, accuracy)`` as Python floats: the loss averaged over batches and
        the fraction of correctly classified samples. ``(0.0, 0.0)`` when `iterator`
        yields no batches.
    """
    model.eval()
    epoch_loss = 0.0
    epoch_correct = 0
    samples_seen = 0
    batches_seen = 0
    with torch.no_grad():
        for batch in iterator:
            # Inputs and labels must be on the same device as the model.
            label = batch["label"].to(device)
            text = batch["text"]
            tokenizer_text = tokenizer(text, max_length=100, add_special_tokens=True, truncation=True, padding=True, return_tensors="pt")
            tokenizer_text = tokenizer_text.to(device)
            output = model(**tokenizer_text, labels=label)
            pred_label = output[1].argmax(dim=1)

            # .item() keeps the running total a plain float rather than a tensor.
            epoch_loss += output[0].item()
            epoch_correct += (pred_label == label.view(-1)).sum().item()
            samples_seen += label.numel()
            batches_seen += 1
    # Return only after the whole iterator has been consumed.
    if batches_seen == 0:
        return 0.0, 0.0
    return epoch_loss / batches_seen, epoch_correct / samples_seen
print("ok!")

ok!


#### cpu test
![image.png](attachment:573ed178-d6d6-46cd-a80e-c90eede2edbf.png)

In [12]:
# Train, then validate, once per epoch and report both metric pairs each time.
for epoch_idx in range(epochs):
    train_loss, train_acc = train(
        model, sentiment_train_dataloader, optimizer, criterion, device
    )
    print("train loss: ", train_loss, " | ", "train acc:", train_acc)

    valid_loss, valid_acc = evaludate(model, sentiment_valid_dataloader, device)
    print("valid loss: ", valid_loss, " | ", "valid acc:", valid_acc)

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 2.00 GiB total capacity; 1.37 GiB already allocated; 12.54 MiB free; 99.50 MiB cached)