  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
from transformers import BertModel, BertTokenizer

In [None]:
#bert模型文件下载地址 https://huggingface.co/bert-base-chinese

In [2]:
# 读取模型对应的tokenizer
tokenizer = BertTokenizer.from_pretrained(r'E:\code\bert-base-chinese')
# 载入模型
model = BertModel.from_pretrained(r'E:\code\bert-base-chinese')

In [3]:
# 输入文本
input_text = "我爱北京天安门"
# 通过tokenizer把文本变成 token_id
input_ids = tokenizer.encode(input_text, add_special_tokens=True)
print(input_ids)

[101, 2769, 4263, 1266, 776, 1921, 2128, 7305, 102]


In [4]:
input_ids = torch.tensor([input_ids])
# 获得BERT模型最后一个隐层结果
with torch.no_grad():
    last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples
print(last_hidden_states.shape)

torch.Size([1, 9, 768])


In [1]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch
from transformers import BertForSequenceClassification, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#1、加载数据
train_df = pd.read_csv('dadata.csv', encoding='utf-8', header=None, names=['label','review'])
print(train_df.shape)

sentences = list(train_df['review'][1:])
label =train_df['label'][1:].values

(1243, 2)


In [3]:
totallength = 0
for s in sentences:
    totallength += len(s)
print("avg length:", totallength / len(sentences))

avg length: 25.79790660225443


In [4]:
from collections import Counter
c = Counter(label)
print (dict(c))

{'0': 1120, '1': 122}


In [5]:
#2 token encodding
model_path = r'E:\code\bert-base-chinese'
tokenizer=BertTokenizer.from_pretrained(model_path)
max_length=32
#sentences_tokened=tokenizer(sentences,padding=True,truncation=True,max_length=max_length,return_tensors='pt')
sentences_tokened=tokenizer(sentences,padding=True,truncation=True,max_length=max_length, return_tensors='pt')
label=torch.tensor(label.astype(np.int64))

In [6]:
#3 encoding data
from torch.utils.data import Dataset,DataLoader,random_split

class DataToDataset(Dataset):
    def __init__(self,encoding,labels):
        self.encoding=encoding
        self.labels=labels
        
    def __len__(self):
        return len(self.labels)
        
    def __getitem__(self,index):
        return self.encoding['input_ids'][index],self.encoding['attention_mask'][index],self.labels[index]

#封装数据
datasets=DataToDataset(sentences_tokened,label)
train_size=int(len(datasets)*0.8)
test_size=len(datasets)-train_size
print([train_size,test_size])
train_dataset,val_dataset=random_split(dataset=datasets,lengths=[train_size,test_size])

[993, 249]


In [7]:
BATCH_SIZE=16
#这里的num_workers要大于0
train_loader=DataLoader(dataset=train_dataset,batch_size=BATCH_SIZE,shuffle=True,num_workers=0)
val_loader=DataLoader(dataset=val_dataset,batch_size=BATCH_SIZE,shuffle=True,num_workers=0)#

In [12]:
#4、create model
class BertTextClassficationModel(nn.Module):
    def __init__(self):
        super(BertTextClassficationModel,self).__init__()
        self.bert=BertForSequenceClassification.from_pretrained(model_path, num_labels=2)
        
    def forward(self,ids,mask):
        out = self.bert(input_ids=ids,attention_mask=mask)
        return out[0]


mymodel=BertTextClassficationModel()


#获取gpu和cpu的设备信息
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device=",device)
if torch.cuda.device_count()>1:
    print("Let's use ",torch.cuda.device_count(),"GPUs!")
    mymodel=nn.DataParallel(mymodel)
mymodel.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at E:\code\bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device= cuda


BertTextClassficationModel(
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(21128, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=7

In [13]:
#5、train model
loss_func=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(mymodel.parameters(),lr=0.0001)

from sklearn.metrics import accuracy_score
def flat_accuracy(preds,labels):
    pred_flat=np.argmax(preds,axis=1).flatten()
    labels_flat=labels.flatten()
    return accuracy_score(labels_flat,pred_flat)

epochs=3
for epoch in range(epochs):
    train_loss = 0.0
    train_acc=0.0
    for i,data in enumerate(train_loader):
        input_ids,attention_mask,labels=[elem.to(device) for elem in data]
        #优化器置零
        optimizer.zero_grad()
        #得到模型的结果
        out=mymodel(input_ids.long(),attention_mask)
        #计算误差
        loss=loss_func(out,labels)
        train_loss += loss.item()
        #误差反向传播
        loss.backward()
        #更新模型参数
        optimizer.step()
        #计算acc 
        #out=out.detach().numpy()
        out=out.detach().cpu().numpy()
        #labels=labels.detach().numpy()
        labels=labels.detach().cpu().numpy()
        train_acc+=flat_accuracy(out,labels)
        if (i + 1) % 10 == 0:
                print("train %d/%d epochs Batch %d Loss:%f, Acc:%f" %(epoch+1,epochs, (i+1), train_loss/(i+1),train_acc/(i+1)))
    print("train %d/%d epochs Loss:%f, Acc:%f" %(epoch+1,epochs,train_loss/(i+1),train_acc/(i+1)))

train 1/3 epochs Batch 10 Loss:0.280510, Acc:0.893750
train 1/3 epochs Batch 20 Loss:0.204388, Acc:0.921875
train 1/3 epochs Batch 30 Loss:0.188635, Acc:0.933333
train 1/3 epochs Batch 40 Loss:0.159935, Acc:0.946875
train 1/3 epochs Batch 50 Loss:0.147289, Acc:0.951250
train 1/3 epochs Batch 60 Loss:0.146746, Acc:0.948958
train 1/3 epochs Loss:0.141111, Acc:0.950397
train 2/3 epochs Batch 10 Loss:0.064249, Acc:0.987500
train 2/3 epochs Batch 20 Loss:0.038335, Acc:0.993750
train 2/3 epochs Batch 30 Loss:0.058966, Acc:0.985417
train 2/3 epochs Batch 40 Loss:0.071947, Acc:0.984375
train 2/3 epochs Batch 50 Loss:0.069812, Acc:0.985000
train 2/3 epochs Batch 60 Loss:0.071077, Acc:0.984375
train 2/3 epochs Loss:0.091210, Acc:0.979167
train 3/3 epochs Batch 10 Loss:0.223649, Acc:0.893750
train 3/3 epochs Batch 20 Loss:0.144503, Acc:0.943750
train 3/3 epochs Batch 30 Loss:0.114274, Acc:0.958333
train 3/3 epochs Batch 40 Loss:0.095458, Acc:0.967187
train 3/3 epochs Batch 50 Loss:0.079293, Acc:0

In [15]:
#6、evaluate
from sklearn import metrics

print("evaluate...")
pred_list = []
y_list = []
mymodel.eval()
for j,batch in enumerate(val_loader):
    val_input_ids,val_attention_mask,val_labels=[elem.to(device) for elem in batch]
    with torch.no_grad():
        pred=mymodel(val_input_ids,val_attention_mask)
        pred=pred.detach().cpu().numpy()
        pred_flat=np.argmax(pred,axis=1).flatten()
        pred_list.extend(pred_flat)
        val_labels=val_labels.detach().cpu().numpy()
        y_list.extend(val_labels)

classify_report = metrics.classification_report(pred_list, y_list, digits=4) #分类报告 support测试集样本数
print(classify_report) 
confusion_matrix = metrics.confusion_matrix(pred_list, y_list) #混淆矩阵
print(confusion_matrix) 

evaluate...
              precision    recall  f1-score   support

           0     1.0000    0.9825    0.9912       228
           1     0.8400    1.0000    0.9130        21

    accuracy                         0.9839       249
   macro avg     0.9200    0.9912    0.9521       249
weighted avg     0.9865    0.9839    0.9846       249

[[224   4]
 [  0  21]]
