In [1]:
import numpy as np
import pandas as pd
import torch
import csv
import transformers
import torch.nn as nn
import torch.utils.data as Data
import torch.nn.functional as F
import json
from torch.optim import AdamW
from transformers import AutoTokenizer,AutoModelForSequenceClassification, AutoConfig, get_linear_schedule_with_warmup
import warnings


In [2]:
warnings.filterwarnings('ignore')
device = 'cuda'
MODEL_NAME = "hfl/chinese-bert-wwm"
#MODEL_NAME = 'bert-base-chinese'
MAX_LEN = 32
EPOCHS = 5
BATCH_SIZE = 32 
LR = 5e-5 
WARMUP_STEP = 100

创建load_dataset function

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [4]:
def load_dataset(filepath, max_len):
    label = []
    sentences = []
    # load dataset
    f = open(filepath, 'r', encoding='utf-8')
    r = csv.reader(f)
    for item in r:
        if r.line_num == 1:
            continue
        label.append(int(item[0]))
        sentences.append(item[1])
        
    input_ids = []
    attention_masks = []

    # For every sentence...
    for data in sentences:
        encoded_data = tokenizer.encode_plus(
            text=data,                      # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=max_len,             # Max length to truncate/pad
            padding='max_length',           # Pad sentence to max length
            return_attention_mask=True,      # Return attention mask
            truncation= True
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_data.get('input_ids'))
        attention_masks.append(encoded_data.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    labels = torch.tensor(label)
    return input_ids, attention_masks, labels


In [5]:
def batch_accuracy(pre, label):
    pre = pre.argmax(dim=1)
    correct = torch.eq(pre, label).sum().float().item()
    accuracy = correct / float(len(label))

    return accuracy

In [6]:
import os
path = os.path.abspath(os.path.dirname(os.getcwd()))

load dataset

In [7]:
train_dataset = load_dataset(f'{path}/data/db2/train.csv', max_len = MAX_LEN)
valid_dataset = load_dataset(f'{path}/data/db2/dev.csv', max_len = MAX_LEN)
test_dataset = load_dataset(f'{path}/data/db2/test.csv', max_len = MAX_LEN)

In [8]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

train_data = TensorDataset(train_dataset[0], train_dataset[1],train_dataset[2])
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size = BATCH_SIZE)

val_data = TensorDataset(valid_dataset[0],valid_dataset[1],valid_dataset[2])
val_sampler = SequentialSampler(val_data)
val_loader = DataLoader(val_data,sampler=val_sampler, batch_size = BATCH_SIZE)

test_data = TensorDataset(test_dataset[0],test_dataset[1],test_dataset[2])


In [9]:
torch.cuda.is_available()

True

In [10]:
config = AutoConfig.from_pretrained(MODEL_NAME)
config.num_labels = 3
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
model.cuda()
print()

Some weights of the model checkpoint at hfl/chinese-bert-wwm were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint




In [11]:
optimizer = AdamW(model.parameters(), lr=LR)
total_steps =len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=WARMUP_STEP,num_training_steps=total_steps)

In [12]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [13]:
model.train()
print('开始训练...')

for epoch in range(EPOCHS):
    loss_t, batch_loss, batch_acc, batch_counts = 0, 0, 0, 0
    
    model.train()
    for step,batch in enumerate(train_loader):
        batch_counts +=1
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        outputs = model(b_input_ids, b_attn_mask, labels=b_labels)
        loss, logits = outputs[:2]
        optimizer.zero_grad()
        
        loss_t += loss.item()
        batch_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        scheduler.step()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        acc = batch_accuracy(logits, b_labels)
        batch_acc += acc
        
        if (step % 20 == 0 and step != 0) or (step == len(train_loader) - 1):
            print(f'epoch:{epoch} | step:{step} | avg_batch_acc:{batch_acc/batch_counts:^.6f} | avg_batch_loss:{batch_loss/batch_counts:^.6f}')
            batch_acc, batch_loss, batch_counts = 0, 0, 0
        
    avg_train_loss = loss_t / len(train_loader)
    
    #evaluate 
    val_acc, val_loss = [],[]
    model.eval()
    for batch in val_loader:
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(b_input_ids, b_attn_mask, labels=b_labels)
        loss, logits = outputs[:2]
        val_loss.append(loss.item())
        acc = batch_accuracy(logits, b_labels)
        val_acc.append(acc)
        
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_acc)
            
    print(f'epoch:{epoch} | avg_train_loss:{avg_train_loss} | val_loss:{val_loss} | val_accuracy:{val_accuracy}')
    
torch.save(model.state_dict(), 'bert_cla_db2.ckpt')
print('保存训练完成的model...')


开始训练...
epoch:0 | step:20 | avg_batch_acc:0.403274 | avg_batch_loss:1.096681
epoch:0 | step:40 | avg_batch_acc:0.650000 | avg_batch_loss:0.811226
epoch:0 | step:60 | avg_batch_acc:0.732812 | avg_batch_loss:0.665267
epoch:0 | step:80 | avg_batch_acc:0.826562 | avg_batch_loss:0.484480
epoch:0 | step:100 | avg_batch_acc:0.840625 | avg_batch_loss:0.429093
epoch:0 | step:120 | avg_batch_acc:0.851562 | avg_batch_loss:0.408485
epoch:0 | step:140 | avg_batch_acc:0.851562 | avg_batch_loss:0.374026
epoch:0 | step:160 | avg_batch_acc:0.856250 | avg_batch_loss:0.381271
epoch:0 | step:180 | avg_batch_acc:0.853125 | avg_batch_loss:0.413027
epoch:0 | step:200 | avg_batch_acc:0.878125 | avg_batch_loss:0.369436
epoch:0 | step:220 | avg_batch_acc:0.862500 | avg_batch_loss:0.392539
epoch:0 | step:240 | avg_batch_acc:0.848437 | avg_batch_loss:0.375778
epoch:0 | step:260 | avg_batch_acc:0.884375 | avg_batch_loss:0.319005
epoch:0 | step:280 | avg_batch_acc:0.851562 | avg_batch_loss:0.345227
epoch:0 | step:3

In [14]:
print('开始加载训练完成的model...')
model.load_state_dict(torch.load('bert_cla_db2.ckpt'))

开始加载训练完成的model...


<All keys matched successfully>

In [15]:
print('开始测试...')
model.eval()
test_result = []
for data in test_data:
    b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in data)
    b_input = b_input_ids.unsqueeze(0)
    
    with torch.no_grad():
        outputs = model(b_input)
        pre = outputs.logits.argmax(dim=1)
        test_result.append([b_labels.item(), pre.item(), tokenizer.convert_ids_to_tokens(b_input_ids)])

# 写入csv文件
df = pd.DataFrame(test_result)
df.to_csv('test_result_db2.csv',index=False, header=['id', 'label','text'])

开始测试...


In [16]:
import pandas as pd
df = pd.read_csv('test_result_db2.csv')
df[df.id!=df.label]

Unnamed: 0,id,label,text
3,2,0,"['[CLS]', '我', '申', '请', '了', '大', '同', '飞', '..."
8,0,2,"['[CLS]', '没', '答', '应', '[UNK]', '[SEP]', '[P..."
12,2,1,"['[CLS]', '那', '你', '就', '不', '要', '出', '事', '..."
15,1,0,"['[CLS]', '免', '费', '领', '1g', '[SEP]', '[PAD]..."
24,2,0,"['[CLS]', '我', '没', '有', '这', '么', '多', '的', '..."
...,...,...,...
15948,1,0,"['[CLS]', '更', '多', '的', '优', '惠', '[SEP]', '[..."
15964,2,0,"['[CLS]', '你', '是', '谁', '不', '重', '要', '[SEP]..."
15966,2,0,"['[CLS]', '操', '你', '可', '以', '吗', '[SEP]', '[..."
15974,1,0,"['[CLS]', '真', '不', '容', '易', '勇', '敢', '勇', '..."


In [17]:
df

Unnamed: 0,id,label,text
0,1,1,"['[CLS]', '很', '惊', '叹', '不', '用', '谢', '谢', '..."
1,2,2,"['[CLS]', '你', '好', '我', '购', '票', '失', '败', '..."
2,0,0,"['[CLS]', '好', '多', '次', '这', '样', '了', '[SEP]..."
3,2,0,"['[CLS]', '我', '申', '请', '了', '大', '同', '飞', '..."
4,1,1,"['[CLS]', '这', '个', '有', '效', '的', '吧', '[SEP]..."
...,...,...,...
15995,0,0,"['[CLS]', '除', '了', '用', '身', '份', '证', '还', '..."
15996,1,1,"['[CLS]', '我', '到', '了', '亲', '爱', '的', '第', '..."
15997,0,0,"['[CLS]', '山', '家', '庭', '宽', '带', '20', '##m'..."
15998,0,0,"['[CLS]', '送', '票', '最', '起', '码', '应', '该', '..."


In [18]:
df[df.id!=df.label][df.label ==0]

Unnamed: 0,id,label,text
3,2,0,"['[CLS]', '我', '申', '请', '了', '大', '同', '飞', '..."
15,1,0,"['[CLS]', '免', '费', '领', '1g', '[SEP]', '[PAD]..."
24,2,0,"['[CLS]', '我', '没', '有', '这', '么', '多', '的', '..."
32,2,0,"['[CLS]', '不', '开', '了', '可', '以', '吧', '[SEP]..."
33,2,0,"['[CLS]', '一', '时', '可', '以', '查', '一', '时', '..."
...,...,...,...
15948,1,0,"['[CLS]', '更', '多', '的', '优', '惠', '[SEP]', '[..."
15964,2,0,"['[CLS]', '你', '是', '谁', '不', '重', '要', '[SEP]..."
15966,2,0,"['[CLS]', '操', '你', '可', '以', '吗', '[SEP]', '[..."
15974,1,0,"['[CLS]', '真', '不', '容', '易', '勇', '敢', '勇', '..."


In [19]:
len(df[df.id==df.label])/len(df)

0.8435