In [1]:
import numpy as np
import pandas as pd
import torch
import csv
import transformers
import torch.nn as nn
import torch.utils.data as Data
import torch.nn.functional as F
import json
from torch.optim import AdamW
from transformers import AutoTokenizer,AutoModelForSequenceClassification, AutoConfig, get_linear_schedule_with_warmup
import warnings


In [2]:
warnings.filterwarnings('ignore')
device = 'cuda'
MODEL_NAME = "hfl/chinese-bert-wwm"
#MODEL_NAME = 'bert-base-chinese'
MAX_LEN = 32
EPOCHS = 5
BATCH_SIZE = 32 
LR = 5e-5 
WARMUP_STEP = 100

创建load_dataset function

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [4]:
def load_dataset(filepath, max_len):
    label = []
    sentences = []
    # load dataset
    f = open(filepath, 'r', encoding='utf-8')
    r = csv.reader(f)
    for item in r:
        if r.line_num == 1:
            continue
        label.append(int(item[0]))
        sentences.append(item[1])
        
    input_ids = []
    attention_masks = []

    # For every sentence...
    for data in sentences:
        encoded_data = tokenizer.encode_plus(
            text=data,                      # Preprocess sentence
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=max_len,             # Max length to truncate/pad
            padding='max_length',           # Pad sentence to max length
            return_attention_mask=True,      # Return attention mask
            truncation= True
            )
        
        # Add the outputs to the lists
        input_ids.append(encoded_data.get('input_ids'))
        attention_masks.append(encoded_data.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    labels = torch.tensor(label)
    return input_ids, attention_masks, labels


In [5]:
def batch_accuracy(pre, label):
    pre = pre.argmax(dim=1)
    correct = torch.eq(pre, label).sum().float().item()
    accuracy = correct / float(len(label))

    return accuracy

In [6]:
import os
path = os.path.abspath(os.path.dirname(os.getcwd()))

load dataset

In [7]:
train_dataset = load_dataset(f'{path}/data/train.csv', max_len = MAX_LEN)
valid_dataset = load_dataset(f'{path}/data/dev.csv', max_len = MAX_LEN)
test_dataset = load_dataset(f'{path}/data/test.csv', max_len = MAX_LEN)

In [8]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

train_data = TensorDataset(train_dataset[0], train_dataset[1],train_dataset[2])
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size = BATCH_SIZE)

val_data = TensorDataset(valid_dataset[0],valid_dataset[1],valid_dataset[2])
val_sampler = SequentialSampler(val_data)
val_loader = DataLoader(val_data,sampler=val_sampler, batch_size = BATCH_SIZE)

test_data = TensorDataset(test_dataset[0],test_dataset[1],test_dataset[2])


In [9]:
torch.cuda.is_available()

True

In [10]:
config = AutoConfig.from_pretrained(MODEL_NAME)
config.num_labels = 3
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
model.cuda()
print()

Some weights of the model checkpoint at hfl/chinese-bert-wwm were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint




In [11]:
optimizer = AdamW(model.parameters(), lr=LR)
total_steps =len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=WARMUP_STEP,num_training_steps=total_steps)

In [12]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [13]:
model.train()
print('开始训练...')

for epoch in range(EPOCHS):
    loss_t, batch_loss, batch_acc, batch_counts = 0, 0, 0, 0
    
    model.train()
    for step,batch in enumerate(train_loader):
        batch_counts +=1
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        outputs = model(b_input_ids, b_attn_mask, labels=b_labels)
        loss, logits = outputs[:2]
        optimizer.zero_grad()
        
        loss_t += loss.item()
        batch_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        scheduler.step()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        acc = batch_accuracy(logits, b_labels)
        batch_acc += acc
        
        if (step % 20 == 0 and step != 0) or (step == len(train_loader) - 1):
            print(f'epoch:{epoch} | step:{step} | avg_batch_acc:{batch_acc/batch_counts:^.6f} | avg_batch_loss:{batch_loss/batch_counts:^.6f}')
            batch_acc, batch_loss, batch_counts = 0, 0, 0
        
    avg_train_loss = loss_t / len(train_loader)
    
    #evaluate 
    val_acc, val_loss = [],[]
    model.eval()
    for batch in val_loader:
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(b_input_ids, b_attn_mask, labels=b_labels)
        loss, logits = outputs[:2]
        val_loss.append(loss.item())
        acc = batch_accuracy(logits, b_labels)
        val_acc.append(acc)
        
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_acc)
            
    print(f'epoch:{epoch} | avg_train_loss:{avg_train_loss} | val_loss:{val_loss} | val_accuracy:{val_accuracy}')
    
torch.save(model.state_dict(), 'bert_cla.ckpt')
print('保存训练完成的model...')


开始训练...
epoch:0 | step:20 | avg_batch_acc:0.678571 | avg_batch_loss:0.876083
epoch:0 | step:40 | avg_batch_acc:0.745313 | avg_batch_loss:0.728562
epoch:0 | step:60 | avg_batch_acc:0.762500 | avg_batch_loss:0.646523
epoch:0 | step:80 | avg_batch_acc:0.753125 | avg_batch_loss:0.615368
epoch:0 | step:100 | avg_batch_acc:0.803125 | avg_batch_loss:0.518772
epoch:0 | step:120 | avg_batch_acc:0.820312 | avg_batch_loss:0.457180
epoch:0 | step:140 | avg_batch_acc:0.829688 | avg_batch_loss:0.474986
epoch:0 | step:160 | avg_batch_acc:0.814063 | avg_batch_loss:0.496113
epoch:0 | step:180 | avg_batch_acc:0.832812 | avg_batch_loss:0.441473
epoch:0 | step:200 | avg_batch_acc:0.790625 | avg_batch_loss:0.493925
epoch:0 | step:220 | avg_batch_acc:0.798438 | avg_batch_loss:0.480164
epoch:0 | step:240 | avg_batch_acc:0.834375 | avg_batch_loss:0.401562
epoch:0 | step:260 | avg_batch_acc:0.823438 | avg_batch_loss:0.444360
epoch:0 | step:280 | avg_batch_acc:0.832812 | avg_batch_loss:0.423479
epoch:0 | step:3

In [14]:
print('开始加载训练完成的model...')
model.load_state_dict(torch.load('bert_cla.ckpt'))

开始加载训练完成的model...


<All keys matched successfully>

In [15]:
print('开始测试...')
model.eval()
test_result = []
for data in test_data:
    b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in data)
    b_input = b_input_ids.unsqueeze(0)
    
    with torch.no_grad():
        outputs = model(b_input)
        pre = outputs.logits.argmax(dim=1)
        test_result.append([b_labels.item(), pre.item(), tokenizer.convert_ids_to_tokens(b_input_ids)])

# 写入csv文件
df = pd.DataFrame(test_result)
df.to_csv('test_result.csv',index=False, header=['id', 'label','text'])

开始测试...


In [16]:
import pandas as pd
df = pd.read_csv('test_result.csv')
df[df.id!=df.label]

Unnamed: 0,id,label,text
3,2,0,"['[CLS]', '屁', '民', '也', '是', '民', '[SEP]', '[..."
4,1,0,"['[CLS]', '加', '油', '吧', '[SEP]', '[PAD]', '[P..."
14,2,0,"['[CLS]', '太', '扎', '眼', '了', '[SEP]', '[PAD]'..."
16,1,0,"['[CLS]', '领', '先', '[SEP]', '[PAD]', '[PAD]',..."
23,2,0,"['[CLS]', '打', '屎', '棍', '[SEP]', '[PAD]', '[P..."
...,...,...,...
6523,2,0,"['[CLS]', '成', '渣', '了', '[SEP]', '[PAD]', '[P..."
6524,1,0,"['[CLS]', '应', '该', '叫', '先', '知', '[SEP]', '[..."
6525,2,0,"['[CLS]', '3', '输', '了', '。', '[SEP]', '[PAD]'..."
6526,2,0,"['[CLS]', '就', '是', '太', '贵', '[SEP]', '[PAD]'..."


In [17]:
df

Unnamed: 0,id,label,text
0,0,0,"['[CLS]', '没', '玩', '过', '啊', '[SEP]', '[PAD]'..."
1,0,0,"['[CLS]', '停', '车', '减', '速', '[SEP]', '[PAD]'..."
2,0,0,"['[CLS]', '看', '不', '出', '来', '？', '[SEP]', '[..."
3,2,0,"['[CLS]', '屁', '民', '也', '是', '民', '[SEP]', '[..."
4,1,0,"['[CLS]', '加', '油', '吧', '[SEP]', '[PAD]', '[P..."
...,...,...,...
6530,0,0,"['[CLS]', '专', '赢', '阿', '森', '纳', '[SEP]', '[..."
6531,0,0,"['[CLS]', '你', '先', '说', '吧', '[SEP]', '[PAD]'..."
6532,0,0,"['[CLS]', '老', '哥', '吻', '乳', '沟', '[SEP]', '[..."
6533,0,0,"['[CLS]', '汽', '车', '吧', '[SEP]', '[PAD]', '[P..."


In [18]:
df[df.id!=df.label][df.label ==0]

Unnamed: 0,id,label,text
3,2,0,"['[CLS]', '屁', '民', '也', '是', '民', '[SEP]', '[..."
4,1,0,"['[CLS]', '加', '油', '吧', '[SEP]', '[PAD]', '[P..."
14,2,0,"['[CLS]', '太', '扎', '眼', '了', '[SEP]', '[PAD]'..."
16,1,0,"['[CLS]', '领', '先', '[SEP]', '[PAD]', '[PAD]',..."
23,2,0,"['[CLS]', '打', '屎', '棍', '[SEP]', '[PAD]', '[P..."
...,...,...,...
6521,2,0,"['[CLS]', '行', '情', '不', '好', '[SEP]', '[PAD]'..."
6523,2,0,"['[CLS]', '成', '渣', '了', '[SEP]', '[PAD]', '[P..."
6524,1,0,"['[CLS]', '应', '该', '叫', '先', '知', '[SEP]', '[..."
6525,2,0,"['[CLS]', '3', '输', '了', '。', '[SEP]', '[PAD]'..."


In [19]:
len(df[df.id==df.label])/len(df)

0.8171384850803366