In [1]:
import numpy as np
import pandas as pd
import torch
import csv
import transformers
import torch.nn as nn
import torch.utils.data as Data
import torch.nn.functional as F
import json
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel, BertConfig, get_linear_schedule_with_warmup


In [2]:
# Sanity check: report whether PyTorch can see a CUDA-capable device.
import torch
print(torch.cuda.is_available())

True


In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU so
# the notebook still executes (slowly) on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#MODEL_NAME = "hfl/chinese-bert-wwm"
MODEL_NAME = 'bert-base-chinese'  # checkpoint name passed to every from_pretrained() call
MAX_LEN = 32                      # every sentence is padded/truncated to this many tokens
EPOCHS = 5                        # default number of training epochs
BATCH_SIZE = 32                   # mini-batch size of the data loaders
LR = 5e-5                         # learning rate handed to AdamW
WARMUP_STEPS = 100                # warm-up steps of the linear LR schedule


加载tokenizer

In [4]:
# Load the tokenizer that belongs to the checkpoint named by MODEL_NAME.
# load_dataset() reads this module-level object when it encodes sentences.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

创建load_dataset function

In [5]:
def load_dataset(filepath, max_len):
    """Read a labelled CSV file and tokenize its sentences.

    The first row of the file is treated as a header and skipped. In every
    remaining non-empty row, column 0 is parsed as the integer label and
    column 1 is used as the sentence text. Each sentence is encoded with the
    module-level ``tokenizer`` and padded/truncated to ``max_len`` token ids.

    Args:
        filepath: Path to a UTF-8 encoded CSV file.
        max_len: Length every encoded sentence is padded or truncated to.

    Returns:
        A tuple ``(input_ids, attention_masks, labels)`` of tensors created
        with ``torch.tensor`` from the token ids, the attention masks and the
        integer labels, in file order.

    Raises:
        ValueError: If a label cell cannot be parsed as an integer.
        IndexError: If a non-empty row has fewer than two columns.
    """
    labels = []
    sentences = []

    # `with` guarantees the file handle is closed even if parsing fails.
    # newline='' is what the csv module asks for so that quoted fields
    # containing line breaks are parsed correctly.
    with open(filepath, 'r', encoding='utf-8', newline='') as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header row (no-op for an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; they carry no sample.
                continue
            labels.append(int(row[0]))
            sentences.append(row[1])

    input_ids = []
    attention_masks = []

    for sentence in sentences:
        encoded = tokenizer.encode_plus(
            text=sentence,               # sentence to encode
            add_special_tokens=True,     # add `[CLS]` and `[SEP]`
            max_length=max_len,          # length to truncate/pad to
            padding='max_length',        # pad every sentence up to max_length
            return_attention_mask=True,  # also return the attention mask
            truncation=True,             # cut sentences longer than max_length
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    # Convert the Python lists to tensors.
    return (
        torch.tensor(input_ids),
        torch.tensor(attention_masks),
        torch.tensor(labels),
    )


load dataset

In [6]:
import os

# The data directory is addressed relative to the parent of the current
# working directory, so resolve that parent once as an absolute path.
working_dir = os.getcwd()
parent_dir = os.path.dirname(working_dir)
path = os.path.abspath(parent_dir)

In [7]:
# Tokenize the three splits. Each result is the
# (input_ids, attention_masks, labels) tuple returned by load_dataset().
train_dataset, valid_dataset, test_dataset = (
    load_dataset(csv_path, max_len=MAX_LEN)
    for csv_path in (
        f'{path}/data/train.csv',
        f'{path}/data/dev.csv',
        f'{path}/data/test.csv',
    )
)

In [8]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Training split: batches are drawn in random order.
train_data = TensorDataset(*train_dataset)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, sampler=train_sampler)

# Validation split: batches are visited in file order.
val_data = TensorDataset(*valid_dataset)
val_sampler = SequentialSampler(val_data)
val_loader = DataLoader(dataset=val_data, batch_size=BATCH_SIZE, sampler=val_sampler)

# Test split: kept as a plain dataset; no loader is created in this cell.
test_data = TensorDataset(*test_dataset)

In [9]:
# Fetch the configuration of the checkpoint named by MODEL_NAME and display
# its hidden size.
config = BertConfig.from_pretrained(MODEL_NAME)
config.hidden_size

768

In [10]:
%%time
import time
import torch
import torch.nn as nn
from transformers import BertModel

# Create the BertClassifier class
class BertClassifier(nn.Module):
    """BERT encoder followed by a GRU and a linear classification layer.

    The BERT weights are loaded from the checkpoint named by the module-level
    ``MODEL_NAME``. The encoder's last hidden states are fed to a GRU, and the
    final hidden state(s) of the GRU's top layer are passed through dropout
    and a linear layer to produce one logit per class.

    Args:
        hidden_dim: Hidden size of the GRU.
        output_dim: Number of output classes (size of the logits).
        n_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU runs in both directions.
        dropout: Dropout probability used before the linear layer and, when
            ``n_layers > 1``, between GRU layers.
    """
    def __init__(self,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):

        super().__init__()

        self.bert = BertModel.from_pretrained(MODEL_NAME)

        # Width of BERT's token representations = GRU input size.
        embedding_dim = self.bert.config.hidden_size
        self.dropout = nn.Dropout(dropout)

        # nn.GRU applies dropout only between stacked layers; with a single
        # layer a non-zero value has no effect and triggers a UserWarning,
        # so pass 0 in that case.
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          batch_first=True,
                          dropout=dropout if n_layers > 1 else 0.0)

        # A bidirectional GRU contributes one final state per direction.
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

    def forward(self, input_ids, attention_mask=None):
        """Compute class logits for a batch of token ids.

        Args:
            input_ids: Token ids passed to BERT as ``input_ids``.
            attention_mask: Optional mask passed to BERT as ``attention_mask``.

        Returns:
            The output of the final linear layer (one row of ``output_dim``
            logits per input sequence).
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # First element of the BERT output: the last hidden state of every token
        encoded_layers = outputs[0]

        # NOTE(review): the GRU consumes every position of the padded sequence;
        # attention_mask is only used inside BERT. Packing the sequence would
        # change the numerics of already-trained checkpoints, so it is left as is.
        _, hidden = self.rnn(encoded_layers)

        if self.rnn.bidirectional:
            # Last two entries = final forward and backward states of the top layer.
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])

        logits = self.out(hidden)
        return logits

CPU times: total: 46.9 ms
Wall time: 50 ms


In [11]:
def initialize_model(epochs=EPOCHS):
    """Build the classifier together with its optimizer and LR scheduler.

    Args:
        epochs: Number of training epochs; together with the length of the
            module-level ``train_loader`` it fixes the scheduler's total
            number of steps.

    Returns:
        A tuple ``(bert_classifier, optimizer, scheduler)``.
    """
    architecture = dict(
        hidden_dim=256,
        output_dim=3,
        n_layers=2,
        bidirectional=True,
        dropout=0.25,
    )
    classifier = BertClassifier(**architecture)

    # Move the model to the configured device.
    classifier.to(device)

    # AdamW over all model parameters.
    adamw = AdamW(classifier.parameters(), lr=LR)

    # Linear warm-up followed by linear decay over the whole training run.
    n_training_steps = epochs * len(train_loader)
    lr_schedule = get_linear_schedule_with_warmup(
        adamw,
        num_warmup_steps=WARMUP_STEPS,
        num_training_steps=n_training_steps,
    )
    return classifier, adamw, lr_schedule

In [12]:
# Shared by train() and evaluate(); default reduction averages over the batch.
loss_fn = nn.CrossEntropyLoss()
def train(model, train_dataloader, val_dataloader=None, epochs=EPOCHS , evaluation=False):
    """Train the BertClassifier model.

    NOTE: this function reads ``optimizer``, ``scheduler``, ``device`` and
    ``loss_fn`` from module scope; they are not parameters. ``optimizer`` and
    ``scheduler`` must therefore have been created for ``model`` before calling.

    Args:
        model: Model called as ``model(input_ids, attention_mask)`` that
            returns logits.
        train_dataloader: Iterable of ``(input_ids, attention_mask, labels)``
            batches; must support ``len()``.
        val_dataloader: Batches passed to ``evaluate`` after every epoch when
            ``evaluation`` is true.
        epochs: Number of passes over ``train_dataloader``.
        evaluation: Whether to run validation after each epoch.

    Raises:
        ValueError: If ``evaluation`` is requested without ``val_dataloader``.
    """
    # Fail before training starts rather than after the first full epoch.
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    num_batches = len(train_dataloader)

    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for s, batch in enumerate(train_dataloader):
            batch_counts += 1
            # Load batch to the configured device
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            logits = model(b_input_ids, b_attn_mask)
            # Flatten to (N, num_classes); the class count is taken from the
            # logits instead of being hard-coded.
            loss = loss_fn(logits.view(-1, logits.size(-1)), b_labels.view(-1))
            loss_value = loss.item()
            batch_loss += loss_value
            total_loss += loss_value

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding
            # gradients". This must happen BEFORE optimizer.step(), otherwise
            # the update uses the unclipped gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (s % 20 == 0 and s != 0) or (s == num_batches - 1):
                # Calculate time elapsed since the previous report
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {s:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        # (NaN instead of ZeroDivisionError for an empty loader).
        avg_train_loss = total_loss / num_batches if num_batches else float('nan')

        print("-"*70)
        if evaluation:
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            time_elapsed = time.time() - t0_epoch
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!")

def evaluate(model, val_dataloader):
    """Compute the mean loss and the accuracy of ``model`` on a dataset.

    Both metrics are weighted by the number of examples, so a smaller final
    batch does not distort them (averaging per-batch means would). Reads
    ``device`` and ``loss_fn`` from module scope; the loss weighting assumes
    ``loss_fn`` returns the per-batch mean.

    Args:
        model: Model called as ``model(input_ids, attention_mask)`` that
            returns logits; it is switched to eval mode.
        val_dataloader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        A tuple ``(val_loss, val_accuracy)``: the mean loss per example and
        the accuracy in percent. Both are NaN when there are no examples.
    """
    model.eval()

    # Running sums over the whole validation set
    loss_sum = 0.0
    num_correct = 0
    num_examples = 0

    # No gradients are needed anywhere during evaluation
    with torch.no_grad():
        for batch in val_dataloader:
            # Load batch to the configured device
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            logits = model(b_input_ids, b_attn_mask)
            batch_size = b_labels.size(0)

            # Undo the per-batch mean so every example counts once
            loss_sum += loss_fn(logits, b_labels).item() * batch_size

            # Predicted class = index of the largest logit
            preds = torch.argmax(logits, dim=1).flatten()
            num_correct += (preds == b_labels).sum().item()
            num_examples += batch_size

    if num_examples == 0:
        return float('nan'), float('nan')

    val_loss = loss_sum / num_examples
    val_accuracy = 100.0 * num_correct / num_examples

    return val_loss, val_accuracy

In [13]:
# Build the model, optimizer and scheduler, then fine-tune with validation
# after every epoch. The names `optimizer` and `scheduler` must stay at module
# level: train() reads them as globals instead of taking them as arguments.
bert_classifier, optimizer, scheduler = initialize_model(epochs=EPOCHS)
train(bert_classifier, train_loader, val_loader, epochs=EPOCHS, evaluation=True)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.894581   |     -      |     -     |   6.74   
   1    |   40    |   0.684366   |     -      |     -     |   4.86   
   1    |   60    |   0.646497   |     -      |     -     |   4.80   
   1    |   80    |   0.545208   |     -      |     -     |   4.80   
   1    |   100   |   0.535401   |     -      |     -     |   4.85   
   1    |   120   |   0.482316   |     -      |     -     |   4.87   
   1    |   140   |   0.448800   |     -      |     -     |   4.87   
   1    |   160   |   0.435579   |     -      |     -     |   4.87   
   1    |   180   |   0.479886   |     -      |     -     |   4.88   
   1    |   200   |   0.391496   |     -      |     -     |   4.88   
   1    |   220   |   0.429770   |     -      |     -     |   4.89   
   1    |   240   |   0.437242   |     -      |     -     |   4.89   


In [14]:
# Persist only the learned weights (the state_dict), not the model object.
torch.save(bert_classifier.state_dict(), 'bert_cla.ckpt')

In [None]:
# TODO: compare a linear classification head against the GRU head

In [15]:
# Wrap the test tensors (input_ids, attention_masks, labels) in a dataset.
test_data = TensorDataset(*test_dataset)

In [16]:
print('开始测试...')
bert_classifier.eval()
test_result = []

# Run inference in mini-batches, in dataset order, instead of one example per
# forward pass. All sequences already share the same padded length.
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)
with torch.no_grad():
    for batch in test_loader:
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        logits = bert_classifier(b_input_ids, b_attn_mask)
        preds = logits.argmax(dim=1)

        # One row per example: true label, predicted label, token strings.
        for true_label, pred_label, token_ids in zip(
            b_labels.tolist(), preds.tolist(), b_input_ids.tolist()
        ):
            test_result.append(
                [true_label, pred_label, tokenizer.convert_ids_to_tokens(token_ids)]
            )

# Write the results to a CSV file.
# NOTE: column 'id' holds the TRUE label and column 'label' holds the
# PREDICTED label; the names are kept because later cells compare
# df.id with df.label.
df = pd.DataFrame(test_result, columns=['id', 'label', 'text'])
df.to_csv('test_result.csv', index=False)

开始测试...


In [17]:
import pandas as pd

# Reload the saved predictions and show the misclassified rows.
# In this file 'id' is the true label and 'label' is the predicted label.
df = pd.read_csv('test_result.csv')
is_wrong = df['id'] != df['label']
df[is_wrong]

Unnamed: 0,id,label,text
6,0,1,"['[CLS]', '乱', '世', '佳', '人', '[SEP]', '[PAD]'..."
11,1,0,"['[CLS]', '下', '场', '加', '油', '吧', '[SEP]', '[..."
22,0,2,"['[CLS]', '刘', '军', '不', '服', '[SEP]', '[PAD]'..."
23,2,0,"['[CLS]', '打', '屎', '棍', '[SEP]', '[PAD]', '[P..."
30,0,1,"['[CLS]', '我', '要', '尊', '严', '[SEP]', '[PAD]'..."
...,...,...,...
6522,0,2,"['[CLS]', '78', '##9', '玩', '不', '起', '[SEP]',..."
6524,1,0,"['[CLS]', '应', '该', '叫', '先', '知', '[SEP]', '[..."
6525,2,0,"['[CLS]', '3', '输', '了', '。', '[SEP]', '[PAD]'..."
6528,0,2,"['[CLS]', '又', '来', '水', '了', '[SEP]', '[PAD]'..."


In [18]:
# Test accuracy: share of rows whose true label ('id') equals the prediction ('label').
correct_rows = df[df['id'] == df['label']]
len(correct_rows) / len(df)

0.8497322111706197