# BERT 中文分类

## 查看GPU是否能用

In [2]:
# Check which GPU is attached to this runtime.
!nvidia-smi

Thu Sep  3 06:43:59 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P8    11W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## 安装transformers库，并解压数据集

In [None]:
# Install the Hugging Face transformers package into the runtime.
!pip install transformers
# Extract the weibo_senti_100k archive stored under /content/drive.
!unzip /content/drive/Shared\ drives/hbyscgsg@gmail.com/weibo_senti_100k.zip

In [21]:
import torch 
import pandas as pd

## 通过pandas载入数据集并展示样例

In [5]:
# Load the Weibo sentiment CSV into a DataFrame.
pd_all = pd.read_csv("/content/weibo_senti_100k.csv")

# Count all rows, then the rows carrying label 1 and label 0 respectively.
n_total = len(pd_all)
n_positive = len(pd_all[pd_all.label == 1])
n_negative = len(pd_all[pd_all.label == 0])

print('评论数目（总体）：%d' % n_total)
print('评论数目（正向）：%d' % n_positive)
print('评论数目（负向）：%d' % n_negative)

# Show 20 randomly drawn rows as the cell output.
pd_all.sample(20)

评论数目（总体）：119988
评论数目（正向）：59993
评论数目（负向）：59995


Unnamed: 0,label,review
51490,1,我不确定能否活到那会[哼] //@c哈哈:我也来凑个热闹，@大大大老沈 和你永远幸福哦~~~...
20456,1,又长姿势了！意大利面的各种名称以及对照图~不懂的mark下~[爱你]
30151,1,台湾水果太好吃了，连我不爱吃的小番茄都这么好粗[鼓掌][鼓掌]走高速吃水果好苏胡[馋嘴][馋...
18492,1,//@上海中期许思达:// @DJ隋怡 : [哈哈]// @CRI刘彦 : 您咋这么经典？[...
59939,1,幸福幸福无比幸福[哈哈]//@郝郝的好妈妈:@雨过天晴之霁_泽雯 @左咖_晒着太阳儿 @北京鸭脖
84242,0,一不小心，把今天当成父亲节啦[晕]，既备之，则做之！为了今晚的Pre-Fathers Day...
66456,0,每项开销都留发票是个好习惯，起码你能粗算一下月平均个人花费是多少[汗]（问题是就我这数学耐力...
56156,1,JIN AIR，真航空，真爱你。 //@大韩航空:那一抹清新亮丽的风景和纯真的笑容，JIN ...
20494,1,哈哈哈小礼物[哈哈][哈哈][哈哈]
98809,0,我咋活的这么纠结啊[泪]


## 对数据预处理下

In [6]:
# Split the rows 80% / 20% into a training part and a test part.
# The sizes are derived from the loaded frame instead of a hard-coded row
# count, so the split stays valid if the CSV changes.
n_rows = len(pd_all)
train_size = int(0.8 * n_rows)
test_size = n_rows - train_size

# random_split returns Subset objects; the selected row positions are read
# later through their `.indices` attribute.
# A dedicated seeded generator makes the split repeatable from run to run
# (the notebook-wide seeds are only set later, in the training cell).
train_indices, test_indices = torch.utils.data.random_split(
    pd_all,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

In [7]:
# Pull the text column and the label column out of the frame.
sentences, labels = pd_all['review'], pd_all['label']

In [8]:
# Turn the two index subsets into plain Python lists of labels and sentences.
# `labels[pos]` / `sentences[pos]` are lookups by index label; pd_all was read
# without an index column, so those labels coincide with row positions.
train_positions = train_indices.indices
test_positions = test_indices.indices

train_labels = [labels[pos] for pos in train_positions]
train_sentences = [sentences[pos] for pos in train_positions]

test_labels = [labels[pos] for pos in test_positions]
test_sentences = [sentences[pos] for pos in test_positions]

In [9]:
# Number of examples in the training part of the split.
len(train_labels)

95990

## 载入BERT中文的tokenizer

In [10]:
from transformers import BertTokenizer
# Name of the pretrained checkpoint whose vocabulary the tokenizer uses.
bert_model = "bert-base-chinese"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = BertTokenizer.from_pretrained(bert_model)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




In [11]:
# Walk one training sentence through the tokenizer: raw text -> sub-word
# tokens -> vocabulary ids. The sentence is tokenized once and reused.
sample_sentence = train_sentences[1]
sample_tokens = tokenizer.tokenize(sample_sentence)
sample_token_ids = tokenizer.convert_tokens_to_ids(sample_tokens)

print(' Original: ', sample_sentence)
print('Tokenized: ', sample_tokens)
print('Token IDs: ', sample_token_ids)

 Original:  Life isnt about waiting for the storm to pass, Its about learning to dance in the rain//@娜小鱼儿:早安，听首好歌，干活去喽~[抱抱]
Tokenized:  ['[UNK]', 'is', '##nt', 'about', 'w', '##ai', '##ting', 'for', 'the', 'st', '##orm', 'to', 'pass', ',', '[UNK]', 'about', 'learning', 'to', 'dance', 'in', 'the', 'rain', '/', '/', '@', '娜', '小', '鱼', '儿', ':', '早', '安', '，', '听', '首', '好', '歌', '，', '干', '活', '去', '喽', '~', '[', '抱', '抱', ']']
Token IDs:  [100, 8310, 8511, 9053, 165, 8982, 9107, 8330, 8174, 8811, 10530, 8228, 9703, 117, 100, 9053, 12315, 8228, 12371, 8217, 8174, 11873, 120, 120, 137, 2025, 2207, 7824, 1036, 131, 3193, 2128, 8024, 1420, 7674, 1962, 3625, 8024, 2397, 3833, 1343, 1617, 172, 138, 2849, 2849, 140]


## 对每句话转换成的序列做预处理


MAX_LEN设置为128

训练样本中好像没有超过128个token的句子（如果有，会被截断到128）

长度不足128的序列会自动填充（padding）到128

In [12]:
# Every sequence is truncated / padded to this many token ids.
MAX_LEN = 128


def _encode_all(texts):
    """Encode each text to a list of token ids (special tokens added), truncated to MAX_LEN."""
    return [
        tokenizer.encode(text, add_special_tokens=True, max_length=MAX_LEN, truncation=True)
        for text in texts
    ]


input_ids = _encode_all(train_sentences)
test_input_ids = _encode_all(test_sentences)

from keras.preprocessing.sequence import pad_sequences
print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))


def _pad_all(id_lists):
    """Pad (and, if needed, cut) every id list at its end to exactly MAX_LEN entries, filling with 0."""
    return pad_sequences(id_lists, maxlen=MAX_LEN, dtype="long",
                         value=0, truncating="post", padding="post")


input_ids = _pad_all(input_ids)
test_input_ids = _pad_all(test_input_ids)
print("finish")


Padding token: "[PAD]", ID: 0
finish


## 设置每句话的mask给BERT训练

In [13]:
 # Create attention masks for the training and the test sequences.
def _build_masks(padded_ids):
    """Return one 0/1 mask per sequence: 1 where the token id is > 0, else 0.

    The sequences were padded with id 0, so a 0 in the mask marks padding and
    a 1 marks a real token.
    """
    return [[int(token_id > 0) for token_id in row] for row in padded_ids]


attention_masks = _build_masks(input_ids)
test_attention_masks = _build_masks(test_input_ids)


## 对数据集划分

In [14]:
from sklearn.model_selection import train_test_split
labels = train_labels
# Keep 90% for training and hold out 10% for validation.
# Ids, masks and labels go through one call, so all three are partitioned
# with the same shuffle (random_state=2020) and stay aligned row by row.
# NOTE: this cell rebinds `train_labels`; re-running it on its own feeds the
# already-split labels back in, so re-run the earlier cells first.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels,
    random_state=2020, test_size=0.1)

## 创建dataloader，使数据分批次进行训练
分批加载可以防止内存不足产生问题


In [15]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# --- 1. Convert ids, labels and masks of every split to tensors -------------
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

test_inputs = torch.tensor(test_input_ids)
test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_attention_masks)

# Number of examples per batch handed out by every DataLoader below.
# (For fine-tuning BERT the authors recommend a batch size of 16 or 32.)
batch_size = 16

# --- 2. Bundle each split as (input ids, attention mask, label) -------------
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
test_data = TensorDataset(test_inputs, test_masks, test_labels)

# --- 3. Training batches are drawn in random order; validation and test
#        batches are read in their stored order. ----------------------------
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)
test_sampler = SequentialSampler(test_data)

# --- 4. One DataLoader per split --------------------------------------------
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

## 创建用于分类中文的BERT模型（二分类）

In [16]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Use the GPU when one is available and fall back to the CPU otherwise, so
# this cell does not fail on a runtime without CUDA (the training cell uses
# the same selection rule).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load BertForSequenceClassification: the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    bert_model, # Same checkpoint name the tokenizer was loaded with ("bert-base-chinese").
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the selected device. As the last expression of the cell
# this also displays the module structure.
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=624.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411577189.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## 创建优化器，以及制定一些参数

In [17]:
# Optimizer hyper-parameters used for fine-tuning.
learning_rate = 2e-5   # args.learning_rate - default is 5e-5, our notebook had 2e-5
adam_epsilon = 1e-8    # args.adam_epsilon  - default is 1e-8.

optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)

from transformers import get_linear_schedule_with_warmup

# Number of passes over the training set (authors recommend between 2 and 4).
epochs = 2

# One scheduler step is taken per training batch, so the schedule spans
# (number of batches) * (number of epochs) steps.
total_steps = len(train_dataloader) * epochs

# Learning-rate schedule with no warm-up steps (default value in run_glue.py).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

## 创建评估函数和一个格式化时间的函数

In [18]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring column matches the label.

    preds:  2-D array of scores; the predicted class of a row is the argmax
            along axis 1.
    labels: array of class ids; it is flattened before the comparison.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    n_matches = np.sum(predicted == expected)
    return n_matches / len(expected)

import time
import datetime

def format_time(elapsed):
    """Format a duration given in seconds as text.

    The value is rounded to a whole number of seconds and rendered by
    ``str(datetime.timedelta(...))``, e.g. ``'0:01:05'``; durations of a day
    or more are prefixed with the day count (``'1 day, 1:00:00'``).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

## 开始训练！

In [19]:
import random

# Seed the Python, NumPy and PyTorch (CPU and CUDA) generators. The seeds are
# set in this cell, so they only affect randomness from this point on.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Batches are moved to this device before each forward pass.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model in training mode (the counterpart of model.eval() below).
    model.train()
    for step, batch in enumerate(train_dataloader):

        # Progress report every 40 batches (skipping batch 0).
        if step % 40 == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)

            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Each batch holds (input ids, attention mask, labels), in that order.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()
        outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels)

        # Labels were supplied, so the first element of the output is the loss.
        loss = outputs[0]
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Running sum of correctly classified examples (as accuracy * batch size)
    # and the number of examples seen so far.
    eval_accuracy = 0
    nb_eval_examples = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Add batch to the selected device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # No labels were passed, so the first element of the output is the
        # logits: the values prior to applying an activation function like
        # the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accuracy of this batch, weighted by the batch size. Averaging the
        # per-batch accuracies directly would over-weight a smaller final
        # batch.
        batch_examples = len(label_ids)
        eval_accuracy += flat_accuracy(logits, label_ids) * batch_examples
        nb_eval_examples += batch_examples

    # Report the per-example accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_examples))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch    40  of  5,400.    Elapsed: 0:00:15.
  Batch    80  of  5,400.    Elapsed: 0:00:30.
  Batch   120  of  5,400.    Elapsed: 0:00:46.
  Batch   160  of  5,400.    Elapsed: 0:01:02.
  Batch   200  of  5,400.    Elapsed: 0:01:19.
  Batch   240  of  5,400.    Elapsed: 0:01:35.
  Batch   280  of  5,400.    Elapsed: 0:01:51.
  Batch   320  of  5,400.    Elapsed: 0:02:07.
  Batch   360  of  5,400.    Elapsed: 0:02:23.
  Batch   400  of  5,400.    Elapsed: 0:02:39.
  Batch   440  of  5,400.    Elapsed: 0:02:55.
  Batch   480  of  5,400.    Elapsed: 0:03:11.
  Batch   520  of  5,400.    Elapsed: 0:03:27.
  Batch   560  of  5,400.    Elapsed: 0:03:43.
  Batch   600  of  5,400.    Elapsed: 0:04:00.
  Batch   640  of  5,400.    Elapsed: 0:04:16.
  Batch   680  of  5,400.    Elapsed: 0:04:32.
  Batch   720  of  5,400.    Elapsed: 0:04:48.
  Batch   760  of  5,400.    Elapsed: 0:05:04.
  Batch   800  of  5,400.    Elapsed: 0:05:21.
  Batch   840  of  5,400.    Elapsed: 0:05:37.


## 评估


In [20]:
# Measure the accuracy of the fine-tuned model on the held-out test set.
t0 = time.time()

# Evaluation mode: dropout layers behave differently than during training.
model.eval()

# Running sum of correctly classified examples (as accuracy * batch size)
# and the number of examples seen so far.
eval_accuracy = 0
nb_eval_examples = 0

# Evaluate data for one epoch
for batch in test_dataloader:

    # Add batch to the device the model runs on
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    # No labels were passed, so the first element of the output is the
    # logits: the values prior to applying an activation function like the
    # softmax.
    logits = outputs[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Accuracy of this batch, weighted by the batch size. Averaging the
    # per-batch accuracies directly would over-weight a smaller final batch.
    batch_examples = len(label_ids)
    eval_accuracy += flat_accuracy(logits, label_ids) * batch_examples
    nb_eval_examples += batch_examples

# Report the per-example accuracy over the whole test set.
print("  Accuracy: {0:.4f}".format(eval_accuracy/nb_eval_examples))
print("  Test took: {:}".format(format_time(time.time() - t0)))

  Accuracy: 0.9787
  Test took: 0:03:32
