In [20]:
# coding: UTF-8
import time
import torch
import numpy as np
from train_eval import train, init_network
from importlib import import_module
from utils import build_dataset, build_iterator, get_time_dif
from transformers import BertForSequenceClassification, AdamW, BertConfig



if __name__ == '__main__':
    dataset = 'dataset'  # 数据集
    model_name = 'bert'  # bert
    x = import_module(model_name)
    config = x.Config(dataset)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # 保证每次结果一样

    start_time = time.time()
    print("Loading data...")
    train_data, dev_data, test_data = build_dataset(config)
    train_iter = build_iterator(train_data, config)
    dev_iter = build_iterator(dev_data, config)
    test_iter = build_iterator(test_data, config)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


320it [00:00, 3189.21it/s]

Loading data...


96456it [00:24, 3982.14it/s]
12051it [00:03, 3596.95it/s]
12060it [00:02, 4030.46it/s]

Time usage: 0:00:31





In [13]:
config.num_epochs

3

In [7]:
model = BertForSequenceClassification.from_pretrained(
    config.bert_path, # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = config.num_classes, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

Some weights of the model checkpoint at ./bert_pretrain were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at .

In [15]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )


from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_iter) * config.num_epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [37]:
start_time = time.time()
model.train()

total_batch = 0  # 记录进行到多少batch
dev_best_loss = float('inf')
last_improve = 0  # 记录上次验证集loss下降的batch数
flag = False  # 记录是否很久没有效果提升

for epoch in range(config.num_epochs):
    model.train()
    print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
    for i, (trains, labels) in enumerate(train_iter):
        
        b_input_ids = trains[0].to(config.device)
        b_input_mask = trains[2].to(config.device)
        b_labels = labels.to(config.device)
        
        model.zero_grad()   
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                        labels=b_labels)
        loss = outputs.loss
#         total_loss += loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # Update the learning rate.
        scheduler.step()
        
        if total_batch % 100 == 0:
            # 每多少轮输出在训练集和验证集上的效果
            predic = torch.max(outputs.logits, 1)[1].cpu()
            train_acc = metrics.accuracy_score(b_labels, predic)
            dev_acc, dev_loss = evaluate(config, model, dev_iter)
            if dev_loss < dev_best_loss:
                dev_best_loss = dev_loss
                torch.save(model.state_dict(), config.save_path)
                improve = '*'
                last_improve = total_batch
            else:
                improve = ''
            time_dif = get_time_dif(start_time)
            msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Time: {5} {6}'
            print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
            model.train()
        total_batch += 1
        if total_batch - last_improve > config.require_improvement:
            # 验证集loss超过1000batch没下降，结束训练
            print("No optimization for a long time, auto-stopping...")
            flag = True
            break
    if flag:
        break
        
        
def evaluate(config, model, data_iter, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            
            b_input_ids = texts[0].to(config.device)
            b_input_mask = texts[2].to(config.device)
            b_labels = labels.to(config.device)
            
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask, 
                            labels=b_labels)
            loss = outputs.loss
            loss_total += loss.item()
            
            predic = torch.max(outputs.logits, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, b_labels)
            predict_all = np.append(predict_all, predic)

    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)

Epoch [1/3]


IndexError: Target 22 is out of bounds.

In [28]:
>>> from transformers import BertTokenizer, BertForSequenceClassification
>>> import torch

>>> tokenizer = BertTokenizer.from_pretrained('/home/test2/DeepLearning/tensorflow_2.x/BERT/dataset/bert-base-cased')
>>> model = BertForSequenceClassification.from_pretrained('/home/test2/DeepLearning/tensorflow_2.x/BERT/dataset/bert-base-cased')

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
>>> outputs = model(**inputs)
>>> loss = outputs.loss
>>> logits = outputs.logits

Some weights of the model checkpoint at /home/test2/DeepLearning/tensorflow_2.x/BERT/dataset/bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassificatio