In [58]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [59]:
from transformers import BertModel, BertTokenizer
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import numpy as np

In [60]:
class bertconfig():
    def __init__(self):
        self.project = 'THUC'
        self.bert_path = './bert_pretrain'
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.class_list = [x.strip() for x in open('./THUC/data/class.txt').readlines()]
        self.pad_size = 32
        self.batch_size = 128
        self.device = 'cuda'
        self.hidden_size = 768
        self.num_classes = len(self.class_list)
        self.learning_rate = 1e-5
        self.num_epochs = 10
        self.save_path = './THUC/'+ self.project+'.ckpt'

In [61]:
config = bertconfig()
config.num_classes

10

In [62]:
def build_dataset(pad_size):
    def load_data(path,pad_size):
        input = []
        label = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                content = line.strip().split('\t')[0]
                label = line.strip().split('\t')[1]
                token = config.tokenizer.tokenize(content)
                token = ['CLS']+token
                token_ids = config.tokenizer.convert_tokens_to_ids(token)
                seq_len = len(token_ids)
                if len(token_ids)<pad_size:
                    token_ids = token_ids+[0]*(pad_size-len(token_ids))
                    mask = [1]*seq_len+[0]*(pad_size-seq_len)
                else:
                    token_ids = token_ids[:pad_size]
                    seq_len = pad_size
                    mask = [1]*seq_len
                input.append((token_ids,int(label),seq_len,mask))
        return input
    train_data = load_data('./THUC/data/train.txt',pad_size)
    test_data = load_data('./THUC/data/test.txt',pad_size)
    dev_data = load_data('./THUC/data/dev.txt',pad_size)
    return train_data,test_data,dev_data

In [63]:
train_data,test_data,dev_data = build_dataset(config.pad_size)

33590it [00:07, 4333.41it/s]
10000it [00:02, 4005.17it/s]
10000it [00:02, 4188.69it/s]


In [64]:
class DatasetIterator(object):
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False 
        if len(batches) % self.n_batches != 0:
            self.residue = True
        self.index = 0
        self.device = device
        
    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        return (x, seq_len, mask), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches


def build_iterator(dataset, config):
    iter = DatasetIterator(dataset, config.batch_size, config.device)
    return iter

In [65]:
train_iter = build_iterator(train_data, config)
test_iter = build_iterator(test_data, config)
dev_iter = build_iterator(dev_data, config)

In [66]:
class Model(nn.Module): 

    def __init__(self, config):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(config.bert_path)
        for param in self.bert.parameters():
            param.requires_grad = True
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        context = x[0]  # 输入的句子
        mask = x[2]  # mask
        _ = self.bert(context, attention_mask=mask)
        out = self.fc(_[1])
        return out

In [67]:
model = Model(config).to(config.device)

Some weights of the model checkpoint at ./bert_pretrain were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [68]:
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

In [None]:
best_dev_loss = np.inf
for epoch in range(config.num_epochs):
    idx = 0
    for i, (trains, labels) in tqdm(enumerate(train_iter)):
        model.train()
        outputs = model(trains)
        model.zero_grad()
        loss = F.cross_entropy(outputs, labels)
        loss.backward()
        optimizer.step()
        #每50个batch update一次
        if idx%50==0:
            true = labels.data.cpu()
            predic = torch.max(outputs.data, 1)[1].cpu()
            train_acc = metrics.accuracy_score(true, predic)
            model.eval()
            loss_total = 0
            predict_all = np.array([], dtype=int)
            labels_all = np.array([], dtype=int)
            with torch.no_grad():
                for texts, labels in dev_iter:
                    outputs = model(texts)
                    loss = F.cross_entropy(outputs, labels)
                    loss_total += loss
                    labels = labels.data.cpu().numpy()
                    predic = torch.max(outputs.data, 1)[1].cpu().numpy()
                    labels_all = np.append(labels_all, labels)
                    predict_all = np.append(predict_all, predic)
            acc = metrics.accuracy_score(labels_all, predict_all)
            print('Epoch:{0}'.format(epoch+1))
            print('Batch:{0}'.format(idx))
            print('Val Loss:{0}'.format(loss_total/len(train_iter)))
            print('Val Accuracy:{0}'.format(acc))
            if loss_total/len(train_iter) < best_dev_loss:
                print('Model Saved!')
                best_dev_loss = loss_total/len(train_iter)
                torch.save(model.state_dict(), config.save_path) 
        idx+=1

0it [00:00, ?it/s]

Epoch:1
Batch:0
Val Loss:0.7173303365707397
Val Accuracy:0.0989
Model Saved!


50it [01:49,  1.44s/it]

Epoch:1
Batch:50
Val Loss:0.37868258357048035
Val Accuracy:0.7672
Model Saved!


100it [03:40,  1.44s/it]

Epoch:1
Batch:100
Val Loss:0.16982828080654144
Val Accuracy:0.852
Model Saved!


150it [05:30,  1.45s/it]

Epoch:1
Batch:150
Val Loss:0.1311345398426056
Val Accuracy:0.8785
Model Saved!


200it [07:21,  1.45s/it]

Epoch:1
Batch:200
Val Loss:0.11852074414491653
Val Accuracy:0.8848
Model Saved!


250it [09:11,  1.45s/it]

Epoch:1
Batch:250
Val Loss:0.10721858590841293
Val Accuracy:0.8972
Model Saved!


263it [10:07,  2.31s/it]
0it [00:00, ?it/s]

Epoch:2
Batch:0
Val Loss:0.10714345425367355
Val Accuracy:0.8967
Model Saved!


50it [01:49,  1.44s/it]