# BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding For Text Classification


* Python3.7
* jupyter notebook
* torch           
* sklearn    
* numpy    
* pytorch_pretrained_bert

In [None]:
!pip install transformers

!pip install pytorch_pretrained_bert

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |▏                               | 10kB 20.9MB/s eta 0:00:01[K     |▍                               | 20kB 25.5MB/s eta 0:00:01[K     |▋                               | 30kB 28.5MB/s eta 0:00:01[K     |▉                               | 40kB 31.9MB/s eta 0:00:01[K     |█                               | 51kB 33.4MB/s eta 0:00:01[K     |█▎                              | 61kB 25.2MB/s eta 0:00:01[K     |█▌                              | 71kB 26.7MB/s eta 0:00:01[K     |█▊                              | 81kB 21.9MB/s eta 0:00:01[K     |██                              | 92kB 22.5MB/s eta 0:00:01[K     |██▏                             | 102kB 20.5MB/s eta 0:00:01[K     |██▍                             | 112kB 20.5MB/s eta 0:00:01[K     |██▋                             | 

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from importlib import import_module
import time
import torch
import numpy as np
import torch.nn as nn
from pytorch_pretrained_bert import BertModel, BertTokenizer
from transformers import AutoTokenizer, AutoModelForMaskedLM
dir_name = "/content/drive/My Drive/Colab Notebooks/Bert_Lab/" # data folder

In [4]:
bert_name = "bert-base-chinese" ### pretrained bert name

### 1) config including parameters

In [5]:
class Config(object):

    """配置参数"""
    def __init__(self, dir_name,bert_name):
        self.model_name = 'bert'
        self.train_path = dir_name + '/data/dev.txt' # training dataset
        self.dev_path = dir_name + '/data/dev.txt'  # validation dataset
        self.test_path = dir_name + '/data/test.txt' # testing dataset
        self.class_list = [x.strip() for x in open(
            dir_name + '/data/class.txt').readlines()] # list of labels
        self.save_path = dir_name + '/saved_dict/' + self.model_name + '.ckpt' # save_path
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')# cuda device or cpu

        self.require_improvement = 1000  # if there's no any improvement after 1000 batch, stop the training
        self.num_classes = len(self.class_list) # number of labels
        self.num_epochs = 3  # epoch
        self.batch_size = 128 # batch_size
        self.pad_size = 32  # maximum length of input text
        self.learning_rate = 5e-5 
        self.bert_name = bert_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.bert_name)
        self.hidden_size = 768

### 2) Model Loading

In [6]:
config = Config(dir_name,bert_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=624.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=268943.0, style=ProgressStyle(descripti…




In [7]:
bert_model = BertModel.from_pretrained(config.bert_name).to(config.device)

100%|██████████| 382072689/382072689 [00:07<00:00, 51088317.93B/s]


In [10]:
config.device

device(type='cuda')

In [11]:
class Model(nn.Module):

    def __init__(self, config):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(config.bert_name)
        for param in self.bert.parameters():
            param.requires_grad = True
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        context = x[0] # input setence 
        mask = x[2]  # padding mask，for example: [1, 1, 1, 1, 0, 0]
        _, pooled = self.bert(context, attention_mask=mask, output_all_encoded_layers=False)
        out = self.fc(pooled)

        return out

In [12]:
config = Config(dir_name,bert_name)
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True # to grarrantee each result is same

start_time = time.time()
print("Loading data...")

Loading data...


### 3) Load Dataset

In [13]:
from tqdm import tqdm
import time
from datetime import timedelta

PAD, CLS = '[PAD]', '[CLS]' 

In [14]:
def build_dataset(config):

    def load_dataset(path, pad_size=32):
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                content, label = lin.split('\t')
                token = config.tokenizer.tokenize(content)
                token = [CLS] + token
                seq_len = len(token)
                mask = []
                token_ids = config.tokenizer.convert_tokens_to_ids(token)

                if pad_size:
                    if len(token) < pad_size:
                        mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
                        token_ids += ([0] * (pad_size - len(token)))
                    else:
                        mask = [1] * pad_size
                        token_ids = token_ids[:pad_size]
                        seq_len = pad_size
                contents.append((token_ids, int(label), seq_len, mask))
        return contents
    
    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return train, dev, test

def build_iterator(dataset, config):
    iter = DatasetIterater(dataset, config.batch_size, config.device)
    return iter


def get_time_dif(start_time):
    """to obtain the time"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

In [15]:
class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False  # the flag to detemine the if len(batches) % self.n_batches != 0:
        if len(batches) % self.n_batches != 0:
            self.residue = True
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)

        
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        return (x, seq_len, mask), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches

In [16]:
train_data, dev_data, test_data = build_dataset(config)

10000it [00:01, 6890.09it/s]
10000it [00:01, 7995.38it/s]
10000it [00:01, 6744.40it/s]


In [17]:
# train_data[0] ###((token_ids, int(label), seq_len, mask))

In [18]:
train_iter = build_iterator(train_data, config)

In [19]:
#next(iter(train_iter))  ## (x, seq_len, mask), y

In [20]:
trains, labels = next(iter(train_iter))

In [21]:
context  = trains[0]

In [22]:
mask = trains[2]

In [23]:
hidden_states,pooled_output = bert_model(context, attention_mask=mask, output_all_encoded_layers=False)

In [24]:
hidden_states.shape # batch_size, sequence_length,hidden_units

torch.Size([128, 32, 768])

In [25]:
pooled_output.shape #batch_size,,hidden_units

torch.Size([128, 768])

In [26]:
dev_iter = build_iterator(dev_data, config)
test_iter = build_iterator(test_data, config)
time_dif = get_time_dif(start_time)
print("Time usage:", time_dif)

Time usage: 0:00:08


### 4) Train and Evaluate

In [27]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time
from pytorch_pretrained_bert.optimization import BertAdam

In [28]:
name, w = next(iter(bert_model.named_parameters()))

In [29]:
name

'embeddings.word_embeddings.weight'

In [30]:
w.size()

torch.Size([21128, 768])

In [35]:
# initialize weights with default xavier method
# why? better for small-size text fine tuninig
def init_network(model, method='xavier', exclude='embedding', seed=123):
    for name, w in model.named_parameters():
        if exclude not in name:
            if len(w.size()) < 2:
                continue
            if 'weight' in name:
                if method == 'xavier':
                    nn.init.xavier_normal_(w)
                elif method == 'kaiming':
                    nn.init.kaiming_normal_(w)
                else:
                    nn.init.normal_(w)
            elif 'bias' in name:
                nn.init.constant_(w, 0)
            else:
                pass


def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())

    ### list of parameter not for being fine tuned
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # the number of batch alread pass
    dev_best_loss = float('inf')
    last_improve = 0  # last imporvement of loss
    flag = False  # if it hasn't been imporvement for long time ?
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    _ = test(config, model, test_iter)


def test(config, model, test_iter):
    # test
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2},  Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    return test_report,test_confusion


def evaluate(config, model, data_iter, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)

    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)

In [33]:
model = Model(config).to(config.device)
train(config, model, train_iter, dev_iter, test_iter)

Epoch [1/3]
Iter:      0,  Train Loss:   2.5,  Train Acc:  5.47%,  Val Loss:   2.5,  Val Acc:  6.15%,  Time: 0:00:27 *
Epoch [2/3]
Iter:    100,  Train Loss:  0.36,  Train Acc: 89.84%,  Val Loss:  0.23,  Val Acc: 93.34%,  Time: 0:02:19 *
Epoch [3/3]
Iter:    200,  Train Loss:  0.15,  Train Acc: 95.31%,  Val Loss: 0.099,  Val Acc: 97.25%,  Time: 0:04:11 *
Test Loss:  0.34,  Test Acc: 90.13%
Precision, Recall and F1-Score...
               precision    recall  f1-score   support

      finance     0.8861    0.8790    0.8825      1000
       realty     0.9027    0.9370    0.9195      1000
       stocks     0.8691    0.7900    0.8277      1000
    education     0.9582    0.9390    0.9485      1000
      science     0.8282    0.8680    0.8477      1000
      society     0.9067    0.9130    0.9098      1000
     politics     0.8847    0.9130    0.8986      1000
       sports     0.9605    0.9480    0.9542      1000
         game     0.9311    0.8790    0.9043      1000
entertainment     0.89

<div id="disqus_thread"></div>
<script>
    /**
     *  RECOMMENDED CONFIGURATION VARIABLES: EDIT AND UNCOMMENT THE SECTION BELOW TO INSERT DYNAMIC VALUES FROM YOUR PLATFORM OR CMS.
     *  LEARN WHY DEFINING THESE VARIABLES IS IMPORTANT: https://disqus.com/admin/universalcode/#configuration-variables
     */
    /*
    var disqus_config = function () {
        this.page.url = PAGE_URL;  // Replace PAGE_URL with your page's canonical URL variable
        this.page.identifier = PAGE_IDENTIFIER; // Replace PAGE_IDENTIFIER with your page's unique identifier variable
    };
    */
    (function() {  // REQUIRED CONFIGURATION VARIABLE: EDIT THE SHORTNAME BELOW
        var d = document, s = d.createElement('script');
        
        s.src = 'https://EXAMPLE.disqus.com/embed.js';  // IMPORTANT: Replace EXAMPLE with your forum shortname!
        
        s.setAttribute('data-timestamp', +new Date());
        (d.head || d.body).appendChild(s);
    })();
</script>
<noscript>Please enable JavaScript to view the <a href="https://disqus.com/?ref_noscript" rel="nofollow">comments powered by Disqus.</a></noscript>