# Pinned dependencies. %pip (rather than !pip) installs into the environment
# of the running kernel instead of whichever pip is first on the shell PATH.
%pip install transformers==2.9.0
%pip install pandas==0.25.3
%pip install numpy==1.17.4
%pip install scikit-learn==0.22.1
%pip install torch==1.5.0
%pip install nltk==3.4.5
%pip install unidecode==1.1.1

In [1]:
import os, sys
sys.path.append('../')

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
from tqdm import tqdm

from transformers import BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

from modules.word_classification import BertForWordClassification
from utils.forward_fn import forward_word_classification
from utils.metrics import ner_metrics_fn
from utils.data_utils import NerGritDataset, NerDataLoader

In [2]:
###
# common functions
###
def set_seed(seed):
    """Seed the random number generators used in this notebook.

    Passes `seed` to `random.seed`, `np.random.seed`, `torch.manual_seed`
    and `torch.cuda.manual_seed`, in that order.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    
def count_param(module, trainable=False):
    """Return the total number of elements across `module.parameters()`.

    When `trainable` is truthy, only parameters whose `requires_grad` flag is
    set are counted; otherwise every parameter is counted.
    """
    params = module.parameters()
    if trainable:
        params = (p for p in params if p.requires_grad)
    return sum(p.numel() for p in params)
    
def get_lr(optimizer):
    """Return the 'lr' entry of the optimizer's first parameter group.

    Returns None when `optimizer.param_groups` is empty.
    """
    group_lrs = (group['lr'] for group in optimizer.param_groups)
    return next(group_lrs, None)

def metrics_to_string(metric_dict):
    """Render a metrics mapping as space-separated 'name:value' pairs.

    Each value is formatted with two decimal places; pairs appear in the
    mapping's iteration order. An empty mapping yields an empty string.
    """
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

In [3]:
# Fix the random seed before anything stochastic runs
SEED = 26092020
set_seed(SEED)

# Load Model

In [4]:
# Checkpoint identifier shared by the tokenizer, the config and the model weights
pretrained_name = 'indobenchmark/indobert-base-p1'

# Tokenizer and config; the classifier gets one output per NER label
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
config = BertConfig.from_pretrained(pretrained_name)
config.num_labels = NerGritDataset.NUM_LABELS

# Build the word-classification model from the pretrained checkpoint
model = BertForWordClassification.from_pretrained(pretrained_name, config=config)

Some weights of BertForWordClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Display the model's module hierarchy (the notebook shows the repr of the cell's last expression)
model

BertForWordClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

In [6]:
# Total parameter count of the model (trainable defaults to False, so every parameter is counted)
count_param(model)

124446727

# Prepare Dataset

In [7]:
# Locations of the preprocessed NER-Grit splits, relative to this notebook
train_dataset_path, valid_dataset_path, test_dataset_path = (
    '../dataset/ner-grit/train_preprocess.txt',
    '../dataset/ner-grit/valid_preprocess.txt',
    '../dataset/ner-grit/test_preprocess.txt',
)

In [8]:
# One dataset per split, all built with the same tokenizer and lowercase=True
train_dataset, valid_dataset, test_dataset = (
    NerGritDataset(split_path, tokenizer, lowercase=True)
    for split_path in (train_dataset_path, valid_dataset_path, test_dataset_path)
)

# Loader settings shared by all three splits; only the training loader shuffles
loader_kwargs = dict(max_seq_len=512, batch_size=16, num_workers=16)
train_loader = NerDataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
valid_loader = NerDataLoader(dataset=valid_dataset, shuffle=False, **loader_kwargs)
test_loader = NerDataLoader(dataset=test_dataset, shuffle=False, **loader_kwargs)

In [9]:
# Label lookups provided by the dataset class: label -> index and index -> label
w2i = NerGritDataset.LABEL2INDEX
i2w = NerGritDataset.INDEX2LABEL
print(w2i, i2w, sep='\n')

{'I-PERSON': 0, 'B-ORGANISATION': 1, 'I-ORGANISATION': 2, 'B-PLACE': 3, 'I-PLACE': 4, 'O': 5, 'B-PERSON': 6}
{0: 'I-PERSON', 1: 'B-ORGANISATION', 2: 'I-ORGANISATION', 3: 'B-PLACE', 4: 'I-PLACE', 5: 'O', 6: 'B-PERSON'}


# Fine-Tuning & Evaluation

In [10]:
# Move the model to the GPU first, then build the optimizer over its parameters
# (the order the torch.optim documentation recommends when .cuda() is used).
# NOTE(review): the recorded training output in this notebook shows LR:0.00001000,
# which does not match lr=5e-6 here; confirm which value is intended.
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=5e-6)

In [11]:
# Train
# Runs n_epochs passes over train_loader. After each pass it prints the training
# loss/metrics, then evaluates on valid_loader and prints the validation loss/metrics.
n_epochs = 8
for epoch in range(n_epochs):
    model.train()
    # Re-enable autograd explicitly in case another cell switched it off globally.
    torch.set_grad_enabled(True)

    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model (the last element of each batch is not passed to the forward function)
        loss, batch_hyp, batch_label = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # Accumulate predictions and labels for the epoch-level metrics
        list_hyp += batch_hyp
        list_label += batch_label

        # Progress bar shows the running mean loss over the batches seen so far
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = ner_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    model.eval()

    total_loss = 0
    list_hyp, list_label = [], []

    # no_grad() is scoped to this block, so autograd is not left disabled
    # once the cell finishes.
    with torch.no_grad():
        pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
        for i, batch_data in enumerate(pbar):
            loss, batch_hyp, batch_label = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

            # Calculate total loss
            total_loss += loss.item()

            # Running metrics over everything seen so far; they are recomputed
            # every batch because the progress bar displays them live.
            list_hyp += batch_hyp
            list_label += batch_label
            metrics = ner_metrics_fn(list_hyp, list_label)

            pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))

    metrics = ner_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

(Epoch 1) TRAIN LOSS:1.2932 LR:0.00001000: 100%|██████████| 105/105 [00:19<00:00,  5.41it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

(Epoch 1) TRAIN LOSS:1.2932 ACC:0.83 F1:0.19 REC:0.09 PRE:0.12 LR:0.00001000


VALID LOSS:1.2461 ACC:0.86 F1:0.40 REC:0.16 PRE:0.23: 100%|██████████| 14/14 [00:01<00:00, 11.31it/s]
  0%|          | 0/105 [00:00<?, ?it/s]

(Epoch 1) VALID LOSS:1.2461 ACC:0.86 F1:0.40 REC:0.16 PRE:0.23


(Epoch 2) TRAIN LOSS:1.0421 LR:0.00001000: 100%|██████████| 105/105 [00:19<00:00,  5.29it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

(Epoch 2) TRAIN LOSS:1.0421 ACC:0.90 F1:0.55 REC:0.49 PRE:0.52 LR:0.00001000


VALID LOSS:1.1256 ACC:0.92 F1:0.59 REC:0.68 PRE:0.63: 100%|██████████| 14/14 [00:01<00:00, 10.95it/s]
  0%|          | 0/105 [00:00<?, ?it/s]

(Epoch 2) VALID LOSS:1.1256 ACC:0.92 F1:0.59 REC:0.68 PRE:0.63


(Epoch 3) TRAIN LOSS:0.9391 LR:0.00001000: 100%|██████████| 105/105 [00:20<00:00,  5.08it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

(Epoch 3) TRAIN LOSS:0.9391 ACC:0.93 F1:0.67 REC:0.67 PRE:0.67 LR:0.00001000


VALID LOSS:1.0001 ACC:0.94 F1:0.67 REC:0.77 PRE:0.72: 100%|██████████| 14/14 [00:01<00:00, 10.98it/s]
  0%|          | 0/105 [00:00<?, ?it/s]

(Epoch 3) VALID LOSS:1.0001 ACC:0.94 F1:0.67 REC:0.77 PRE:0.72


(Epoch 4) TRAIN LOSS:0.8805 LR:0.00001000: 100%|██████████| 105/105 [00:20<00:00,  5.20it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

(Epoch 4) TRAIN LOSS:0.8805 ACC:0.95 F1:0.73 REC:0.73 PRE:0.73 LR:0.00001000


VALID LOSS:0.9148 ACC:0.94 F1:0.71 REC:0.77 PRE:0.74: 100%|██████████| 14/14 [00:01<00:00, 10.74it/s]
  0%|          | 0/105 [00:00<?, ?it/s]

(Epoch 4) VALID LOSS:0.9148 ACC:0.94 F1:0.71 REC:0.77 PRE:0.74


(Epoch 5) TRAIN LOSS:0.8131 LR:0.00001000: 100%|██████████| 105/105 [00:20<00:00,  5.06it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

(Epoch 5) TRAIN LOSS:0.8131 ACC:0.96 F1:0.76 REC:0.79 PRE:0.78 LR:0.00001000


VALID LOSS:0.8996 ACC:0.94 F1:0.70 REC:0.77 PRE:0.74: 100%|██████████| 14/14 [00:01<00:00, 11.23it/s]
  0%|          | 0/105 [00:00<?, ?it/s]

(Epoch 5) VALID LOSS:0.8996 ACC:0.94 F1:0.70 REC:0.77 PRE:0.74


(Epoch 6) TRAIN LOSS:0.7623 LR:0.00001000: 100%|██████████| 105/105 [00:20<00:00,  5.06it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

(Epoch 6) TRAIN LOSS:0.7623 ACC:0.97 F1:0.82 REC:0.84 PRE:0.83 LR:0.00001000


VALID LOSS:1.0217 ACC:0.95 F1:0.74 REC:0.76 PRE:0.75: 100%|██████████| 14/14 [00:01<00:00, 11.39it/s]
  0%|          | 0/105 [00:00<?, ?it/s]

(Epoch 6) VALID LOSS:1.0217 ACC:0.95 F1:0.74 REC:0.76 PRE:0.75


(Epoch 7) TRAIN LOSS:0.7155 LR:0.00001000: 100%|██████████| 105/105 [00:20<00:00,  5.09it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

(Epoch 7) TRAIN LOSS:0.7155 ACC:0.97 F1:0.84 REC:0.86 PRE:0.85 LR:0.00001000


VALID LOSS:0.7764 ACC:0.95 F1:0.74 REC:0.81 PRE:0.78: 100%|██████████| 14/14 [00:01<00:00, 11.13it/s]
  0%|          | 0/105 [00:00<?, ?it/s]

(Epoch 7) VALID LOSS:0.7764 ACC:0.95 F1:0.74 REC:0.81 PRE:0.78


(Epoch 8) TRAIN LOSS:0.6796 LR:0.00001000: 100%|██████████| 105/105 [00:20<00:00,  5.08it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

(Epoch 8) TRAIN LOSS:0.6796 ACC:0.98 F1:0.86 REC:0.88 PRE:0.87 LR:0.00001000


VALID LOSS:0.7262 ACC:0.95 F1:0.74 REC:0.81 PRE:0.77: 100%|██████████| 14/14 [00:01<00:00, 11.27it/s]


(Epoch 8) VALID LOSS:0.7262 ACC:0.95 F1:0.74 REC:0.81 PRE:0.77


In [12]:
# Evaluate on test
# Runs the model over test_loader, collects the predicted label sequences and
# writes them to prediction.csv with an 'index' column and a 'label' column.
model.eval()

list_hyp = []

# Inference only: autograd is disabled for this loop rather than globally.
with torch.no_grad():
    for batch_data in tqdm(test_loader, leave=True, total=len(test_loader)):
        # Only the predictions are kept; the other two return values are discarded
        _, batch_hyp, _ = forward_word_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        list_hyp += batch_hyp

# Save prediction (reset_index() turns the row number into the 'index' column)
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('prediction.csv', index=False)

print(df)

100%|██████████| 14/14 [00:01<00:00, 12.16it/s]

     index                                              label
0        0  [B-PERSON, I-PERSON, O, O, O, O, B-ORGANISATIO...
1        1  [O, O, O, O, O, O, O, B-PERSON, O, O, O, O, O,...
2        2  [O, O, O, O, O, O, O, O, B-ORGANISATION, I-ORG...
3        3  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
4        4  [O, O, O, O, O, O, B-PERSON, I-PERSON, O, O, O...
..     ...                                                ...
204    204  [O, O, O, O, O, O, B-PLACE, O, O, O, O, B-PLAC...
205    205      [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]
206    206  [O, O, O, O, B-PLACE, I-PLACE, O, O, O, B-PLAC...
207    207  [O, O, O, O, O, O, O, B-PERSON, O, O, O, B-PLA...
208    208  [O, O, O, O, O, O, O, O, B-PLACE, I-PLACE, O, ...

[209 rows x 2 columns]



