# Relation Classification based on Bert

## For Colab Environment

In [1]:
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
if IN_COLAB:
    !git clone https://github.com/Molin-L/RLRC.git
    !pip install transformers
    !pip install wandb
    !wandb login f8b9d4c1a91d9a60c0dfe9934d5d550598750c33
    %cd RLRC

fatal: destination path 'RLRC' already exists and is not an empty directory.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[32mSuccessfully logged in to Weights & Biases![0m
/content/RLRC


In [2]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator
import torchtext
import matplotlib.pyplot as plt

import copy
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from utils import format_time
from transformers import BertTokenizer, BertForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import time
import wandb
import random
import RLRC_dataloader
from RLRC_Bert_model import RC_BERT


## Load Data

## Bert Model Setting

In [3]:
print('Loading pretrained tokenizer...')
pretrain_model = ["bert-base-uncased", "distilbert-base-uncased"]
pretrain_model = pretrain_model[1]  
additional_special_tokens = ['<e1>', '</e1>', '<e2>', '</e2>']
tokenizer = DistilBertTokenizer.from_pretrained(pretrain_model, do_lower_case=False)
print(len(tokenizer))
tokenizer.add_tokens(['<e1>', '</e1>', '<e2>', '</e2>'])
print(len(tokenizer))
e1_id = tokenizer.convert_tokens_to_ids('<e1>')
e2_id = tokenizer.convert_tokens_to_ids('<e2>')

print(e1_id, e2_id)
assert e1_id != e2_id != 1

Loading pretrained tokenizer...
30522
30526
30522 30524


In [4]:
if IN_COLAB:
    !mkdir data

mkdir: cannot create directory ‘data’: File exists


In [10]:
wandb.init(
        project="RLRC_BERT",
        config={
            'pretrain_model': "distilbert-base-uncased",
            'num_classes': 53,
            'lr': 0.001,
            'dropout': 0.5,
            'epochs': 3
        }
    )
wb_config = wandb.config

In [11]:
train_dataloader, validation_dataloader = RLRC_dataloader.get_dataloader()

Load Bert data from .pkl files...
	62,859 training samples
	6,985 validation samples


In [12]:
# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings.
training_stats = []

loss_fn = nn.CrossEntropyLoss()

In [13]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [14]:
config = wb_config
# print(config)
model = RC_BERT(config)
optimizer = AdamW(
    model.parameters(),
    lr=wb_config['lr'],
    eps=1e-8
)
total_steps = len(train_dataloader) * config['epochs']
print(total_steps)
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,  # Default value in run_glue.py
                                            num_training_steps=total_steps)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
model.to(device)
epochs = config['epochs']
wandb.watch(model)
for epoch_i in range(config['epochs']):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_loss, batch_loss, batch_counts = 0, 0, 0
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader)):
        # Progress update every 40 batches.
        batch_counts += 1
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # It returns different numbers of parameters depending on what arguments
        # arge given and what flags are set. For our useage here, it returns
        # the loss (because we provided labels) and the "logits"--the model
        # outputs prior to activation.
        logits = model(b_input_ids, b_input_mask)
        loss = loss_fn(logits, b_labels)
        batch_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = batch_loss / batch_counts

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            logits = model(b_input_ids,
                            attention_mask=b_input_mask,
                            )
        loss = loss_fn(logits, b_labels)
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

5895
There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB

Training...


HBox(children=(FloatProgress(value=0.0, max=1965.0), HTML(value='')))

  Batch    40  of  1,965.    Elapsed: 0:00:09.
  Batch    80  of  1,965.    Elapsed: 0:00:17.
  Batch   120  of  1,965.    Elapsed: 0:00:26.
  Batch   160  of  1,965.    Elapsed: 0:00:34.
  Batch   200  of  1,965.    Elapsed: 0:00:43.
  Batch   240  of  1,965.    Elapsed: 0:00:51.
  Batch   280  of  1,965.    Elapsed: 0:00:59.
  Batch   320  of  1,965.    Elapsed: 0:01:07.
  Batch   360  of  1,965.    Elapsed: 0:01:16.
  Batch   400  of  1,965.    Elapsed: 0:01:24.
  Batch   440  of  1,965.    Elapsed: 0:01:33.
  Batch   480  of  1,965.    Elapsed: 0:01:41.
  Batch   520  of  1,965.    Elapsed: 0:01:49.
  Batch   560  of  1,965.    Elapsed: 0:01:58.
  Batch   600  of  1,965.    Elapsed: 0:02:06.
  Batch   640  of  1,965.    Elapsed: 0:02:14.
  Batch   680  of  1,965.    Elapsed: 0:02:23.
  Batch   720  of  1,965.    Elapsed: 0:02:31.
  Batch   760  of  1,965.    Elapsed: 0:02:39.
  Batch   800  of  1,965.    Elapsed: 0:02:48.
  Batch   840  of  1,965.    Elapsed: 0:02:56.
  Batch   880

HBox(children=(FloatProgress(value=0.0, max=1965.0), HTML(value='')))

  Batch    40  of  1,965.    Elapsed: 0:00:09.
  Batch    80  of  1,965.    Elapsed: 0:00:17.
  Batch   120  of  1,965.    Elapsed: 0:00:25.
  Batch   160  of  1,965.    Elapsed: 0:00:34.
  Batch   200  of  1,965.    Elapsed: 0:00:42.
  Batch   240  of  1,965.    Elapsed: 0:00:50.
  Batch   280  of  1,965.    Elapsed: 0:00:59.
  Batch   320  of  1,965.    Elapsed: 0:01:07.
  Batch   360  of  1,965.    Elapsed: 0:01:15.
  Batch   400  of  1,965.    Elapsed: 0:01:24.
  Batch   440  of  1,965.    Elapsed: 0:01:32.
  Batch   480  of  1,965.    Elapsed: 0:01:40.
  Batch   520  of  1,965.    Elapsed: 0:01:49.
  Batch   560  of  1,965.    Elapsed: 0:01:57.
  Batch   600  of  1,965.    Elapsed: 0:02:05.
  Batch   640  of  1,965.    Elapsed: 0:02:14.
  Batch   680  of  1,965.    Elapsed: 0:02:22.
  Batch   720  of  1,965.    Elapsed: 0:02:30.
  Batch   760  of  1,965.    Elapsed: 0:02:39.
  Batch   800  of  1,965.    Elapsed: 0:02:47.
  Batch   840  of  1,965.    Elapsed: 0:02:55.
  Batch   880

HBox(children=(FloatProgress(value=0.0, max=1965.0), HTML(value='')))

  Batch    40  of  1,965.    Elapsed: 0:00:08.
  Batch    80  of  1,965.    Elapsed: 0:00:17.
  Batch   120  of  1,965.    Elapsed: 0:00:25.
  Batch   160  of  1,965.    Elapsed: 0:00:34.
  Batch   200  of  1,965.    Elapsed: 0:00:42.
  Batch   240  of  1,965.    Elapsed: 0:00:50.
  Batch   280  of  1,965.    Elapsed: 0:00:59.
  Batch   320  of  1,965.    Elapsed: 0:01:07.
  Batch   360  of  1,965.    Elapsed: 0:01:15.
  Batch   400  of  1,965.    Elapsed: 0:01:24.
  Batch   440  of  1,965.    Elapsed: 0:01:32.
  Batch   480  of  1,965.    Elapsed: 0:01:40.
  Batch   520  of  1,965.    Elapsed: 0:01:49.
  Batch   560  of  1,965.    Elapsed: 0:01:57.
  Batch   600  of  1,965.    Elapsed: 0:02:05.
  Batch   640  of  1,965.    Elapsed: 0:02:14.
  Batch   680  of  1,965.    Elapsed: 0:02:22.
  Batch   720  of  1,965.    Elapsed: 0:02:30.
  Batch   760  of  1,965.    Elapsed: 0:02:39.
  Batch   800  of  1,965.    Elapsed: 0:02:47.
  Batch   840  of  1,965.    Elapsed: 0:02:55.
  Batch   880

In [16]:
outpath = './out/bert_model.pth'

In [18]:
torch.save(model.state_dict(), outpath)

In [23]:
predict_result = []
model.eval()


RC_BERT(
  (Bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30526, 768)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=768, out_features

In [38]:
true_y = []
pred_y = None

for batch in tqdm(train_dataloader):
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    # Tell pytorch not to bother with constructing the compute graph during
    # the forward pass, since this is only needed for backprop (training).
    with torch.no_grad():
        logits = model(b_input_ids, b_input_mask)
    logits = logits.detach().cpu()
    label_ids = b_labels.to('cpu').numpy()
    batch_pred_np = logits.numpy()
    if len(true_y) == 0:
        true_y = label_ids
        pred_y = batch_pred_np
    else:
        true_y = np.concatenate((true_y, label_ids), axis=None)
        pred_y = np.concatenate((pred_y, batch_pred_np), axis=None)



HBox(children=(FloatProgress(value=0.0, max=1965.0), HTML(value='')))




In [39]:
np.save('./out/pred_y.npy', pred_y, allow_pickle=True)
np.save('./out/true_y.npy', true_y, allow_pickle=True)

In [42]:
!ls ./data -lh

total 405M
-rw-r--r-- 1 root root  69M Aug 14 13:04 bert_attention_masks.pkl
-rw-r--r-- 1 root root  69M Aug 14 13:04 bert_input_ids.pkl
-rw-r--r-- 1 root root 547K Aug 14 13:04 bert_labels.pkl
-rw-r--r-- 1 root root 254M Aug 14 15:41 model.pth
-rw-r--r-- 1 root root  13M Aug 14 16:34 pred_y.npy
-rw-r--r-- 1 root root 492K Aug 14 16:34 true_y.npy
