## Import Necessary Packages

In [1]:
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pickle
import os
from transformers import BertTokenizer

In [2]:
!pip install transformers
!pip install sentencepiece



In [3]:
import pandas as pd
import re
import torch
# import torch_xla
# import torch_xla.core.xla_model as xm
from torch.utils.data import Dataset, TensorDataset, DataLoader, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pad_sequence
# from keras.preprocessing.sequence import pad_sequences
import pickle
import os
import numpy as np

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from keras.callbacks import Callback
import tensorflow_hub as hub
import tensorflow as tf
import re

from keras import backend as K
import keras.layers as layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, concatenate, Embedding, Flatten, Activation, SpatialDropout1D
from keras.layers import Bidirectional, GRU, GlobalAveragePooling1D, GlobalMaxPooling1D
#from keras.optimizers import Adam
from keras.models import Model
from keras.utils import np_utils
#from keras.engine import Layer
from keras import initializers, regularizers, constraints
from keras.layers import *

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.models import load_model
from keras.layers import LSTM, CuDNNGRU, CuDNNLSTM, Add, Reshape
from keras.layers import MaxPooling1D, Conv1D, MaxPooling1D, Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from nltk.tokenize import sent_tokenize, word_tokenize

In [5]:
# use gpu

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## Import Dataset

In [6]:
train_df = pd.read_csv('../input/processed-snli/processed_dataset/train_df.csv')
val_df   = pd.read_csv('../input/processed-snli/processed_dataset/val_df.csv')
test_df = pd.read_csv('../input/processed-snli/processed_dataset/test_df.csv')

## BERT Embedding

In [9]:
class MNLIDataBert(Dataset):
    def __init__(self, train_df, val_df):
        self.label_dict = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
        self.train_df = train_df
        self.val_df = val_df
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        self.train_data = None
        self.val_data = None
        self.init_data()
        
    def init_data(self):
        self.train_data = self.load_data(self.train_df)
        self.val_data = self.load_data(self.val_df)
        
    def load_data(self, df):
        MAX_LEN = 512
        token_ids = []
        mask_ids = []
        seg_ids = []
        y = []

        premise_list = df['sentence1'].to_list()
        hypothesis_list = df['sentence2'].to_list()
        label_list = df['gold_label'].to_list()
        
        for (premise, hypothesis, label) in zip(premise_list, hypothesis_list, label_list):
            premise_id = self.tokenizer.encode(premise, add_special_tokens = False)
            hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens = False)
            pair_token_ids = [self.tokenizer.cls_token_id] + premise_id + [self.tokenizer.sep_token_id] + hypothesis_id + [self.tokenizer.sep_token_id]
            premise_len = len(premise_id)
            hypothesis_len = len(hypothesis_id)

            segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1))  # sentence 0 and sentence 1
            attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))  # mask padded values
            token_ids.append(torch.tensor(pair_token_ids))
            seg_ids.append(segment_ids)
            mask_ids.append(attention_mask_ids)
            y.append(self.label_dict[label])
            
        token_ids = pad_sequence(token_ids, batch_first=True)
        mask_ids = pad_sequence(mask_ids, batch_first=True)
        seg_ids = pad_sequence(seg_ids, batch_first=True)
        y = torch.tensor(y)
        dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
        print(len(dataset))
            
        return dataset
        
    def get_data_loaders(self, batch_size=32, shuffle=True):
            train_loader = DataLoader(
                self.train_data,
                shuffle=shuffle,
                batch_size=batch_size
            )
            
            val_loader = DataLoader(
                self.val_data,
                shuffle=shuffle,
                batch_size=batch_size
            )
            
            return train_loader, val_loader

In [10]:
mnli_dataset = MNLIDataBert(train_df, val_df)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

99884
9842


In [11]:
train_loader, val_loader = mnli_dataset.get_data_loaders(batch_size=16)

## Load BERT Model

In [12]:
from transformers import BertForSequenceClassification, AdamW

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.to(device)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Fine-tuning of BERT

In [13]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [14]:
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)

In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 109,484,547 trainable parameters


In [16]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    labels_flat = labels_flat.cpu().detach().numpy() 
    return np.sum(pred_flat == labels_flat), pred_flat

In [24]:
def multi_acc(y_pred, y_test):
    acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
    return acc

## Compiling the model

In [26]:
import time

EPOCHS = 5

train_loss_list = []
train_acc_list = []
val_loss_list = []
val_acc_list = []

def train(model, train_loader, val_loader, optimizer):
    total_step = len(train_loader)
    
    for epoch in range(EPOCHS):
        start = time.time()
        model.train()
        total_train_loss = 0
        total_train_acc  = 0
        
        for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(train_loader):
            optimizer.zero_grad()
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            seg_ids = seg_ids.to(device)
            labels = y.to(device)
            # prediction = model(pair_token_ids, mask_ids, seg_ids)
            loss, prediction = model(pair_token_ids, 
                             token_type_ids=seg_ids, 
                             attention_mask=mask_ids, 
                             labels=labels).values()
            
            # loss = criterion(prediction, labels)
            acc = multi_acc(prediction, labels)
            
            loss.backward()
            optimizer.step()
            
            total_train_loss += loss.item()
            total_train_acc  += acc.item()
            
        train_acc  = total_train_acc/len(train_loader)
        train_loss = total_train_loss/len(train_loader)
        train_loss_list.append(train_loss)
        train_acc_list.append(train_acc)
        
        model.eval()
        
        total_val_acc  = 0
        total_val_loss = 0
        
        with torch.no_grad():
            for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(val_loader):
                optimizer.zero_grad()
                pair_token_ids = pair_token_ids.to(device)
                mask_ids = mask_ids.to(device)
                seg_ids = seg_ids.to(device)
                labels = y.to(device)

        # prediction = model(pair_token_ids, mask_ids, seg_ids)
                loss, prediction = model(pair_token_ids, 
                                         token_type_ids=seg_ids, 
                                         attention_mask=mask_ids, 
                                         labels=labels).values()
                
                acc = multi_acc(prediction, labels)
                total_val_loss += loss.item()
                total_val_acc  += acc.item()
        
        val_acc  = total_val_acc/len(val_loader)
        val_loss = total_val_loss/len(val_loader)
        end = time.time()
        hours, rem = divmod(end-start, 3600)
        minutes, seconds = divmod(rem, 60)
        val_loss_list.append(val_loss)
        val_acc_list.append(val_acc)
        
        print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

In [27]:
train(model, train_loader, val_loader, optimizer)

Epoch 1: train_loss: 0.4830 train_acc: 0.8094 | val_loss: 0.3422 val_acc: 0.8701
00:19:18.20
Epoch 2: train_loss: 0.3037 train_acc: 0.8902 | val_loss: 0.3381 val_acc: 0.8732
00:19:16.89
Epoch 3: train_loss: 0.2090 train_acc: 0.9276 | val_loss: 0.4177 val_acc: 0.8694
00:19:18.03
Epoch 4: train_loss: 0.1502 train_acc: 0.9498 | val_loss: 0.4262 val_acc: 0.8727
00:19:18.45
Epoch 5: train_loss: 0.1148 train_acc: 0.9620 | val_loss: 0.4630 val_acc: 0.8768
00:19:19.66


## Plot Performance
The final performance is plot on Kaggle: https://www.kaggle.com/neverseepython/bert-snli?scriptVersionId=82688114

In [None]:
import matplotlib
from matplotlib import pyplot as plt

In [None]:
plt.figure(figsize=(10,8))
plt.title("The trend of Training loss")
plt.xlabel("Batch")
plt.ylabel("Training Loss")
plt.plot(train_loss_list)
plt.show()

In [None]:
plt.figure(figsize=(10,8))
plt.title("The trend of Validation loss")
plt.xlabel("Batch")
plt.ylabel("Validation Loss")
plt.plot(val_loss_list)
plt.show()

In [None]:
plt.figure(figsize=(10,8))
plt.title("The trend of Validation Accuracy")
plt.xlabel("Batch")
plt.ylabel("Validation Accuracy")
plt.plot(val_acc_list)
plt.show()

In [None]:
plt.figure(figsize=(10,8))
plt.title("The trend of Training Accuracy")
plt.xlabel("Batch")
plt.ylabel("Training Accuracy")
plt.plot(train_acc_list)
plt.show()