In [1]:
from transformers import BertTokenizer, BertModel
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import classification_report
import torch
from tqdm import tqdm
import time

In [2]:
# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

load data

In [3]:
df_train = pd.read_csv("../data/translated_data_train.csv")
df_test = pd.read_csv("../data/translated_data_test.csv")

load tokenizer

In [4]:
# https://albertauyeung.github.io/2020/06/19/bert-tokenization.html/
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

concatenate and add special tokens

In [5]:
df_train["concatenated"] = "[CLS]" + df_train["premise"] + "[SEP]" + df_train["hypothesis"]
df_test["concatenated"] = "[CLS]" + df_test["premise"] + "[SEP]" + df_test["hypothesis"]

tokenize (wordPiece)

In [6]:
df_train["tokens"] = df_train["concatenated"].progress_apply(tokenizer.tokenize)
df_test["tokens"] = df_test["concatenated"].progress_apply(tokenizer.tokenize)

100%|██████████| 12120/12120 [00:10<00:00, 1177.11it/s]
100%|██████████| 5195/5195 [00:04<00:00, 1130.31it/s]


get maximum tokens list length

In [7]:
max_length = max(df_train["tokens"].apply(len).max(), df_test["tokens"].apply(len).max())
print("max length =", max_length)

max length = 308


add padding to make everything uniform length_wise

In [8]:
def add_padding(tokens):
    tokens = tokens + ["[PAD]"] * (max_length - len(tokens))
    return tokens

In [9]:
df_train["tokens_pad"] = df_train["tokens"].progress_apply(add_padding)
df_test["tokens_pad"] = df_test["tokens"].progress_apply(add_padding)

100%|██████████| 12120/12120 [00:00<00:00, 122829.82it/s]
100%|██████████| 5195/5195 [00:00<00:00, 92044.85it/s]


get tokens_index

In [10]:
# https://medium.com/@dhartidhami/understanding-bert-word-embeddings-7dc4d2ea54ca
# Map the token strings to their vocabulary indeces.
df_train["indexed_tokens"] = df_train["tokens_pad"].progress_apply(tokenizer.convert_tokens_to_ids)
df_test["indexed_tokens"] = df_test["tokens_pad"].progress_apply(tokenizer.convert_tokens_to_ids)

100%|██████████| 12120/12120 [00:02<00:00, 4059.51it/s]
100%|██████████| 5195/5195 [00:01<00:00, 4009.79it/s]


get segments_ids (0 for first sentence and `[SEP]`, 1 for the rest)

In [11]:
def get_segments_ids(tokens: list):
    segments_ids = [0] * (tokens.index("[SEP]") + 1) + [1] * (len(tokens) - (tokens.index("[SEP]") + 1))
    return segments_ids

In [12]:
df_train["segments_ids"] = df_train["tokens_pad"].progress_apply(get_segments_ids)
df_test["segments_ids"] = df_test["tokens_pad"].progress_apply(get_segments_ids)

100%|██████████| 12120/12120 [00:00<00:00, 108413.93it/s]
100%|██████████| 5195/5195 [00:00<00:00, 17992.12it/s]


convert to tensors

In [13]:
df_train["tokens_tensor"] = df_train["indexed_tokens"].progress_apply(lambda x: torch.tensor([x]))
df_test["tokens_tensor"] = df_test["indexed_tokens"].progress_apply(lambda x: torch.tensor([x]))

100%|██████████| 12120/12120 [00:00<00:00, 34361.04it/s]
100%|██████████| 5195/5195 [00:00<00:00, 30860.91it/s]


In [14]:
df_train["segments_tensor"] = df_train["segments_ids"].progress_apply(lambda x: torch.tensor([x]))
df_test["segments_tensor"] = df_test["segments_ids"].progress_apply(lambda x: torch.tensor([x]))

100%|██████████| 12120/12120 [00:00<00:00, 34534.53it/s]
100%|██████████| 5195/5195 [00:00<00:00, 30454.08it/s]


load model

In [15]:
model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

get embedding

In [17]:
with torch.no_grad():
    df_train["sentence_embedding"] = df_train[["tokens_tensor", "segments_tensor"]].progress_apply(
        lambda x: torch.mean(model(x["tokens_tensor"], x["segments_tensor"])[2][-2][0], dim=0), axis=1)

100%|██████████| 12120/12120 [1:06:11<00:00,  3.05it/s]


In [26]:
with torch.no_grad():
    df_test["sentence_embedding"] = df_test[["tokens_tensor", "segments_tensor"]].progress_apply(
        lambda x: torch.mean(model(x["tokens_tensor"], x["segments_tensor"])[2][-2][0], dim=0), axis=1)

100%|██████████| 5195/5195 [32:00<00:00,  2.70it/s]


In [42]:
df_train["sentence_embedding"] = df_train["sentence_embedding"].apply(lambda x : list(np.array(x)))
df_test["sentence_embedding"] = df_test["sentence_embedding"].apply(lambda x : list(np.array(x)))

In [44]:
df_train[["id", "premise", "hypothesis", "lang_abv", "language", "label", "sentence_embedding"]].to_csv(
    "../data/train_embedded_bert.csv", index=False)

In [45]:
df_test[["id", "premise", "hypothesis", "lang_abv", "language", "sentence_embedding"]].to_csv(
    "../data/test_embedded_bert.csv", index=False)