In [1]:
from transformers import BertTokenizer, BertModel
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import classification_report
import torch
from tqdm import tqdm
import time

In [2]:
# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

load data

In [15]:
df_train = pd.read_csv("../data/translated_data_train.csv")
df_test = pd.read_csv("../data/translated_data_test.csv")

load tokenizer

In [16]:
# https://albertauyeung.github.io/2020/06/19/bert-tokenization.html/
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [18]:
df_train[df_train["id"] == "e92393740b"]

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
2049,e92393740b,ผู้แสวงบุญจะซื้อเค้กน้ำผึ้งให้งูตัวนี้และวางเค...,งูจะกินผู้แสวงบุญซึ่งพยายามเข้าถึงพระวิหารเท่า...,th,Thai,2


concatenate and add special tokens

In [21]:
df_train["concatenated"] = "[CLS]" + df_train["premise"] + "[SEP]" + df_train["hypothesis"]
df_test["concatenated"] = "[CLS]" + df_test["premise"] + "[SEP]" + df_test["hypothesis"]

tokenize (wordPiece)

In [22]:
df_train["tokens"] = df_train["concatenated"].progress_apply(tokenizer.tokenize)
df_test["tokens"] = df_test["concatenated"].progress_apply(tokenizer.tokenize)

100%|██████████| 12120/12120 [00:06<00:00, 2016.75it/s]
100%|██████████| 5195/5195 [00:02<00:00, 2038.83it/s]


In [25]:
df = pd.DataFrame(df_train[df_train["language"] == "English"])
df["len"] = df["concatenated"].apply(len)
df.sort_values(by="len")

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label,concatenated,tokens,len
4758,e010c79cbd,I am not.,I am.,en,English,2,[CLS]I am not.[SEP]I am.,"[[CLS], i, am, not, ., [SEP], i, am, .]",24
4638,b65dc1311f,Waterloo.,D-Day.,en,English,2,[CLS]Waterloo.[SEP]D-Day.,"[[CLS], waterloo, ., [SEP], d, -, day, .]",25
1919,4527529e3d,'Go now.',Stay.,en,English,2,[CLS]'Go now.'[SEP]Stay.,"[[CLS], ', go, now, ., ', [SEP], stay, .]",25
8557,05e75f22ac,'Of course.',Yes.,en,English,0,[CLS]'Of course.'[SEP]Yes.,"[[CLS], ', of, course, ., ', [SEP], yes, .]",26
7058,f7a6243cc7,'I see.',I saw it.,en,English,0,[CLS]'I see.'[SEP]I saw it.,"[[CLS], ', i, see, ., ', [SEP], i, saw, it, .]",27
...,...,...,...,...,...,...,...,...,...
12042,1934ec8b05,and i look back on that and i bought shoes i w...,I am envious of all my debt-free churchgoing f...,en,English,1,[CLS]and i look back on that and i bought shoe...,"[[CLS], and, i, look, back, on, that, and, i, ...",955
6931,2b8b1bd7e9,and i look back on that and i bought shoes i w...,My friends should look towards me as a model o...,en,English,1,[CLS]and i look back on that and i bought shoe...,"[[CLS], and, i, look, back, on, that, and, i, ...",963
5681,1a087ea88f,yes they would they just wouldn't be able to o...,I am glad our generation has no debt.,en,English,2,[CLS]yes they would they just wouldn't be able...,"[[CLS], yes, they, would, they, just, wouldn, ...",1014
4843,373902d224,yes they would they just wouldn't be able to o...,Life will be great for subsequent generations ...,en,English,1,[CLS]yes they would they just wouldn't be able...,"[[CLS], yes, they, would, they, just, wouldn, ...",1059


In [30]:
df["sum"] = df["tokens"].apply(lambda x : np.sum([("##" in tok) for tok in x]))
df[df["sum"] > 1].sort_values(by=["sum",'len'])

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label,concatenated,tokens,len,sum
1399,4177df956e,Bork shuddered.,Bork shivered.,en,English,0,[CLS]Bork shuddered.[SEP]Bork shivered.,"[[CLS], bo, ##rk, shuddered, ., [SEP], bo, ##r...",39,2
10876,231fa739a0,"Bauerstein.""",Doctor Bauerstein,en,English,1,"[CLS]Bauerstein.""[SEP]Doctor Bauerstein","[[CLS], bauer, ##stein, ., "", [SEP], doctor, b...",39,2
8418,7cbc3d1337,said San'doro.,San'doro spoke.,en,English,0,[CLS] said San'doro.[SEP]San'doro spoke.,"[[CLS], said, san, ', do, ##ro, ., [SEP], san,...",41,2
9392,ff304862ca,God i'm envious,"Lord, I'm envious.",en,English,0,"[CLS]God i'm envious[SEP]Lord, I'm envious.","[[CLS], god, i, ', m, en, ##vious, [SEP], lord...",43,2
7201,5aeb875f88,Then he sobered.,He had sobered up.,en,English,0,[CLS]Then he sobered.[SEP]He had sobered up.,"[[CLS], then, he, sober, ##ed, ., [SEP], he, h...",44,2
...,...,...,...,...,...,...,...,...,...,...
11062,6c8e685b32,8 A stoichiometry of 1.03 is typical when the ...,A stoichiometry of 1.03 is typical when the FG...,en,English,0,[CLS]8 A stoichiometry of 1.03 is typical when...,"[[CLS], 8, a, st, ##oic, ##hi, ##ome, ##try, o...",267,20
3700,cd1b130fa5,8 A stoichiometry of 1.03 is typical when the ...,A stoichiometry of 1.03 is typical when the FG...,en,English,2,[CLS]8 A stoichiometry of 1.03 is typical when...,"[[CLS], 8, a, st, ##oic, ##hi, ##ome, ##try, o...",271,20
8425,b05536587b,But they also don't seem to mind when the tran...,A Zen temple rock garden is a zen place.,en,English,0,[CLS]But they also don't seem to mind when the...,"[[CLS], but, they, also, don, ', t, seem, to, ...",560,20
4819,629bd6c484,But they also don't seem to mind when the tran...,A Zen temple rock garden is a a place for lots...,en,English,1,[CLS]But they also don't seem to mind when the...,"[[CLS], but, they, also, don, ', t, seem, to, ...",601,20


get maximum tokens list length

In [7]:
max_length = max(df_train["tokens"].apply(len).max(), df_test["tokens"].apply(len).max())
print("max length =", max_length)

max length = 308


add padding to make everything uniform length_wise

In [8]:
def add_padding(tokens):
    tokens = tokens + ["[PAD]"] * (max_length - len(tokens))
    return tokens

In [9]:
df_train["tokens_pad"] = df_train["tokens"].progress_apply(add_padding)
df_test["tokens_pad"] = df_test["tokens"].progress_apply(add_padding)

100%|██████████| 12120/12120 [00:00<00:00, 122829.82it/s]
100%|██████████| 5195/5195 [00:00<00:00, 92044.85it/s]


get tokens_index

In [10]:
# https://medium.com/@dhartidhami/understanding-bert-word-embeddings-7dc4d2ea54ca
# Map the token strings to their vocabulary indeces.
df_train["indexed_tokens"] = df_train["tokens_pad"].progress_apply(tokenizer.convert_tokens_to_ids)
df_test["indexed_tokens"] = df_test["tokens_pad"].progress_apply(tokenizer.convert_tokens_to_ids)

100%|██████████| 12120/12120 [00:02<00:00, 4059.51it/s]
100%|██████████| 5195/5195 [00:01<00:00, 4009.79it/s]


get segments_ids (0 for first sentence and `[SEP]`, 1 for the rest)

In [11]:
def get_segments_ids(tokens: list):
    segments_ids = [0] * (tokens.index("[SEP]") + 1) + [1] * (len(tokens) - (tokens.index("[SEP]") + 1))
    return segments_ids

In [12]:
df_train["segments_ids"] = df_train["tokens_pad"].progress_apply(get_segments_ids)
df_test["segments_ids"] = df_test["tokens_pad"].progress_apply(get_segments_ids)

100%|██████████| 12120/12120 [00:00<00:00, 108413.93it/s]
100%|██████████| 5195/5195 [00:00<00:00, 17992.12it/s]


convert to tensors

In [13]:
df_train["tokens_tensor"] = df_train["indexed_tokens"].progress_apply(lambda x: torch.tensor([x]))
df_test["tokens_tensor"] = df_test["indexed_tokens"].progress_apply(lambda x: torch.tensor([x]))

100%|██████████| 12120/12120 [00:00<00:00, 34361.04it/s]
100%|██████████| 5195/5195 [00:00<00:00, 30860.91it/s]


In [14]:
df_train["segments_tensor"] = df_train["segments_ids"].progress_apply(lambda x: torch.tensor([x]))
df_test["segments_tensor"] = df_test["segments_ids"].progress_apply(lambda x: torch.tensor([x]))

100%|██████████| 12120/12120 [00:00<00:00, 34534.53it/s]
100%|██████████| 5195/5195 [00:00<00:00, 30454.08it/s]


load model

In [15]:
model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

get embedding

In [17]:
with torch.no_grad():
    df_train["sentence_embedding"] = df_train[["tokens_tensor", "segments_tensor"]].progress_apply(
        lambda x: torch.mean(model(x["tokens_tensor"], x["segments_tensor"])[2][-2][0], dim=0), axis=1)

100%|██████████| 12120/12120 [1:06:11<00:00,  3.05it/s]


In [26]:
with torch.no_grad():
    df_test["sentence_embedding"] = df_test[["tokens_tensor", "segments_tensor"]].progress_apply(
        lambda x: torch.mean(model(x["tokens_tensor"], x["segments_tensor"])[2][-2][0], dim=0), axis=1)

100%|██████████| 5195/5195 [32:00<00:00,  2.70it/s]


In [42]:
df_train["sentence_embedding"] = df_train["sentence_embedding"].apply(lambda x : list(np.array(x)))
df_test["sentence_embedding"] = df_test["sentence_embedding"].apply(lambda x : list(np.array(x)))

In [44]:
df_train[["id", "premise", "hypothesis", "lang_abv", "language", "label", "sentence_embedding"]].to_csv(
    "../data/train_embedded_bert.csv", index=False)

In [45]:
df_test[["id", "premise", "hypothesis", "lang_abv", "language", "sentence_embedding"]].to_csv(
    "../data/test_embedded_bert.csv", index=False)