## Prepare dataset for BERT

The paths used in this notebook are defined in `path_module.py`.

### 1. Import libraries

In [1]:
from modules.bert_module import *
from modules.path_module import *
import os
import joblib
from modules.lipcot_module import LiPCoTdata
from transformers import BertTokenizer
from hugtokencraft import editor # type: ignore

### 2. Load tokenized data

In [2]:
def _load_split(split_file):
    """Load one tokenized split stored under `tokenized_data_savepath`.

    NOTE: joblib.load unpickles the file, so only load artifacts you trust.
    """
    return joblib.load(os.path.join(tokenized_data_savepath, split_file))


# Tokenized train / test / validation splits written by the tokenization step.
train_data = _load_split("train_data.joblib")
test_data = _load_split("test_data.joblib")
val_data = _load_split("val_data.joblib")

# The saved LiPCoT model is indexed like a dict; its 'tokenizer_path' entry
# tells us where the matching BERT tokenizer lives on disk.
lipcot_model = joblib.load(
    os.path.join(lipcot_model_savepath, "lipcot_model.joblib")
)
tokenizer = editor.load_tokenizer(
    BertTokenizer,
    lipcot_model['tokenizer_path'],
)


Tokenization model Validation: Passed

Tokenizer loaded with vocabulary size: 69


### 3. Prepare data for BERT
We extract only the tokenized text and the labels from each dataset split for BERT.

In [3]:
# Run every split through prepare_bert_data (train, then val, then test) and
# unpack each (text, label) pair into its own pair of names.
(train_text, train_label), (val_text, val_label), (test_text, test_label) = [
    prepare_bert_data(split_data)
    for split_data in (train_data, val_data, test_data)
]

Let's inspect tokenization padding and attention masks

In [4]:
#%% check tokenization padding and attention masks

# Encoding entries to inspect, in the order they are printed.
encoding_fields = ("input_ids", "token_type_ids", "attention_mask")

# Use the first two training sentences as a tiny sample batch.
sentence1 = train_text[0]
sentence2 = train_text[1]

# Case 1: padding=True, no explicit max_length.
# For each field print its name, the length of both rows, then both rows.
padded_sequences = tokenizer([sentence1, sentence2], padding=True)
for field_name in encoding_fields:
    first_row = padded_sequences[field_name][0]
    second_row = padded_sequences[field_name][1]
    print(field_name)
    print(len(first_row))
    print(len(second_row))
    print(first_row)
    print(second_row)

# Case 2: pad/truncate every sentence to tokenizer.model_max_length.
# NOTE(review): the original note compared this to the fixed block_size
# encoding of LineByLineTextDataset -- confirm that is still the intent.
fixed_length_options = dict(
    max_length=tokenizer.model_max_length,
    padding='max_length',
    truncation=True,
    add_special_tokens=True,
)
padded_sequences = tokenizer.batch_encode_plus(
    [sentence1, sentence2], **fixed_length_options
)
for field_name in encoding_fields:
    print(field_name)
    print(padded_sequences[field_name])


input_ids
61
61
[2, 56, 33, 35, 34, 13, 11, 24, 24, 12, 63, 49, 63, 60, 11, 11, 30, 22, 11, 55, 63, 11, 43, 24, 35, 16, 13, 13, 52, 34, 52, 56, 33, 34, 21, 35, 33, 11, 43, 24, 17, 40, 60, 11, 11, 22, 22, 30, 22, 11, 24, 60, 43, 55, 16, 9, 13, 35, 52, 34, 3]
[2, 14, 32, 61, 23, 23, 42, 54, 23, 38, 42, 42, 27, 27, 39, 39, 39, 27, 27, 27, 20, 20, 54, 54, 21, 21, 13, 13, 14, 61, 64, 14, 32, 61, 23, 54, 40, 40, 65, 54, 21, 40, 42, 42, 40, 39, 39, 27, 27, 27, 27, 54, 54, 42, 21, 38, 13, 21, 14, 35, 3]
token_type_ids
61
61
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask
61
61
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

### 4. Prepare dataset for BERT

In [5]:
# Token budget per sentence: the model maximum minus 2 positions.
# NOTE(review): the 2 reserved slots are presumably the leading/trailing
# special tokens added by the tokenizer -- confirm.
max_sentence_length = lipcot_model['model_max_length'] - 2
all_sentence_lengths = [max_sentence_length]


def _build_dataset(texts, labels):
    """Wrap one split in a BertDataset using the shared tokenizer.

    max_length adds the 2 reserved positions back, i.e. it equals
    lipcot_model['model_max_length'].
    """
    return BertDataset(texts, labels, tokenizer, max_length=max_sentence_length + 2)


train_dataset = _build_dataset(train_text, train_label)
val_dataset = _build_dataset(val_text, val_label)
test_dataset = _build_dataset(test_text, test_label)

Let's check the attention mask

In [6]:
# Fetch the training example at index 1 via normal indexing and look at its
# attention mask and the mask's shape.
sample_item = train_dataset[1]
sample_mask = sample_item['attention_mask']
print(sample_mask)
print(sample_mask.shape)

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
torch.Size([128])


### 5. Save the datasets

In [7]:
# Make sure the output directory exists first: joblib.dump does not create
# missing parent directories, so a fresh checkout would otherwise fail here.
os.makedirs(bert_datapath, exist_ok=True)

# Persist each BertDataset split next to the others under bert_datapath.
joblib.dump(train_dataset, os.path.join(bert_datapath, "train_data.joblib"))
joblib.dump(val_dataset, os.path.join(bert_datapath, "val_data.joblib"))
# The last call is left as the cell's final expression, so the list of written
# file names it returns is echoed as the cell output.
joblib.dump(test_dataset, os.path.join(bert_datapath, "test_data.joblib"))

['e:\\Research\\LiPCoT\\Code and Data\\LiPCoT\\data/forBert\\test_data.joblib']