In [9]:
import os
from pathlib import Path
import torch
import re
import random
import transformers, datasets
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer
import tqdm
from torch.utils.data import Dataset, DataLoader
import itertools
import math
import torch.nn.functional as F
import numpy as np
from torch.optim import Adam

MAX_LEN = 64

### loading all data into memory
corpus_movie_conv = './datasets/movie_conversations.txt'
corpus_movie_lines = './datasets/movie_lines.txt'
with open(corpus_movie_conv, 'r', encoding='iso-8859-1') as c:
    conv = c.readlines()
with open(corpus_movie_lines, 'r', encoding='iso-8859-1') as l:
    lines = l.readlines()

### splitting text using special lines
lines_dic = {}
for line in lines:
    objects = line.split(" +++$+++ ")
    lines_dic[objects[0]] = objects[-1]

### generate question answer pairs
pairs = []
for con in conv:
    ids = eval(con.split(" +++$+++ ")[-1])
    for i in range(len(ids)):
        qa_pairs = []
        
        if i == len(ids) - 1:
            break

        first = lines_dic[ids[i]].strip()  
        second = lines_dic[ids[i+1]].strip() 

        qa_pairs.append(' '.join(first.split()[:MAX_LEN]))
        qa_pairs.append(' '.join(second.split()[:MAX_LEN]))
        pairs.append(qa_pairs)

In [6]:
print(len(pairs))
pairs[20]


221616


["I really, really, really wanna go, but I can't. Not unless my sister goes.",
 "I'm workin' on it. But she doesn't seem to be goin' for him."]

In [7]:
# WordPiece tokenizer

### save data as txt file
text_data = []
file_count = 0

for sample in tqdm.tqdm([x[0] for x in pairs]):
    text_data.append(sample)

    # once we hit the 10K mark, save to file
    if len(text_data) == 10000:
        with open(f'./data/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1

paths = [str(x) for x in Path('./data').glob('**/*.txt')]

  0%|          | 0/221616 [00:00<?, ?it/s]

100%|██████████| 221616/221616 [00:00<00:00, 1719813.64it/s]


In [8]:
print(len(paths))
paths

22


['data/text_21.txt',
 'data/text_8.txt',
 'data/text_4.txt',
 'data/text_16.txt',
 'data/text_11.txt',
 'data/text_20.txt',
 'data/text_7.txt',
 'data/text_12.txt',
 'data/text_2.txt',
 'data/text_1.txt',
 'data/text_13.txt',
 'data/text_3.txt',
 'data/text_17.txt',
 'data/text_6.txt',
 'data/text_19.txt',
 'data/text_18.txt',
 'data/text_0.txt',
 'data/text_10.txt',
 'data/text_9.txt',
 'data/text_15.txt',
 'data/text_14.txt',
 'data/text_5.txt']

In [9]:
### training own tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=True
)

tokenizer.train( 
    files=paths,
    vocab_size=30_000, 
    min_frequency=5,
    limit_alphabet=1000, 
    wordpieces_prefix='##',
    special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]']
    )

tokenizer.save_model('./bert-it-1', 'bert-it')
tokenizer = BertTokenizer.from_pretrained('./bert-it-1/bert-it-vocab.txt', local_files_only=True)








In [10]:
tokenizer.vocab_size

21160

In [11]:
ids = tokenizer.encode('Sometimes you cant learn everything from a screen.')
print(ids)
tokenizer.decode(ids)

[1, 1278, 146, 6116, 1685, 661, 355, 40, 5026, 17, 2]


2024-05-15 19:24:12.910268: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'[CLS] sometimes you cant learn everything from a screen. [SEP]'

In [40]:
def random_word(tokenizer, sentence):
        tokens = sentence.split()
        output_label = []
        output = []

        # 15% of the tokens would be replaced
        for i, token in enumerate(tokens):

            print(token)

            # remove cls and sep token
            token_id = tokenizer(token)['input_ids'][1:-1]
            print(token_id)
random_word(tokenizer, 'Sometimes you cant learn everything from a screen.')

Sometimes
[1278]
you
[146]
cant
[6117]
learn
[1685]
everything
[661]
from
[355]
a
[40]
screen.
[5026, 17]


In [46]:
tokenizer('Sometimes.')

{'input_ids': [1, 1278, 17, 2], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [45]:
tokenizer.encode('Sometimes.')

[1, 1278, 17, 2]

In [48]:
tokenizer.tokenize('Sometimes.')

['sometimes', '.']

In [49]:
type(tokenizer)

transformers.models.bert.tokenization_bert.BertTokenizer

In [7]:
from torch.utils.data import Dataset, DataLoader
class BERTDataset(Dataset):
    def __init__(self, data_pair, tokenizer: BertTokenizer, seq_len=64):

        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.corpus_lines = len(data_pair)
        self.lines = data_pair

    def __len__(self):
        return self.corpus_lines

    def __getitem__(self, id):

        # Step 1: get random sentence pair, either negative or positive (saved as is_next_label)
        t1, t2, is_next_label = self.get_sent(id)

        # Step 2: replace random words in sentence with mask / random words
        t1_random, t1_label = self.random_word(t1)
        t2_random, t2_label = self.random_word(t2)

        # Step 3: Adding CLS and SEP tokens to the start and end of sentences
         # Adding PAD token for labels
        t1 = [self.tokenizer.vocab['[CLS]']] + t1_random + [self.tokenizer.vocab['[SEP]']]
        t2 = t2_random + [self.tokenizer.vocab['[SEP]']]
        t1_label = [self.tokenizer.vocab['[PAD]']] + t1_label + [self.tokenizer.vocab['[PAD]']]
        t2_label = t2_label + [self.tokenizer.vocab['[PAD]']]

        # Step 4: combine sentence 1 and 2 as one input
        # adding PAD tokens to make the sentence same length as seq_len
        segment_label = ([1 for _ in range(len(t1))] + [2 for _ in range(len(t2))])[:self.seq_len]
        bert_input = (t1 + t2)[:self.seq_len]
        bert_label = (t1_label + t2_label)[:self.seq_len]
        padding = [self.tokenizer.vocab['[PAD]'] for _ in range(self.seq_len - len(bert_input))]
        bert_input.extend(padding), bert_label.extend(padding), segment_label.extend(padding)

        output = {"bert_input": bert_input,
                  "bert_label": bert_label,
                  "segment_label": segment_label,
                  "is_next": is_next_label}

        return {key: torch.tensor(value) for key, value in output.items()}
    
    def random_word(self, sentence):
        tokens = self.tokenizer.tokenize(sentence)
        output_label = []
        output = []

        # 15% of the tokens would be replaced
        for i, token in enumerate(tokens):
            prob = random.random()

            # remove cls and sep token
            token_id = self.tokenizer.vocab[token]

            if prob < 0.15:
                
                prob /= 0.15

                # 80% chance change token to mask token
                if prob < 0.8:
                    output.append(self.tokenizer.vocab['[MASK]'])

                # 10% chance change token to random token
                elif prob < 0.9:
                    output.append(random.randrange(len(self.tokenizer.vocab)))

                # 10% chance change token to current token
                else:
                    output.append(token_id)

                output_label.append(token_id)

            else:
                output.append(token_id)
                output_label.append(0)

        assert len(output) == len(output_label)
        return output, output_label
    
    def get_sent(self, index):
        '''return random sentence pair'''
        t1, t2 = self.get_corpus_line(index)

        # negative or positive pair, for next sentence prediction
        if random.random() > 0.5:
            return t1, t2, 1
        else:
            return t1, self.get_random_line(), 0

    def get_corpus_line(self, item):
        '''return sentence pair'''
        return self.lines[item][0], self.lines[item][1]

    def get_random_line(self):
        '''return random single sentence'''
        return self.lines[random.randrange(len(self.lines))][1]


In [2]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('./bert-it-1/bert-it-vocab.txt', local_files_only=True)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
train_data = BERTDataset(pairs, seq_len=MAX_LEN, tokenizer=tokenizer)

In [11]:
train_data.__getitem__(0)

Can we make this quick? Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad. Again.
can
230
we
184
make
432
this
208
quick
1712
?
34
ro
529
##x
116
##anne
13336
kor
8254
##rine
11001
and
179
andrew
5189
barrett
12824
are
234
having
1135
an
160
incredibly
6991
horr
4996
##endo
19495
##us
313
public
1993
break
1023
-
16
up
275
on
192
the
150
quad
8110
.
17
again
542
.
17
Well, I thought we'd start with pronunciation, if that's okay with you.
well
303
,
15
i
48
thought
515
we
184
'
11
d
43
start
672
with
231
pron
15149
##un
295
##cia
7732
##t
93
##ion
242
,
15
if
270
that
173
'
11
s
58
okay
459
with
231
you
146
.
17


{'bert_input': tensor([    1,   230,   184,   432,   208,  1712,    34,   529,   116, 13336,
          8254, 11001,   179,  5189, 12824,   234,  1135,   160,  6991,     3,
         19495,   313,  1993,  1023,    16,   275,   192,   150,  8110,    17,
           542,    17,     2,   303,    15,     3,   515,   184,    11,     3,
             3, 19963, 15149,  9347,  7732,    93,     3,    15,   270,   173,
            11,    58,   459,   231,   146,    17,     2,     0,     0,     0,
             0,     0,     0,     0]),
 'bert_label': tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0, 13336,
             0,     0,     0,     0,     0,     0,     0,     0,     0,  4996,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,    48,     0,     0,     0,    43,
           672,   231,     0,   295,     0,     0,   242,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,

In [None]:
train_loader = DataLoader(
   train_data, batch_size=32, shuffle=True, pin_memory=True)