In [8]:
import pandas as pd
from pathlib import Path

In [9]:

# Recursively gather every .txt file below ./data_eng/ and keep each one as a
# plain string path rather than a Path object.
paths = list(map(str, Path("./data_eng/").glob("**/*.txt")))
print(paths)

['data_eng\\eng_news_2016_1M-co_n.txt', 'data_eng\\eng_news_2016_1M-co_s.txt', 'data_eng\\eng_news_2016_1M-inv_so.txt', 'data_eng\\eng_news_2016_1M-inv_w.txt', 'data_eng\\eng_news_2016_1M-sentences.txt', 'data_eng\\eng_news_2016_1M-sources.txt', 'data_eng\\eng_news_2016_1M-words.txt']


In [10]:
# Parse each listed file as a headerless, tab-separated table and collect the
# resulting frames in a list.
# NOTE(review): the first entry of `paths` is left out here to match existing
# behaviour — confirm that it really should be skipped.
dataframe = [pd.read_csv(path, sep="\t", header=None) for path in paths[1:]]


In [14]:
# Stack the per-file frames row-wise and keep only the column labelled 1
# (the files are read with header=None, so columns carry integer labels).
overall_df = pd.concat(dataframe)[1]
print(overall_df)

# Write one value per line, without header or index. The file is opened in
# write mode so that re-running this cell replaces the corpus instead of
# appending another full copy of every row to it.
overall_df.to_csv('new_data_eng.txt', header=False, index=False, mode='w')

0                            5177
1                            6710
2                            9894
3                           22004
4                             343
5                           35391
6                            1333
7                            1649
8                            6032
9                           31782
10                          11949
11                          19031
12                          18144
13                           1842
14                          11157
15                           1606
16                         128631
17                             12
18                            107
19                            982
20                            489
21                            124
22                           2853
23                            428
24                           2989
25                           2250
26                           1738
27                          33338
28                           1665
29            

  """


In [15]:
# Preview the first rows of the selected column as the cell's displayed result.
overall_df.head()

0     5177
1     6710
2     9894
3    22004
4      343
Name: 1, dtype: object

In [12]:
# Placeholders: both names start out bound to an empty string.
data2 = ""
data = ""

# Load the entire corpus file into `data` as one UTF-8 decoded string.
data = Path('new_data_eng.txt').read_text(encoding='utf-8')

### Training BPE Tokeniser
Here, the goal is to build two files: `vocab.json` and `merges.txt`. `vocab.json` maps the top K tokens found in the text corpus built in the previous step to their respective token ids. Since we are building a byte pair encoding (BPE) tokeniser, `merges.txt` holds the learned merge rules that allow us to perform subword tokenisation on our input text. We use the tokenizer from the Hugging Face `tokenizers` library, as shown below:

In [19]:
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

# Path of the corpus file that is handed to the tokenizer trainer below.
# Note: this rebinds `data`, which an earlier cell used for the file's contents.
data = 'new_data_eng.txt'

In [20]:
# Create a byte-level BPE tokenizer, passing dropout=0.1 to its constructor.
tokenizer1 = ByteLevelBPETokenizer(dropout=0.1)

# Train on the corpus file named by `data`: cap the vocabulary at 52,000
# entries, require a minimum frequency of 2, and register the five special
# tokens listed below.
tokenizer1.train(
    files=data,
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)


In [21]:
# Save files to disk: write the trained tokenizer's files under ./english
# (the recorded output of this cell lists vocab.json and merges.txt).
# NOTE(review): newer `tokenizers` releases appear to use `save_model(directory)`
# for this and make `save(path)` write a single JSON file — confirm against the
# installed version before upgrading.
tokenizer1.save("./english")

['./english\\vocab.json', './english\\merges.txt']

In [3]:
import json
import tokenizers

In [4]:

# Rebuild the tokenizer from the saved vocabulary and merges files, again
# passing dropout=0.1.
bpe_tokeniser_from_files = tokenizers.ByteLevelBPETokenizer(
    vocab_file='english/vocab.json', merges_file='english/merges.txt', dropout=0.1
)

# Echo the dropout value the tokenizer object reports for itself.
print('dropout =', bpe_tokeniser_from_files._parameters['dropout'])

# Encode the same sentence ten times and print the tokens of each attempt; the
# recorded output shows the segmentation differing from run to run.
sample_sentence = 'There are a lot of apples in the garden'
for _ in range(10):
    print(bpe_tokeniser_from_files.encode(sample_sentence).tokens)

dropout = 0.1
['There', 'Ġare', 'Ġa', 'Ġlot', 'Ġof', 'Ġapples', 'Ġin', 'Ġt', 'he', 'Ġgarden']
['There', 'Ġare', 'Ġa', 'Ġlot', 'Ġof', 'Ġapples', 'Ġin', 'Ġthe', 'Ġgard', 'en']
['There', 'Ġare', 'Ġa', 'Ġlot', 'Ġof', 'Ġap', 'ples', 'Ġin', 'Ġth', 'e', 'Ġgarden']
['There', 'Ġare', 'Ġa', 'Ġlo', 't', 'Ġof', 'Ġapples', 'Ġin', 'Ġthe', 'Ġgarden']
['Th', 'ere', 'Ġare', 'Ġa', 'Ġlot', 'Ġof', 'Ġapples', 'Ġin', 'Ġthe', 'Ġgarden']
['Th', 'ere', 'Ġar', 'e', 'Ġa', 'Ġ', 'lot', 'Ġof', 'Ġap', 'ples', 'Ġin', 'Ġth', 'e', 'Ġg', 'arden']
['There', 'Ġare', 'Ġ', 'a', 'Ġ', 'lot', 'Ġof', 'Ġapples', 'Ġin', 'Ġthe', 'Ġg', 'arden']
['There', 'Ġare', 'Ġa', 'Ġl', 'ot', 'Ġof', 'Ġapples', 'Ġin', 'Ġthe', 'Ġgarden']
['T', 'her', 'e', 'Ġare', 'Ġa', 'Ġlot', 'Ġo', 'f', 'Ġapples', 'Ġi', 'n', 'Ġthe', 'Ġgarden']
['There', 'Ġare', 'Ġa', 'Ġlot', 'Ġ', 'of', 'Ġap', 'ples', 'Ġin', 'Ġthe', 'Ġgarden']


In [5]:
# Encode one longer sentence and keep the resulting Encoding for inspection.
output = bpe_tokeniser_from_files.encode(
    "There are a lot of apples in the garden and we are glad."
)

# Show the token ids, the token strings and the character offsets on one line.
print(output.ids, output.tokens, output.offsets)

[1997, 531, 265, 434, 395, 361, 32370, 225, 263, 262, 264, 9411, 364, 626, 531, 404, 16017, 18] ['There', 'Ġare', 'Ġa', 'Ġl', 'ot', 'Ġof', 'Ġapples', 'Ġ', 'in', 'Ġt', 'he', 'Ġgarden', 'Ġand', 'Ġwe', 'Ġare', 'Ġg', 'lad', '.'] [(0, 5), (5, 9), (9, 11), (11, 13), (13, 15), (15, 18), (18, 25), (25, 26), (26, 28), (28, 30), (30, 32), (32, 39), (39, 43), (43, 46), (46, 50), (50, 52), (52, 55), (55, 56)]


In [7]:
# Display the Encoding object's repr as the cell's result.
output

Encoding(num_tokens=18, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing, original_str, normalized_str])