-
Notifications
You must be signed in to change notification settings - Fork 0
/
bpe.py
29 lines (22 loc) · 838 Bytes
/
bpe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
"""Train a whitespace-pre-tokenized BPE tokenizer on a directory of .txt
files and save it to ``tokenizer-wiki.json``.

Side effects: reads every ``*.txt`` file under ``path`` (capped at 10),
prints progress info, and writes the trained tokenizer JSON to disk.
"""
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
import os
import time

# path = "corpuses"
path = "linecorpus"
files = [os.path.join(path, f) for f in os.listdir(path) if f.endswith(".txt")]
files = files[:10]  # cap the corpus for a quick training run
# files = [f"wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
if not files:
    # Fail fast instead of training on an empty corpus.
    raise SystemExit(f"No .txt files found in {path!r}")
print(len(files))
print(files[:10])

t1 = time.perf_counter()  # monotonic clock — correct for elapsed-time measurement
# unk_token must be set on the BPE model itself; listing "[UNK]" only in the
# trainer's special_tokens is not enough — without it, encoding text with
# unseen tokens raises an error instead of emitting [UNK].
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train(files, trainer)
tokenizer.save("tokenizer-wiki.json")
# tokenizer.save("path-tokenizer-l.json")
t2 = time.perf_counter()
print(f"Time(s) elapsed: {t2 - t1}")
# tokenizer = Tokenizer.from_file("path-tokenizer.json")