Import all libraries

In [1]:
import os;
import re;
import gc;
import sys;
import struct;
import argparse;
import numpy as np;
import pandas as pd;
from tqdm import tqdm;
from typing import List;
import sentencepiece as spm;
from sentencepiece import SentencePieceProcessor;

Hyperparameters

In [2]:
DATASET_FNAME = "arXiv.txt";
PRETOKENIZED_DATASET_FNAME = "arXiv.bin";
VOCAB_SIZE = 2048;

Load dataset as Pandas DataFrame

In [3]:
df = pd.read_json('/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json', lines = True);

Print dataframe shape

In [4]:
print(df.shape)

(2754926, 14)


Select computer science only entries

In [5]:
# Following https://arxiv.org/category_taxonomy
categories = ['cs.AI', 'cs.AR', 'cs.CC', 'cs.CE', 'cs.CG', 'cs.CL', 'cs.CR', 'cs.CV', 'cs.CY', 'cs.DB', 'cs.DC', 'cs.DL', 'cs.DM', 'cs.DS', 'cs.ET', 'cs.FL', 'cs.GL', 'cs.GR', 'cs.GT', 'cs.HC', 'cs.IR', 'cs.IT', 'cs.LG', 'cs.LO', 'cs.MA', 'cs.MM', 'cs.MS', 'cs.NA', 'cs.NE', 'cs.NI', 'cs.OH', 'cs.OS', 'cs.PF', 'cs.PL', 'cs.RO', 'cs.SC', 'cs.SD', 'cs.SE', 'cs.SI', 'cs.SY']

# Create mask
mask = df['categories'].str.contains('|'.join(categories), case=False, na=False)

# Save only CS articles in DataFrame
df = df[mask]

Print the new dataframe shape

In [6]:
print(df.shape)

(824576, 14)


Remove all unused columns

In [7]:
# We need 'title' and 'abstract' columns only
df = df[['title', 'abstract']]

Print the final dataframe shape

In [8]:
print(df.shape)

(824576, 2)


Save data as a simple text with format title \<sep\> abstract \n

In [9]:
with open(DATASET_FNAME, "w", encoding="utf-8") as of:
    for row in df.itertuples(index=False):
        of.write(re.sub(r'\s+', ' ', re.sub(r'[^\w \t\'\"\(\)\{\}\[\]\<\>\.\,\;\:\!\?\+\-\*\/\=\^\%\&\|\@\#\$\~\`\@\$\_]', ' ', row.title)).strip() +\
                 "<sep>" +\
                 re.sub(r'\s+', ' ',re.sub(r'[^\w \t\'\"\(\)\{\}\[\]\<\>\.\,\;\:\!\?\+\-\*\/\=\^\%\&\|\@\#\$\~\`\@\$\_]', ' ', row.abstract)).strip() +\
                 "\n")
print(f"Size is: {os.path.getsize(DATASET_FNAME) / 1024 / 1024:.2f} MB")

Size is: 971.25 MB


Delete dataframe and other temporary variables

In [10]:
del df
del categories
del mask
gc.collect()

33

Train BPE tokenizer

In [11]:
spm.SentencePieceTrainer.train(input=DATASET_FNAME,
                               model_prefix=f"tok{VOCAB_SIZE}",
                               model_type="bpe",
                               vocab_size=VOCAB_SIZE,
                               self_test_sample_size=0,
                               input_format="text",
                               character_coverage=1.0,
                               num_threads=os.cpu_count(),
                               split_digits=True,
                               allow_whitespace_only_pieces=True,
                               byte_fallback=True,
                               unk_surface=r" \342\201\207 ",
                               normalization_rule_name="identity",
                               user_defined_symbols='<sep>')

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: arXiv.txt
  input_format: text
  model_prefix: tok2048
  model_type: BPE
  vocab_size: 2048
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 4
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 1
  user_defined_symbols: <sep>
  required_chars: 
  byte_fallback: 1
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  \342\201\207 
  enable_

Tokenizer class for dataset pretokenization

In [12]:
class Tokenizer:

    def __init__(self, tokenizer_model):
        assert tokenizer_model
        model_path = tokenizer_model
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)
        self.model_path = model_path

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()
        #print(f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}")
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        return self.sp_model.decode(t)

cc(425) LOG(INFO) Adding meta_piece: <0x70>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0x71>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0x72>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0x73>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0x74>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0x75>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0x76>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0x77>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0x78>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0x79>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0x7A>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0x7B>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0x7C>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0x7D>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0x7E>
trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <0x7F>
trainer_interface.cc(425) 

Pretokenize all dataset

In [13]:
enc = Tokenizer(f"tok{VOCAB_SIZE}.model")
all_tokens = []
with open(DATASET_FNAME, "r", encoding="utf-8") as f:
    for line in tqdm(f):
        tokens = enc.encode(line, bos=True, eos=True)  # encode the text, use BOS
        all_tokens.extend(tokens)
# convert to uint16 nparray
all_tokens = np.array(all_tokens, dtype=np.uint16)
# write the bytes
with open(PRETOKENIZED_DATASET_FNAME, "wb") as f:
    f.write(all_tokens.tobytes())
# calculate the average sequence length (they are separated by BOS=1)
avg_seq_len = all_tokens.size / ((all_tokens == 1).sum())
print(f"Saved {PRETOKENIZED_DATASET_FNAME}, average seqlen: {avg_seq_len:.2f}")

824576it [20:32, 669.21it/s] 


Saved arXiv.bin, average seqlen: 357.92
