### Extract 1000 documents randomly

In [None]:
import sys
import os
sys.path.append(os.path.abspath(".."))

from utils import extract_json
from pathlib import Path

TRAIN_DIR = Path("../data/mex2/train")
ALL_SECTIONS_CSV = Path("../cache/all_sections.csv")
SAMPLED_SECTIONS_CSV = Path("../cache/sampled_1000_sections.csv")

df_all = extract_json.load_or_create_all_sections(TRAIN_DIR)
print(f"[INFO] All sections: {len(df_all):,} rows | Saved to {ALL_SECTIONS_CSV}")

df_sample = extract_json.load_or_sample_1000(df_all)
df_sample.to_csv(extract_json.SAMPLED_SECTIONS_CSV, index=False)
print(f"[INFO] Sampled sections: {len(df_sample):,} rows | Saved to {SAMPLED_SECTIONS_CSV}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131863 entries, 0 to 131862
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   file_name      131863 non-null  object
 1   section_index  131863 non-null  int64 
 2   section_title  131863 non-null  object
 3   text           131863 non-null  object
 4   section_id     131863 non-null  object
dtypes: int64(1), object(4)
memory usage: 5.0+ MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 131863 entries, 0 to 131862
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   file_name      131863 non-null  object
 1   section_index  131863 non-null  int64 
 2   section_title  131863 non-null  object
 3   text           131863 non-null  object
 4   section_id     131863 non-null  object
dtypes: int64(1), object(4)
memory usage: 6.0+ MB
None
[INFO] All sections: 131,863 rows | Saved to ../cache/all_s

### Baseline

In [6]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
import pandas as pd

working_dir = Path("../cache/sampled_1000_sections.csv")
working_df = pd.read_csv(working_dir)
working_df['baseline_tokenized'] = working_df['text'].apply(lambda x: tokenizer.tokenize(x))
working_df['baseline_token_count'] = working_df['baseline_tokenized'].apply(len)

In [23]:
working_df.head()

Unnamed: 0,file_name,section_index,section_title,text,section_id,baseline_tokenized,baseline_token_count
0,000efc17-13d8-433d-8f62-a3932fe4f3b8.json,7,Clinical vascular risk factors associated with...,Bivariate analyses and regression modeling wer...,000efc17-13d8-433d-8f62-a3932fe4f3b8.json:7,"[bi, ##var, ##iate, analyses, and, regression,...",379
1,002203f0-1c57-4400-abc1-b783c4085743.json,4,B. Proposed Hybrid Geometric-Statistical Feature,We propose to transform the feature function g...,002203f0-1c57-4400-abc1-b783c4085743.json:4,"[we, propose, to, transform, the, feature, fun...",891
2,0093af75-9166-4b57-bace-36389b77bab0.json,12,ADVANTAGES,The first and foremost advantage of this porta...,0093af75-9166-4b57-bace-36389b77bab0.json:12,"[the, first, and, foremost, advantage, of, thi...",396
3,00c01a25-ef21-4819-8232-7501c0d5f8a9.json,7,Data Analysis,All statistical analyses were conducted using ...,00c01a25-ef21-4819-8232-7501c0d5f8a9.json:7,"[all, statistical, analyses, were, conducted, ...",211
4,00c79aeb-a106-4206-914b-d93008ec00be.json,2,Methods,The sample was drawn from the Wisconsin Longit...,00c79aeb-a106-4206-914b-d93008ec00be.json:2,"[the, sample, was, drawn, from, the, wisconsin...",367


### Own Tokenizer (Bottom-up Approach)

In [24]:
from utils import own_bpe

corpus = working_df['text'].tolist()
merges = own_bpe.train_bpe(corpus, k=5000, min_freq=2)

working_df['own_bpe_tokenized'] = working_df['text'].apply(lambda s: own_bpe.bpe_tokenize(s, merges))
working_df['own_bpe_token_count'] = working_df['own_bpe_tokenized'].apply(len)


In [29]:
working_df['difference'] = working_df['own_bpe_token_count'] - working_df['baseline_token_count']
working_df['difference_percent'] = working_df['difference'] / working_df['baseline_token_count'] * 100

print(working_df[['baseline_token_count', 'own_bpe_token_count', 'difference', 'difference_percent']].to_markdown())
print(f"Average difference: {working_df['difference'].mean():.2f} tokens ({working_df['difference_percent'].mean():.2f}%)")

|     |   baseline_token_count |   own_bpe_token_count |   difference |   difference_percent |
|----:|-----------------------:|----------------------:|-------------:|---------------------:|
|   0 |                    379 |                   397 |           18 |             4.74934  |
|   1 |                    891 |                   993 |          102 |            11.4478   |
|   2 |                    396 |                   448 |           52 |            13.1313   |
|   3 |                    211 |                   225 |           14 |             6.63507  |
|   4 |                    367 |                   405 |           38 |            10.3542   |
|   5 |                     37 |                    42 |            5 |            13.5135   |
|   6 |                   1014 |                   997 |          -17 |            -1.67653  |
|   7 |                    249 |                   278 |           29 |            11.6466   |
|   8 |                    732 |                  

The difference in the number of tokens between the two methods shows an **average gap of 53.68 tokens, or about 10.91%**.

Since **BERT’s `bert-base-uncased` WordPiece tokenizer** was trained on a large corpus (≈30k vocabulary), it is better at generalizing token boundaries. In contrast, the **custom BPE tokenizer** was trained only on 1000 sampled texts and then applied directly on the same data.

* **WordPiece** produces fewer tokens than the custom BPE because it balances **frequency with probability** when selecting merges.
* **BPE** only considers the **most frequent adjacent pairs** in the training data.
* In my experiments with `k`, a **smaller k** led to more tokens (closer to character-level), while a **larger k** resulted in fewer tokens (closer to whole words).



### Sentence Segmentation

In [42]:
import nltk
nltk.download('punkt')
nltk.download("punkt_tab")
from nltk.tokenize import sent_tokenize

working_df['sentences'] = working_df['text'].apply(lambda s: sent_tokenize(s))
all_sentences = [sent for sents in working_df['sentences'] for sent in sents]

corpus = all_sentences.copy()
merges = own_bpe.train_bpe(all_sentences, k=5000, min_freq=2)

working_df['own_bpe_tokens_per_sentence'] = working_df['sentences'].apply(
    lambda sents: [own_bpe.bpe_tokenize(sent, merges) for sent in sents]
)
working_df['own_bpe_total_tokens'] = working_df['own_bpe_tokens_per_sentence'].apply(
    lambda tok_lists: sum(len(toks) for toks in tok_lists)
)

working_df['sentence_own_bpe_difference'] = working_df['own_bpe_total_tokens'] - working_df['own_bpe_token_count']
working_df['sentence_own_bpe_difference_percent'] = (working_df['sentence_own_bpe_difference'] / working_df['own_bpe_token_count']) * 100

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [43]:
print(working_df[['own_bpe_token_count', 'own_bpe_total_tokens', 'sentence_own_bpe_difference', 'difference_percent']].to_markdown())

|     |   own_bpe_token_count |   own_bpe_total_tokens |   sentence_own_bpe_difference |   difference_percent |
|----:|----------------------:|-----------------------:|------------------------------:|---------------------:|
|   0 |                   397 |                    397 |                             0 |             4.74934  |
|   1 |                   993 |                    993 |                             0 |            11.4478   |
|   2 |                   448 |                    448 |                             0 |            13.1313   |
|   3 |                   225 |                    225 |                             0 |             6.63507  |
|   4 |                   405 |                    405 |                             0 |            10.3542   |
|   5 |                    42 |                     42 |                             0 |            13.5135   |
|   6 |                   997 |                    997 |                             0 |            -1.6

Retraining it with sentence segmentation resulted in no difference at all.

### Sources
1. https://www.geeksforgeeks.org/nlp/byte-pair-encoding-bpe-in-nlp/
2. https://aclanthology.org/2024.findings-emnlp.860/
3. https://huggingface.co/google-bert/bert-base-uncased