# BERT Tokenizer Training
Copyright (C) 2021 ServiceNow, Inc.

Notebook space for developing and testing a tokenizer trained on geo-domain data.  

Notebook can be run from top to bottom, but sections also generally run independently.

In [1]:
%load_ext autoreload
%autoreload 2

## Test script for tokenizer training

Add the project directory to the path to allow for importing modules from the nrcan directory. 

On toolkit, this should print something like `/nrcan_p2/workspace/<USERNAME>/nrcan_p2`, where the key piece is the last `/nrcan_p2` folder.

In [2]:
import pathlib
import sys

# In a notebook `__name__` is just the string '__main__', so the previous
# `pathlib.Path(__name__).parent` only resolved to the working directory by accident.
# State the intent directly: the repo root is two levels above the working directory.
REPO_DIR = pathlib.Path.cwd().parent.parent
print(REPO_DIR)

# Guard the append so that re-running this cell does not stack duplicate sys.path entries.
if str(REPO_DIR) not in sys.path:
    sys.path.append(str(REPO_DIR))

/nrcan_p2/workspace/lindsay/nrcan_p2


In [3]:
from nrcan_p2.tokenization.custom_tokenizer import (train_WordPiece)

In [4]:
# train_WordPiece(input_files = '/nrcan_p2/data/03_primary/v1/short_text_5M.txt', 
#                 save_path = '/nrcan_p2/data/06_models/tokenizers/geo_trained/')

The above cell is commented out because (a) the input file doesn't exist anymore, and (b) this was for testing, and this action is now done through a separate script. If it were run, it would output the following:
```
Argument `input_files` given as string; converting to list.
Input file exists: /nrcan_p2/data/03_primary/v1/short_text_5M.txt
Output directory exists: /nrcan_p2/data/06_models/tokenizers/geo_trained/
Model will be saved with filenamewordpiece_geo_short_text_5M_2021-02-03_14:46:34
```

## Scratchpad for developing tokenizer training script

This section includes logic used in development of `run_tokenizer_training.py` and `custom_tokenizer.py`.

In [5]:
import os
import re

In [6]:
from datetime import datetime
datetime.now(tz=None).strftime("%Y-%m-%d_%H:%M:%S")

'2021-03-12_14:37:11'

In [7]:
"wordpiece_geo_" + \
        os.path.splitext('safd/asfad/thispart.txt'.rsplit(sep='/', maxsplit=1)[-1])[0]

'wordpiece_geo_thispart'

In [8]:
string = "/nrcan_p2/data/06_models/tokenizers/geo_trained/"

import sys
# Guard clause: report and stop as soon as the output directory turns out to be missing.
if not os.path.isdir(string):
    print('Output directory does not exist: ' + string)
    sys.exit('Output directory not found: ' + string)
print('Output directory exists: ' + string)

Output directory exists: /nrcan_p2/data/06_models/tokenizers/geo_trained/


In [9]:
dir_name = "/nrcan_p2/data/06_models/tokenizers/geo_trained/"
base_filename = 'tokenizername'

os.path.join(dir_name, base_filename + "_otherinfo" + '.json')

'/nrcan_p2/data/06_models/tokenizers/geo_trained/tokenizername_otherinfo.json'

In [10]:
input_files = "/nrcan_p2/data/06_models/tokenizers/geo_trained/"

if not(isinstance(input_files, list)):
    print('Please give input_files as list.')

Please give input_files as list.


In [11]:
input_files = "/nrcan_p2/data/06_models/tokenizers/geo_trained/"

if isinstance(input_files, str):
    input_files = [input_files]

print(input_files)
isinstance(input_files, list)

['/nrcan_p2/data/06_models/tokenizers/geo_trained/']


True

In [12]:
input_files = ["/nrcan_p2/data/04_feature/v4_B/all_text_SIMPLE_PIPELINE_BERT_3_POSTPIPE_BERT_SPACY_2_dB_v1/train.txt"]
# input_files = ["/nrcan_p2/data/03_primary/v1/short_text_5M.txt"]

# Split the first path once into (..., parent directory, file name).
path_parts = input_files[0].rsplit(sep='/', maxsplit=2)
# Default label: the file name without its extension.
filename_base = os.path.splitext(path_parts[-1])[0]
# A file called `train.*` says nothing about the dataset, so use the parent directory name.
if filename_base == 'train':
    filename_base = os.path.splitext(path_parts[-2])[0]
filename_base

'all_text_SIMPLE_PIPELINE_BERT_3_POSTPIPE_BERT_SPACY_2_dB_v1'

In [13]:
print(f"Training starting at: {datetime.now(tz=None).strftime('%Y-%m-%d %H:%M:%S')}")

Training starting at: 2021-03-12 14:37:12


## Add new tokens to pretrained tokenizer using `add_tokens`, and examine behaviour (spoiler alert: it's inconsistent)

In [14]:
from transformers import BertTokenizer
from tokenizers import Tokenizer, AddedToken
from transformers import AutoModel

In [15]:
# Read in standard tokenizer
tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

In [16]:
# Sidenote: Do BERT and DistilBERT tokenizers have the same vocabulary?
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenizer.vocab.keys() == tokenizer_bert.vocab.keys()

True

In [17]:
# Check for some tokens in tokenizer
probe_tokens = ["geology", "##ozoic", "meso", "america", "paleo",
                "ization", "##ization", "##tation", "##ceous", "paleontologist"]
# A token counts as present if it is in the base vocab or among the added tokens.
known_tokens = tokenizer.vocab.keys() | tokenizer.added_tokens_encoder.keys()
for my_token in probe_tokens:
    print(f'"{my_token}": {my_token in known_tokens}')

"geology": True
"##ozoic": False
"meso": False
"america": True
"paleo": False
"ization": False
"##ization": True
"##tation": True
"##ceous": False
"paleontologist": False


In [18]:
tokenizer.tokenize("paleozoic mesoamerica meamerica crustaceous paleontologist")

['pale',
 '##oz',
 '##oic',
 'me',
 '##so',
 '##ame',
 '##rica',
 'me',
 '##ame',
 '##rica',
 'crust',
 '##aceous',
 'pale',
 '##ont',
 '##ologist']

In [19]:
len(tokenizer)

30522

In [20]:
# Add new tokens to tokenizer
new_vocab = [AddedToken("ceous", single_word=False), 
             AddedToken("meso", single_word=False), 
             AddedToken("paleo", single_word=False),
             AddedToken("zoic", single_word=False),
             AddedToken("paleontologist", single_word=True)]
tokenizer.add_tokens(new_vocab)

5

In [21]:
len(tokenizer)

30527

In [22]:
# Check for some tokens in tokenizer
probe_tokens = ["geology", "##ozoic", "meso", "america", "paleo",
                "ization", "##ization", "##tation", "##ceous", "paleontologist"]
# A token counts as present if it is in the base vocab or among the added tokens.
known_tokens = tokenizer.vocab.keys() | tokenizer.added_tokens_encoder.keys()
for my_token in probe_tokens:
    print(f'"{my_token}": {my_token in known_tokens}')

"geology": True
"##ozoic": False
"meso": True
"america": True
"paleo": True
"ization": False
"##ization": True
"##tation": True
"##ceous": False
"paleontologist": True


In [23]:
tokenizer.tokenize("paleozoic mesoamerica meamerica crustaceous paleontologist")

['paleo',
 'zoic',
 'meso',
 'america',
 'me',
 '##ame',
 '##rica',
 'crust',
 '##a',
 'ceous',
 'paleontologist']

In [24]:
tokenizer.tokenize("mesoamerica meamerica mesoamerica")

['meso', 'america', 'me', '##ame', '##rica', 'meso', 'america']

In [25]:
print(tokenizer.tokenize("mesoamerica"))
print(tokenizer.tokenize("tableamerica"))
print(tokenizer.tokenize("america"))

['meso', 'america']
['table', '##ame', '##rica']
['america']


* After an added token, the rest of the word is treated as a separate whole word if possible (greedy).
* After a regular token that is a whole word, the rest of the word is tokenized into sub-words.

For posterity, to demonstrate, the cell above:
```
print(tokenizer.tokenize("mesoamerica"))
print(tokenizer.tokenize("tableamerica"))
print(tokenizer.tokenize("america"))
```

gives the following output:

```
['meso', 'america']
['table', '##ame', '##rica']
['america']
```

where `meso` is an added token, and `table` was present in the pretrained tokenizer.

## Code for saving and re-reading tokenizer, with original and modified `vocab.txt`

This approach leads to expected tokenizer behaviour, in contrast with the `add_tokens` method.

In [26]:
from transformers import BertTokenizer
import os

In [27]:
tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

In [28]:
tokenizer.save_pretrained('/nrcan_p2/data/06_models/tokenizers/testing/generic/')

('/nrcan_p2/data/06_models/tokenizers/testing/generic/tokenizer_config.json',
 '/nrcan_p2/data/06_models/tokenizers/testing/generic/special_tokens_map.json',
 '/nrcan_p2/data/06_models/tokenizers/testing/generic/vocab.txt',
 '/nrcan_p2/data/06_models/tokenizers/testing/generic/added_tokens.json')

In [29]:
tokenizer_read = BertTokenizer.from_pretrained('/nrcan_p2/data/06_models/tokenizers/testing/generic/')

Test tokenization with original tokenizer (saved and read back in):

In [30]:
# Tokenization with original tokenizer
tokenizer_read.tokenize("paleozoic paleontologist mesoamerica mesozoic america")

['pale',
 '##oz',
 '##oic',
 'pale',
 '##ont',
 '##ologist',
 'me',
 '##so',
 '##ame',
 '##rica',
 'me',
 '##so',
 '##zo',
 '##ic',
 'america']

Copy manually modified vocab to model directory `vocab.txt`:

In [31]:
!cp /nrcan_p2/data/06_models/tokenizers/testing/vocab_lists/vocab_edited.txt /nrcan_p2/data/06_models/tokenizers/testing/generic/vocab.txt 

This added the following tokens to the vocabulary:
- `##zoic`
- `paleontologist`
- `meso`
- `paleo`

Read model in (including modified `vocab.txt`) and test tokenization:

In [32]:
tokenizer_read = BertTokenizer.from_pretrained('/nrcan_p2/data/06_models/tokenizers/testing/generic/')

In [33]:
tokenizer_read.tokenize("paleozoic paleontologist mesoamerica mesozoic america")

['paleo',
 '##zoic',
 'paleontologist',
 'meso',
 '##ame',
 '##rica',
 'meso',
 '##zoic',
 'america']

---

## Geo tokenizer, analysis of new tokens

### Original tokenizer vocab (full datasets)

_The conclusion of this section is that there are too many messy tokens for this to be useful without substantial manual input into selecting new tokens. To address this, we created a new dataset (very clean but much smaller) using document metadata, and used that to train the geology tokenizer._

In [34]:
from transformers import BertTokenizer
import csv
import os
import pandas as pd
#import numpy as np

Get vocab from generic tokenizer:

In [35]:
tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

Standard BERT tokenizer vocab list:

In [36]:
bert = list(tokenizer.vocab.keys())
len(bert)

30522

Get vocab from geo tokenizer:

In [37]:
os.listdir('/nrcan_p2/data/06_models/tokenizers/geo_trained')

['.ipynb_checkpoints',
 'wordpiece_geo_short_text_5M_2021-02-03_21:55:51-vocab.txt',
 'wordpiece_geo_short_text_5M_2021-02-03_21:55:51.json',
 'wordpiece_geo_all_text_SIMPLE_PIPELINE_BERT_3_POSTPIPE_BERT_SPACY_2_dA_full_v1_20210209_004618-vocab.txt',
 'wordpiece_geo_all_text_SIMPLE_PIPELINE_BERT_3_POSTPIPE_BERT_SPACY_2_dA_full_v1_20210209_004618.json',
 'wordpiece_geo_all_text_SIMPLE_PIPELINE_BERT_3_POSTPIPE_BERT_SPACY_2_dB_v1_20210209_004703-vocab.txt',
 'wordpiece_geo_all_text_SIMPLE_PIPELINE_BERT_3_POSTPIPE_BERT_SPACY_2_dB_v1_20210209_004703.json',
 'wordpiece_geo_all_text_SIMPLE_PIPELINE_BERT_3_POSTPIPE_BERT_SPACY_2_dA_full_dB_v1_20210209_004417-vocab.txt',
 'wordpiece_geo_all_text_SIMPLE_PIPELINE_BERT_3_POSTPIPE_BERT_SPACY_2_dA_full_dB_v1_20210209_004417.json',
 'tokens_head.txt',
 'wordpiece_geo_EAIDown.xml_processed_nosentences_20210223_161710-vocab.txt',
 'wordpiece_geo_EAIDown.xml_processed_nosentences_20210223_161710.json']

In [38]:
vocab_file = os.path.join('/nrcan_p2/data/06_models/tokenizers/geo_trained', 
                          'wordpiece_geo_all_text_SIMPLE_PIPELINE_BERT_3_POSTPIPE_BERT_SPACY_2_dA_full_v1_20210209_004618-vocab.txt')

In [39]:
# Build the frame straight from the file's lines instead of `pd.read_csv(..., sep="\n")`.
# read_csv applies CSV semantics (quote characters, NA detection, blank-line skipping) that
# are wrong for a one-token-per-line vocab file: the recorded output of the old call shows
# 30521 rows, whereas a plain read of the same file yields 30522 tokens. Newer pandas
# versions also reject "\n" as a separator outright.
with open(vocab_file, encoding='utf-8') as f:
    geo_df = pd.DataFrame({'token': f.read().splitlines()})
geo_df

Unnamed: 0,token
0,[UNK]
1,[CLS]
2,[SEP]
3,[PAD]
4,[MASK]
...,...
30517,085
30518,beset
30519,nipis
30520,traditions


In [40]:
print(f'5000th token: {geo_df.iloc[5000,0]}')

5000th token: ##rd


In [41]:
with open(vocab_file) as f:
    temp = f.read()
    geo = temp.splitlines()

In [42]:
print(f'number of tokens: {len(geo)}')
print(f'5000th token: {geo[5000]}')

number of tokens: 30522
5000th token: ##rd


In [43]:
geo_new = list(set(geo).difference(set(bert)))
len(geo_new)

17686

In [44]:
# Spot-check ten vocab rows. A set gives constant-time membership instead of copying and
# scanning the `geo_new` list for every row (the unused `n = 10` assignment was dropped).
geo_new_lookup = set(geo_new)
for n in range(1000, 1010):
    print(f'{geo_df.iloc[n,0]} is new? {geo_df.iloc[n,0] in geo_new_lookup}')

km is new? False
howe is new? False
##ott is new? False
limited is new? False
brown is new? False
side is new? False
represent is new? False
many is new? False
##umm is new? True
fo is new? True


In [45]:
# Vectorised membership test; the previous per-row `in list(geo_new)` copied and scanned
# the whole list once per token (quadratic in the vocab size).
geo_df['is_new'] = geo_df['token'].isin(geo_new)

In [46]:
# Flag tokens containing the WordPiece continuation marker. `astype(str)` mirrors the
# previous `str(token)` so non-string cells (e.g. NaN) are handled the same way, and
# `regex=False` makes '##' a literal substring match.
geo_df['sub_word'] = geo_df['token'].astype(str).str.contains('##', regex=False)

In [47]:
geo_df[geo_df['is_new']==True]

Unnamed: 0,token,is_new,sub_word
195,wh,True,False
234,canad,True,False
239,##pos,True,True
240,##olog,True,True
254,".,",True,False
...,...,...,...
30516,urss,True,False
30517,085,True,False
30518,beset,True,False
30519,nipis,True,False


I want to check if something is a full English word or not.

I tested `langdetect`, which was not helpful; the output is preserved in markdown below to keep the learnings, even though the module is no longer installed in this container.

In [48]:
# !poetry config virtualenvs.create false; poetry add langdetect

In [49]:
# from langdetect import detect as detect_lang
# from langdetect import detect_langs
# detect_lang(string) <-- should return a string
# detect_langs(string) <-- should return a list of lang objects (lang.prob, lang.lang to get the probability and the language)

In [50]:
# from langdetect import detect as detect_lang
# from langdetect import detect_langs

In [51]:
# for i in range(300,310):
#     print(geo_df['token'][i] + ': ' + detect_lang(geo_df['token'][i]))

Output:
```
au: fr
per: id
##ont: fi
riv: it
inter: nl
which: en
##ence: es
##iz: hr
no: tl
##ost: fi
```

In [52]:
# detect_langs('canada')
# detect_langs('canad')

Output: 
```
[so:0.5714273198131568, es:0.42857250393263935]
```

`langdetect` doesn't work as desired. It predicts which language something is, and sometimes predicts that single English words are other languages.

Further explore new tokens. Note that output is reduced for ease of reading; remove `.head(20)` to see full data frame.

In [53]:
pd.set_option('display.max_rows', None)

In [54]:
geo_df[(geo_df['is_new']==True) & (geo_df['sub_word']==False)].head(20)

Unnamed: 0,token,is_new,sub_word
195,wh,True,False
234,canad,True,False
254,".,",True,False
271,comp,True,False
275,gra,True,False
292,geolog,True,False
294,gre,True,False
296,sou,True,False
303,riv,True,False
316,cont,True,False


In [55]:
geo_df[(geo_df['is_new']==True) & (geo_df['sub_word']==True)].head(20)

Unnamed: 0,token,is_new,sub_word
239,##pos,True,True
240,##olog,True,True
286,##iqu,True,True
331,##posit,True,True
378,##orm,True,True
419,##iment,True,True
450,##erv,True,True
467,##ediment,True,True
500,##ould,True,True
502,##cess,True,True


In [56]:
for i in ['meso', 'paleo', 
          '##aceous', '##ceous', '##ceou', '##eous', 
          '##ozoic', '##zoic', "##ozoi"]:
    print(f'bert: {i in bert}   geo: {i in geo}     {i}')

bert: False   geo: True     meso
bert: False   geo: True     paleo
bert: True   geo: True     ##aceous
bert: False   geo: True     ##ceous
bert: False   geo: False     ##ceou
bert: True   geo: False     ##eous
bert: False   geo: True     ##ozoic
bert: False   geo: False     ##zoic
bert: False   geo: False     ##ozoi


In [57]:
for i in ['meso', 'paleo', 
          'aceous', 'ceous', 'ceou', 'eous', 
          'ozoic', 'zoic', "ozoi"]:
    print(f'bert: {i in bert}   geo: {i in geo}     {i}')

bert: False   geo: True     meso
bert: False   geo: True     paleo
bert: False   geo: True     aceous
bert: False   geo: True     ceous
bert: False   geo: False     ceou
bert: False   geo: False     eous
bert: False   geo: False     ozoic
bert: False   geo: False     zoic
bert: False   geo: False     ozoi


In [58]:
geo_df[geo_df['token'].isin(['aceous', 'ceous', 'ceou', 'eous', 
                             'ozoic', 'zoic', "ozoi",
                             '##aceous', '##ceous', '##ceou', '##eous', 
                             '##ozoic', '##zoic', "##ozoi"])]

Unnamed: 0,token,is_new,sub_word
1148,##ceous,True,True
1162,##ozoic,True,True
2469,##aceous,False,True
20600,ceous,True,False
23916,aceous,True,False


---
### New, cleaner tokenizer vocab (trained on metadata only)

_Tokenizer training on this new dataset produced much cleaner tokens, and this is what we used to create the geology-specific tokenizer._

In [59]:
from transformers import BertTokenizer
import csv
import os
import re
import pandas as pd

Get vocab from generic BERT tokenizer:

In [60]:
tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
bert = list(tokenizer.vocab.keys())
len(bert)

30522

Get vocab from shiny new, clean geo tokenizer (on small data):

In [61]:
[file for file in os.listdir('/nrcan_p2/data/06_models/tokenizers/geo_trained') if 'nosentences' in file]

['wordpiece_geo_EAIDown.xml_processed_nosentences_20210223_161710-vocab.txt',
 'wordpiece_geo_EAIDown.xml_processed_nosentences_20210223_161710.json']

In [62]:
vocab_file = os.path.join('/nrcan_p2/data/06_models/tokenizers/geo_trained', 
                          'wordpiece_geo_EAIDown.xml_processed_nosentences_20210223_161710-vocab.txt')

In [63]:
with open(vocab_file) as f:
    temp = f.read()
    geo = temp.splitlines()

In [64]:
geo_df = pd.DataFrame.from_dict({'token': geo})
geo_df.head(10)

Unnamed: 0,token
0,[UNK]
1,[CLS]
2,[SEP]
3,[PAD]
4,[MASK]
5,!
6,""""
7,#
8,$
9,%


In [65]:
geo_new = list(set(geo).difference(set(bert)))
len(geo_new)

17865

There are 17,865 tokens that are in the new geology tokenizer and not in the original pretrained tokenizer.

In [66]:
# Vectorised versions of the per-row checks; the previous list comprehensions copied and
# scanned `geo_new` once per token (quadratic in the vocab size).
geo_df['is_new'] = geo_df['token'].isin(geo_new)
# `astype(str)` mirrors the previous `str(token)`; `regex=False` matches '##' literally.
geo_df['sub_word'] = geo_df['token'].astype(str).str.contains('##', regex=False)

In [67]:
pd.get_option('display.max_rows')
pd.set_option('display.max_rows', None)
# pd.set_option('display.max_rows', 60)

In [68]:
geo_df[geo_df['is_new']==True].reset_index().iloc[:30,]

Unnamed: 0,index,token,is_new,sub_word
0,83,¯,True,False
1,91,¸,True,False
2,312,dis,True,False
3,316,##olog,True,True
4,318,##pos,True,True
5,320,canad,True,False
6,329,str,True,False
7,363,comp,True,False
8,385,##iment,True,True
9,396,geolog,True,False


In [69]:
geo_new_ordered = list(geo_df[geo_df['is_new']==True].reset_index()['token'])

# Quick sanity check on length
print(len(geo_new_ordered))
print(len(geo_new))

17865
17865


In [70]:
# Map each new token to its position in `geo_new_ordered` once, instead of running
# `in` + `.index()` (two linear scans) for every row of the frame.
# `setdefault` keeps the first position if a token repeats, matching `list.index`.
rank_by_token = {}
for rank, token in enumerate(geo_new_ordered):
    rank_by_token.setdefault(token, rank)
# Tokens that are not new get the placeholder '-'.
geo_df['rank_new'] = [rank_by_token.get(token, '-') for token in geo_df['token']]

Some interesting notes:

- `##iment`, `##olog`, and `geolog` are in the top 10.  
- `depos`, `##ediment`, `terr` are in the next 10.
- `).` and `),` are both present; this is likely the effect of text with citations.  Also, it suggests that punctuation is kept together, which I hadn't thought about.
- Other interesting tokens that have clear geographic roots: 
    - `##ict`
    - `##uct`
    - `glac`
    - `##formation`
    - `##anic`
    - `##arbon`
    - `##netic`
    - `seism`
    - `olution`
    - `##atigraph`
    - `##orph`
- Three in a row, most of the way down: `##maf`, `##mafrost`, `permafrost`

There are 994 `[unusedXX]` tokens (first 999 spots, minus 5 special tokens on lines 1 and 101-104: `[PAD]`, `[UNK]`, `[CLS]`, `[SEP]`, `[MASK]`).

I'm considering keeping 500 or 994 tokens.

What are the top 994 geology tokens?

In [71]:
geo_df[geo_df['is_new']==True].reset_index(drop=False).iloc[:30,]

Unnamed: 0,index,token,is_new,sub_word,rank_new
0,83,¯,True,False,0
1,91,¸,True,False,1
2,312,dis,True,False,2
3,316,##olog,True,True,3
4,318,##pos,True,True,4
5,320,canad,True,False,5
6,329,str,True,False,6
7,363,comp,True,False,7
8,385,##iment,True,True,8
9,396,geolog,True,False,9


Follow the merges for some tokens to better understand the process:

- `geolog`
- `mesozoic` +/- `paleozoic`

In [72]:
def find_merges(my_str):
    """List every geo vocab token that occurs as a substring of `my_str`.

    A token matches when its text, with any '##' marker removed, appears anywhere in
    `my_str`. Reads the notebook-level lists `geo` (full geo vocab, in file order) and
    `geo_new_ordered` (tokens absent from the generic vocab, in the same order).

    Parameters
    ----------
    my_str : str
        Word or phrase to look tokens up in.

    Returns
    -------
    pandas.DataFrame
        One row per matching token, in vocab order, with columns:
        `index` (position in `geo`), `rank_new` (position in `geo_new_ordered`, or '-'),
        `token`, `is_new` ('new' or '-') and `sub_word` ('##' or '-').
    """
    # One dict replaces the repeated `in list` / `.index()` scans of `geo_new_ordered`;
    # `setdefault` keeps the first position, matching `list.index`.
    rank_by_token = {}
    for rank, token in enumerate(geo_new_ordered):
        rank_by_token.setdefault(token, rank)

    # `enumerate` yields each token's vocab position without a second `geo.index()` scan.
    matches = [(position, token) for position, token in enumerate(geo)
               if token.replace('##', '') in my_str]
    tokens = [token for _, token in matches]

    merges = pd.DataFrame.from_dict({'index': [position for position, _ in matches],
                                     'rank_new': [rank_by_token.get(token, '-') for token in tokens],
                                     'token': tokens})
    merges['is_new'] = ['new' if token in rank_by_token else '-' for token in tokens]
    merges['sub_word'] = ['##' if '##' in token else '-' for token in tokens]
    return merges

In [73]:
geo_new_ordered.index('geolog')

9

In [74]:
find_merges('geology')

Unnamed: 0,index,rank_new,token,is_new,sub_word
0,46,-,e,-,-
1,48,-,g,-,-
2,53,-,l,-,-
3,56,-,o,-,-
4,66,-,y,-,-
5,107,-,##o,-,##
6,109,-,##l,-,##
7,113,-,##e,-,##
8,125,-,##g,-,##
9,134,-,##y,-,##


Observations: 

- Words are not necessarily built up one letter at a time. 
    - This actually makes sense; the process isn't merging single letters to multi-letter tokens, it's just merging tokens, so that can be multi-letter token + multi-letter token.
        - E.g., `ol` + `og` = `olog`, which is new for this vocab list; `log` happens later (and `olog` doesn't happen at all), so it's not `o` + `log`. 
        - FYI, it's like this:
            - `##ol` + `##og` = `##olog`
            - `ge` + `##olog` = `geolog`
            - `geolog` + `##y` = `geology`
         - The above happens before even `##ology` exists.
    - So when thinking about branching (e.g., if something like `ozoic` is present in multiple other full words), it's not as simple as adding one letter.
- `geology` does exist in the original vocab list even though certain sub-word tokens do not (`##olog`, `geolog`, `geol`, etc.), so it must have been built up in a different way.

In [75]:
find_merges('mesozoic')

Unnamed: 0,index,rank_new,token,is_new,sub_word
0,44,-,c,-,-
1,46,-,e,-,-
2,50,-,i,-,-
3,54,-,m,-,-
4,56,-,o,-,-
5,60,-,s,-,-
6,67,-,z,-,-
7,106,-,##i,-,##
8,107,-,##o,-,##
9,110,-,##m,-,##


In [76]:
find_merges('paleozoic')

Unnamed: 0,index,rank_new,token,is_new,sub_word
0,42,-,a,-,-
1,44,-,c,-,-
2,46,-,e,-,-
3,50,-,i,-,-
4,53,-,l,-,-
5,56,-,o,-,-
6,57,-,p,-,-
7,67,-,z,-,-
8,106,-,##i,-,##
9,107,-,##o,-,##


In [77]:
find_merges('paleozoic mesozoic')

Unnamed: 0,index,rank_new,token,is_new,sub_word
0,42,-,a,-,-
1,44,-,c,-,-
2,46,-,e,-,-
3,50,-,i,-,-
4,53,-,l,-,-
5,54,-,m,-,-
6,56,-,o,-,-
7,57,-,p,-,-
8,60,-,s,-,-
9,67,-,z,-,-


---
### Do substitutions and write vocab.txt files based on the above (cleaner tokens)

This code is in two scripts: `download_save_pretrained_tokenizer.py` and `create_geo_tokenizers.py`.

Code that writes to the operating system has been commented out to prevent making unintentional changes.

In [78]:
from transformers import BertTokenizer
import os

In [79]:
bert_tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

In [80]:
# bert_tokenizer.save_pretrained('/nrcan_p2/data/06_models/tokenizers/distilbert-base-uncased/')

The above is commented out to be sure that the files do not change, but when it was first run, it output the following:
```
('/nrcan_p2/data/06_models/tokenizers/distilbert-base-uncased/tokenizer_config.json',
 '/nrcan_p2/data/06_models/tokenizers/distilbert-base-uncased/special_tokens_map.json',
 '/nrcan_p2/data/06_models/tokenizers/distilbert-base-uncased/vocab.txt',
 '/nrcan_p2/data/06_models/tokenizers/distilbert-base-uncased/added_tokens.json')
 ```

Read in generic bert `vocab.txt`

In [81]:
bert_vocab_file = os.path.join('/nrcan_p2/data/06_models/tokenizers/distilbert-base-uncased', 
                          'vocab.txt')
with open(bert_vocab_file) as f:
    temp = f.read()
    bert = temp.splitlines()

Read in geo tokens:

In [82]:
vocab_file = os.path.join('/nrcan_p2/data/06_models/tokenizers/geo_trained', 
                          'wordpiece_geo_EAIDown.xml_processed_nosentences_20210223_161710-vocab.txt')
with open(vocab_file) as f:
    temp = f.read()
    geo = temp.splitlines()

In [83]:
# Sanity test
len(bert) == len(geo)

True

Determine which tokens are new:

In [84]:
geo_new = list(set(geo).difference(set(bert)))
len(geo_new)

17865

In [85]:
# Keep the new tokens in vocab-file order. Testing membership against a set avoids
# scanning the `geo_new` list once for every token in `geo`.
geo_new_lookup = set(geo_new)
geo_new_ordered = [token for token in geo if token in geo_new_lookup]

In [86]:
geo_new_ordered[0:10]

['¯',
 '¸',
 'dis',
 '##olog',
 '##pos',
 'canad',
 'str',
 'comp',
 '##iment',
 'geolog']

Determine indices of `[unusedXX]` tokens in generic tokenizer:

In [87]:
# Positions of the `[unused…]` placeholder slots in the generic vocab. `enumerate` yields
# each position directly instead of re-searching the list with `bert.index(token)`.
unused_indices = [position for position, token in enumerate(bert) if '[unused' in token]
len(unused_indices) # Should be 994

994

Function to substitute new geo tokens for `[unused]` tokens:

(_Note: `count` must not exceed the number of `[unused]` slots — 994, the length of `unused_indices` — otherwise the substitution would index past the end of that list; the function below asserts this up front._)

In [88]:
def subst_geo_for_unused(count):
    """Return a copy of the generic vocab with `[unused…]` slots replaced by geo tokens.

    The first `count` positions listed in `unused_indices` are overwritten, in order, with
    the first `count` tokens of `geo_new_ordered`. The notebook-level `bert` list itself is
    not modified.

    Parameters
    ----------
    count : int
        Number of `[unused…]` slots to replace.

    Returns
    -------
    list
        New vocab list, the same length as `bert`.

    Raises
    ------
    AssertionError
        If `count` exceeds the number of available slots or of new geo tokens.
    """
    # Derive the limit from the data rather than hard-coding 994, and bound it by the
    # number of geo tokens too, so neither list can be indexed past its end.
    max_count = min(len(unused_indices), len(geo_new_ordered))
    assert count <= max_count, f'count must be less than or equal to {max_count}'
    vocab = bert.copy()
    for i in range(count):
        vocab[unused_indices[i]] = geo_new_ordered[i]
    return vocab

Test above function:

In [89]:
test = subst_geo_for_unused(993)

In [90]:
test[990:1005]

['munic',
 '##istocene',
 'strateg',
 'arg',
 '##epend',
 'hydroge',
 'mineralogy',
 'sn',
 '[unused993]',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&']

In [91]:
bert_geo_250 = subst_geo_for_unused(250)
print(f'length = {len(bert_geo_250)}')
print(bert_geo_250[100], bert[100])
print(bert_geo_250[200], bert[200])
print(bert_geo_250[300], bert[300])
print(bert_geo_250[3000], bert[3000])

length = 30522
[UNK] [UNK]
scoti [unused195]
[unused295] [unused295]
paris paris


Use above function in wrapper function that writes the new `vocab.txt` to the appropriate directory:

In [92]:
def write_new_vocab(count, base_dir='/nrcan_p2/data/06_models/tokenizers/bert_geo'):
    """Write the vocab with `count` substituted geo tokens to `<base_dir>/bert_geo_<count>/vocab.txt`.

    The vocab comes from `subst_geo_for_unused(count)`. Tokens are written one per line,
    with no trailing newline after the last token.

    Parameters
    ----------
    count : int
        Number of `[unused…]` slots to replace; also used in the output directory name.
    base_dir : str, optional
        Directory under which the `bert_geo_<count>` folder is placed. Defaults to the
        project's tokenizer directory.
    """
    vocab = subst_geo_for_unused(count)
    out_dir = os.path.join(base_dir, f'bert_geo_{count}')
    # Create the target folder if needed so a new `count` does not fail with FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)
    # The vocab contains non-ASCII tokens, so fix the encoding instead of relying on the
    # platform default.
    with open(os.path.join(out_dir, 'vocab.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(str(token) for token in vocab))

Run this for the selected token counts:

In [93]:
# write_new_vocab(250)
# write_new_vocab(500)
# write_new_vocab(994)

Copy other tokenizer files appropriately. Should be made more programmatic and included in previous function.

`special_tokens_map.json`

In [94]:
# !cp /nrcan_p2/data/06_models/tokenizers/distilbert-base-uncased/special_tokens_map.json \
# /nrcan_p2/data/06_models/tokenizers/bert_geo/bert_geo_250/special_tokens_map.json

In [95]:
# !cp /nrcan_p2/data/06_models/tokenizers/distilbert-base-uncased/special_tokens_map.json \
# /nrcan_p2/data/06_models/tokenizers/bert_geo/bert_geo_500/special_tokens_map.json

`tokenizer_config.json`

In [96]:
# !cp /nrcan_p2/data/06_models/tokenizers/distilbert-base-uncased/tokenizer_config.json \
# /nrcan_p2/data/06_models/tokenizers/bert_geo/bert_geo_250/tokenizer_config.json

In [97]:
# !cp /nrcan_p2/data/06_models/tokenizers/distilbert-base-uncased/tokenizer_config.json \
# /nrcan_p2/data/06_models/tokenizers/bert_geo/bert_geo_500/tokenizer_config.json

In [98]:
count = 300
token_dir = f'/nrcan_p2/data/06_models/tokenizers/bert_geo/bert_geo_{count}/'
# Note: this creates the tokenizer folder on disk if it is missing.
if not os.path.exists(token_dir):
    os.makedirs(token_dir)
# Build the shell command that would copy the pretrained special-tokens map into that
# folder; it is only run if the `os.system` line below is uncommented.
special_tokens_src = '/nrcan_p2/data/06_models/tokenizers/distilbert-base-uncased/special_tokens_map.json'
special_tokens_dst = os.path.join(token_dir, "special_tokens_map.json")
cmd = f'cp {special_tokens_src} {special_tokens_dst}'
# os.system(cmd)

---
# Testing + creating examples/calculations for the report

### Prep

In [1]:
from transformers import BertTokenizer
import os

In [2]:
tokenizer_250 = BertTokenizer.from_pretrained('/nrcan_p2/data/06_models/tokenizers/bert_geo/bert_geo_250')

In [3]:
tokenizer_500 = BertTokenizer.from_pretrained('/nrcan_p2/data/06_models/tokenizers/bert_geo/bert_geo_500')

In [4]:
tokenizer_994 = BertTokenizer.from_pretrained('/nrcan_p2/data/06_models/tokenizers/bert_geo/bert_geo_994')

### Test tokenizers

Test tokenization with each of the new geo tokenizers (250, 500, and 994 substituted tokens):

In [103]:
tokenizer_250.tokenize("paleozoic paleontologist mesoamerica mesozoic america")

['pale',
 '##ozoic',
 'pale',
 '##ont',
 '##ologist',
 'me',
 '##so',
 '##ame',
 '##rica',
 'me',
 '##so',
 '##zo',
 '##ic',
 'america']

In [104]:
tokenizer_500.tokenize("paleozoic paleontologist mesoamerica mesozoic america")

['paleozoic',
 'pale',
 '##ont',
 '##ologist',
 'me',
 '##so',
 '##ame',
 '##rica',
 'me',
 '##so',
 '##zo',
 '##ic',
 'america']

In [105]:
tokenizer_994.tokenize("paleozoic paleontologist mesoamerica mesozoic america")

['paleozoic',
 'paleo',
 '##nto',
 '##logist',
 'mes',
 '##oa',
 '##meric',
 '##a',
 'mesozoic',
 'america']

### Comparisons between tokenizers

Original tokenizer:

In [106]:
bert_tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

In [107]:
def compare_tokenizers(mystr):
    """Print how each tokenizer splits `mystr`.

    For the generic tokenizer and the three geo tokenizers in turn, prints a blank line,
    the tokenizer's label followed by a colon, and then its token list for `mystr`.
    """
    labelled_tokenizers = (
        ('bert_tokenizer', bert_tokenizer),
        ('bert_geo_250', tokenizer_250),
        ('bert_geo_500', tokenizer_500),
        ('bert_geo_994', tokenizer_994),
    )
    for label, labelled_tokenizer in labelled_tokenizers:
        print(f'\n{label}:')
        print(labelled_tokenizer.tokenize(mystr))

In [108]:
compare_tokenizers('This geology sentence includes paleozoic, mesozoic, and seismology.')


bert_tokenizer:
['this', 'geology', 'sentence', 'includes', 'pale', '##oz', '##oic', ',', 'me', '##so', '##zo', '##ic', ',', 'and', 'se', '##ism', '##ology', '.']

bert_geo_250:
['this', 'geology', 'sentence', 'includes', 'pale', '##ozoic', ',', 'me', '##so', '##zo', '##ic', ',', 'and', 'seism', '##ology', '.']

bert_geo_500:
['this', 'geology', 'sentence', 'includes', 'paleozoic', ',', 'me', '##so', '##zo', '##ic', ',', 'and', 'seism', '##ology', '.']

bert_geo_994:
['this', 'geology', 'sentence', 'includes', 'paleozoic', ',', 'mesozoic', ',', 'and', 'seism', '##ology', '.']


In [109]:
print(bert_tokenizer.tokenize('The espresso machine added elegance to the quotidian task of morning caffeination.'))

['the', 'es', '##press', '##o', 'machine', 'added', 'elegance', 'to', 'the', 'quo', '##ti', '##dian', 'task', 'of', 'morning', 'caf', '##fe', '##ination', '.']


### Check proportions of subword tokens

In [5]:
# For each geo tokenizer, take the first `count + 5` vocab entries and drop the five
# special tokens; what remains is treated as that tokenizer's substituted geo tokens.
special_tokens = {'[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'}
new_tokens_dict = {
    f'new_tokens_{count}': set(list(geo_tokenizer.vocab.keys())[:count + len(special_tokens)]) - special_tokens
    for count, geo_tokenizer in ((250, tokenizer_250), (500, tokenizer_500), (994, tokenizer_994))
}

In [6]:
def get_subword_stats(count):
    """Print the sub-word statistics for the `new_tokens_<count>` entry of `new_tokens_dict`.

    A token counts as a sub-word if it contains '##'. The non-sub-word count and the
    percentage are both computed relative to `count`, and the percentage is rounded to
    two decimals.
    """
    candidates = new_tokens_dict[f'new_tokens_{count}']
    subword_count = len([token for token in candidates if '##' in token])
    non_subword_count = count - subword_count
    subword_percent = round(subword_count / count * 100, 2)
    report_lines = [
        '',
        f'new geo tokens: {count}',
        f'subwords: {subword_count} ',
        f'non-subwords: {non_subword_count}',
        f'subword percentage: {subword_percent}',
    ]
    print('\n'.join(report_lines))

In [7]:
get_subword_stats(250)
get_subword_stats(500)
get_subword_stats(994)


new geo tokens: 250
subwords: 89 
non-subwords: 161
subword percentage: 35.6

new geo tokens: 500
subwords: 181 
non-subwords: 319
subword percentage: 36.2

new geo tokens: 994
subwords: 319 
non-subwords: 675
subword percentage: 32.09


For posterity:

```
new geo tokens: 250
subwords: 89 
non-subwords: 161
subword percentage: 35.6

new geo tokens: 500
subwords: 181 
non-subwords: 319
subword percentage: 36.2

new geo tokens: 994
subwords: 319 
non-subwords: 675
subword percentage: 32.09
```