In [1]:
# Identifier of the pretrained checkpoint to download
pretrained_model_name = "bert-base-uncased"
# Head to put on top of the checkpoint.
# Possible options: 'MaskedLM', 'SequenceClassification'
model_head = "MaskedLM"

In [2]:
# Root folder that holds every locally saved checkpoint
local_model_base_dir = '../local_models/'
# Name of the updated model: <pretrained name>_<head>_STR_option2
updated_model_name = f'{pretrained_model_name}_{model_head}_STR_option2'
# Directory the updated model and tokenizer are written to
model_checkpoint_dir = f'{local_model_base_dir}{updated_model_name}'
print(f'Updated model name will be:{updated_model_name}')

Updated model name will be:bert-base-uncased_MaskedLM_STR_option1


In [3]:
# Text file listing the new vocabulary tokens that will be added
new_tokens_file_path = "../data/token_files/option2_idiom_tokens.txt"

## 1. Download the required pretrained LM model

In [4]:
from transformers import AutoModelForMaskedLM, AutoModelForSequenceClassification
from transformers import AutoTokenizer

import re
import os
import sys
import shutil

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Map every supported head name to the Auto* class that builds it
supported_heads = {
    'MaskedLM': AutoModelForMaskedLM,
    'SequenceClassification': AutoModelForSequenceClassification,
}
if model_head not in supported_heads:
    raise ValueError(f'Model head {model_head} is not supported')
model_class = supported_heads[model_head]

# Fetch the pretrained weights wrapped in the selected head
print(f'Model class is:{model_class}')
model = model_class.from_pretrained(pretrained_model_name)
# Fetch the tokenizer that belongs to the same checkpoint
# NOTE(review): `truncation` is normally supplied when the tokenizer is called;
# confirm that passing it to from_pretrained has any effect.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name,
    use_fast=False,
    truncation=True,
)
print("Loaded both the LM Model & the Tokenizer models")

Model class is:<class 'types.AutoModelForMaskedLM'>


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loaded both the LM Model & the Tokenizer models


## 2. Update the Tokenizer & LM by adding single-token-representations

In [6]:
# Load the tokens to be inserted into the vocab (one token per line).
# The context manager closes the file handle deterministically, and the
# explicit encoding keeps the read independent of the platform default.
# Blank lines and surrounding whitespace (e.g. '\r' from CRLF files) are
# dropped so that no empty or padded token reaches the tokenizer.
with open(new_tokens_file_path, encoding='utf-8') as tokens_file:
    new_tokens = [line.strip() for line in tokens_file if line.strip()]
new_tokens_count = len(new_tokens)
print(f'Going to add {new_tokens_count} new tokens to the vocabulary')

Going to add 1738 new tokens to the vocabulary


In [7]:
# Register the new tokens with the tokenizer and keep the count it reports
num_added_toks = tokenizer.add_tokens(new_tokens)

# Grow the model's token embeddings to the tokenizer's new length,
# printing the embedding layer before and after for comparison
print(f'Input Embeddings before update: {model.get_input_embeddings()}')
updated_vocab_size = len(tokenizer)
embedding = model.resize_token_embeddings(updated_vocab_size)
print(f'Input Embeddings after update: {embedding}')

Input Embeddings before update: Embedding(30522, 768, padding_idx=0)
Input Embeddings after update: Embedding(32260, 768)


In [8]:
# Write the resized model, then the extended tokenizer, to the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_checkpoint_dir)
print(f'Added {num_added_toks} new tokens to the model at {model_checkpoint_dir}')

Added 1738 new tokens to the model at ../local_models/bert-base-uncased_MaskedLM_STR_option1


### Test the updated Tokenizer & LM models

In [9]:
# Re-create the tokenizer from the checkpoint directory so the checks below
# run against the vocabulary files stored there
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint_dir,
    use_fast=False,
    max_length=510,
    force_download=True,
)

In [10]:
# Test for MWE single-tokens
test_tokens = ['IDoffthebeatentrackID', 'IDchapterandverseID']

for token in test_tokens:
    pieces = tokenizer.tokenize(f'This is a {token}')
    print(pieces)
    # The idiom must come back as ONE final piece. The comparison is
    # case-insensitive because the recorded output of this cell shows the
    # pieces lower-cased, so an exact match against the mixed-case input
    # would fail even when the token is kept whole.
    assert pieces and pieces[-1].lower() == token.lower(), \
        f'{token} was not kept as a single token: {pieces}'

# Only reached when every test token passed the check above
print('\n')
print(f'SUCCESS!! The {updated_model_name} model has been updated with new tokens!!')

['this', 'is', 'a', 'idoffthebeatentrackid']
['this', 'is', 'a', 'idchapterandverseid']


SUCCESS!! The bert-base-uncased_MaskedLM_STR_option1 model has been updated with new tokens!!


### Testing the tokenizer of an arbitrary pre-trained model

In [11]:
# Deliberate stop: this assertion always fails, so a "Run All" halts here and
# the optional cells below are not executed.
assert False # comment this line out if you want to execute the cells below

from transformers import AutoModelForMaskedLM, AutoModelForSequenceClassification
from transformers import AutoTokenizer

import re
import os
import sys
import shutil

AssertionError: 

In [None]:
# Point at a different locally stored checkpoint
model_checkpoint_dir = '../local_models/bert-base-uncased_option1_with_bertram'

# Load the tokenizer with the vocab file stored in that checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint_dir,
    use_fast=False,
    max_length=510,
    force_download=True,
)

In [None]:
# Test for MWE single-tokens: one BERTRAM token in lower and original case,
# a plain idiom token, and a second BERTRAM token
bertram_token = '<BERTRAM:IDaheadofthecurveID>'
test_tokens = [
    bertram_token.lower(),
    bertram_token,
    'IDchapterandverseID',
    '<BERTRAM:IDcallaspadeaspadeID>',
]

# Print the pieces produced for each probe sentence
for token in test_tokens:
    sentence = f'This is a {token}'
    print(tokenizer.tokenize(sentence))

## References
Reference implementations
1. [Adding new Tokens](https://huggingface.co/transformers/v2.11.0/main_classes/tokenizer.html#transformers.PreTrainedTokenizer.add_tokens)
2. [Manual method of adding tokens](https://github.com/H-TayyarMadabushi/AStitchInLanguageModels/blob/main/Dataset/Task2/README.md#adding-idiom-tokens-to--transformers-models)