In [1]:
import os

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast, LineByLineTextDataset, DataCollatorForLanguageModeling

In [2]:
from transformers import BertConfig , BertForMaskedLM, TrainingArguments, Trainer, pipeline

In [3]:
# Text files the tokenizer vocabulary is trained on (relative paths).
files = [
    "smiles/canonical_train_scaffold.txt",
]

In [4]:
# Special tokens handed to the vocabulary trainer, in this order.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

# Word-level tokenizer: input is split on whitespace only, and anything
# outside the learned vocabulary is mapped to "[UNK]".
tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = WhitespaceSplit()

# Trainer object consumed by `tokenizer.train(...)`.
trainer = WordLevelTrainer(special_tokens=special_tokens)

In [5]:
# Fit the word-level vocabulary on the corpus listed in `files`, using the
# WordLevelTrainer configured above.
tokenizer.train(files, trainer)

In [6]:
# Look up the ids the trained vocabulary assigned to the two framing tokens.
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")

# Frame every encoding BERT-style: [CLS] A [SEP] for a single sequence, and
# [CLS] A [SEP] B [SEP] for a pair (the ":1" suffix puts B and its closing
# [SEP] in segment 1).
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", cls_token_id),
        ("[SEP]", sep_token_id),
    ],
)

In [7]:
# Settings shared by the tokenizer, the datasets, the model and the Trainer.
model_path = 'smiles-bert/'              # output directory for checkpoints
max_length = 128                         # truncation length / position-embedding count
vocab_size = tokenizer.get_vocab_size()  # size of the vocabulary learned above

In [8]:
# Cap encodings produced by this tokenizer at `max_length`.
tokenizer.enable_truncation(max_length=max_length)

In [9]:
# Persist the trained tokenizer as <model_path>/tokenizer.json.
# The output directory is created first: on a fresh checkout it does not
# exist yet (the Trainer only creates it later), and saving into a missing
# directory fails.
os.makedirs(model_path, exist_ok=True)
tokenizer.save(os.path.join(model_path, "tokenizer.json"))

In [10]:
# Reload the saved tokenizer through the `transformers` wrapper class.
# NOTE: this rebinds `tokenizer` from the raw `tokenizers.Tokenizer` to a
# PreTrainedTokenizerFast, so the earlier tokenizer cells should not be re-run
# after this one without restarting from the top.
tokenizer_json = "./smiles-bert/tokenizer.json"
tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_json)

In [11]:
# Register each special-token role on the wrapper, since loading from
# tokenizer.json alone does not set these attributes here.
# Assignments happen in the listed order.
special_token_roles = (
    ("mask_token", "[MASK]"),
    ("unk_token", "[UNK]"),
    ("pad_token", "[PAD]"),
    ("sep_token", "[SEP]"),
    ("cls_token", "[CLS]"),
)
for role, token in special_token_roles:
    setattr(tokenizer, role, token)

In [12]:
%%time
from transformers import LineByLineTextDataset

train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./smiles/canonical_train_scaffold.txt",
    block_size=128,
)

test_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./smiles/canonical_test.txt",
    block_size=128,
)

CPU times: user 5.52 ms, sys: 2.73 ms, total: 8.26 ms
Wall time: 3.26 ms




In [13]:
# Batch collator for the masked-language-modelling objective; the masking
# probability passed to the collator is 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [14]:
# Build an untrained BERT masked-LM sized to the tokenizer trained above.
# pad_token_id is passed explicitly because BertConfig defaults it to 0,
# whereas this vocabulary lists "[UNK]" first and "[PAD]" fourth among its
# special tokens. Leaving the default would make the model treat the wrong
# embedding row as the padding row.
model_config = BertConfig(
    vocab_size=vocab_size,
    max_position_embeddings=max_length,
    pad_token_id=tokenizer.pad_token_id,
)
model = BertForMaskedLM(config=model_config)

In [15]:
# One cadence drives logging, evaluation and checkpointing. They are kept on
# the same variable because `load_best_model_at_end=True` needs checkpoints to
# line up with evaluations. A value of 1 evaluates and saves after every
# optimizer step, which is slow; raise it for real runs.
steps_per_eval = 1

training_args = TrainingArguments(
    output_dir=model_path,            # where checkpoints are written
    evaluation_strategy="steps",      # evaluate on a step cadence (see eval_steps)
    overwrite_output_dir=True,
    num_train_epochs=50,              # number of passes over the training set
    per_device_train_batch_size=10,   # per-device batch size for training
    gradient_accumulation_steps=8,    # batches accumulated per optimizer step
    per_device_eval_batch_size=64,    # per-device batch size for evaluation
    logging_steps=steps_per_eval,     # log every `steps_per_eval` steps
    eval_steps=steps_per_eval,        # explicit; previously implied via logging_steps
    save_steps=steps_per_eval,        # checkpoint on the same cadence
    load_best_model_at_end=True,      # reload the best checkpoint when training ends
    save_total_limit=1,               # checkpoint retention limit passed to the Trainer
)

In [16]:
# Assemble the Hugging Face Trainer from the pieces built above.
# NOTE: this rebinds `trainer`, which earlier in the notebook held the
# WordLevelTrainer used to fit the tokenizer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

In [17]:
# Run the training loop configured by `training_args`; evaluation uses
# `test_dataset` and checkpoints are written under `model_path`.
trainer.train()

***** Running training *****
  Num examples = 100
  Num Epochs = 50
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 80
  Gradient Accumulation steps = 8
  Total optimization steps = 50


Step,Training Loss,Validation Loss
1,7.0564,6.410741


***** Running Evaluation *****
  Num examples = 20
  Batch size = 64
Saving model checkpoint to smiles-bert/checkpoint-1
Configuration saved in smiles-bert/checkpoint-1/config.json
Model weights saved in smiles-bert/checkpoint-1/pytorch_model.bin


KeyboardInterrupt: 

In [None]:
# Build a fill-mask pipeline from the in-memory `model` and `tokenizer`.
# To predict from a saved checkpoint instead, load it first, for example:
#   model = BertForMaskedLM.from_pretrained(os.path.join(model_path, "checkpoint-10000"))
#   tokenizer = BertTokenizerFast.from_pretrained(model_path)
# (BertTokenizerFast is not imported in the cells above.)
fill_mask = pipeline(task="fill-mask", model=model, tokenizer=tokenizer)

In [None]:
# Ask the pipeline for candidate fills of the [MASK] slot in a
# whitespace-separated SMILES fragment string, and print each candidate.
example = "Clc1ccccc1 [MASK] S c1nnco1 CCC N"
predictions = fill_mask(example)
for prediction in predictions:
    print(prediction)