# Large Language Models are Fragment-Based Drug Designers
## Author: Manas Mahale <<manas.mahale@bcp.edu.in>>

In [1]:
import os

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast, LineByLineTextDataset, DataCollatorForLanguageModeling

In [2]:
from transformers import BertConfig , BertForMaskedLM, TrainingArguments, Trainer, pipeline

In [3]:
# Corpus file(s) handed to `tokenizer.train` to build the vocabulary.
files = [
    "smiles/canonical_train_scaffold.txt",
]

In [4]:
# Word-level tokenizer: input is split on whitespace only, and tokens missing
# from the learned vocabulary fall back to "[UNK]".
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = WhitespaceSplit()

# Trainer that builds the vocabulary and registers the special tokens.
trainer = WordLevelTrainer(special_tokens=special_tokens)

In [5]:
# Learn the vocabulary from the corpus listed in `files`, using the
# WordLevelTrainer configured in the previous cell.
tokenizer.train(files, trainer)

In [6]:
# Wrap every encoded sequence as "[CLS] ... [SEP]". For a pair, the second
# segment and its closing "[SEP]" are assigned type id 1.
cls_id = tokenizer.token_to_id("[CLS]")
sep_id = tokenizer.token_to_id("[SEP]")

tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
)

In [7]:
# Directory that receives the tokenizer file and the model checkpoints.
model_path = 'smiles-bert/'

# Maximum sequence length: used for tokenizer truncation and for the model's
# position embeddings further down.
max_length = 128

# Vocabulary size is read from the trained tokenizer (special tokens included).
vocab_size = tokenizer.get_vocab_size()

In [8]:
# Cap every encoding produced by this tokenizer at `max_length` tokens.
tokenizer.enable_truncation(max_length=max_length)

In [9]:
# Persist the trained tokenizer next to the model checkpoints.
# The target directory is created first: on a fresh checkout `smiles-bert/`
# does not exist yet and saving into a missing directory fails.
os.makedirs(model_path, exist_ok=True)
tokenizer.save(os.path.join(model_path, "tokenizer.json"))

In [10]:
# Reload the saved tokenizer file through the transformers fast-tokenizer wrapper
# so it can be passed to the dataset, collator and pipeline below.
# NOTE(review): this rebinds `tokenizer` from the raw `Tokenizer` to a
# `PreTrainedTokenizerFast`; the earlier cells presumably cannot be re-run
# against the new object without a kernel restart - confirm.
tokenizer = PreTrainedTokenizerFast(tokenizer_file="./smiles-bert/tokenizer.json")

In [11]:
# Tell the transformers wrapper which vocabulary entries play the special-token
# roles; the tokenizer file alone does not set these attributes on the wrapper.
for attr_name, token_str in (
    ("mask_token", "[MASK]"),
    ("unk_token", "[UNK]"),
    ("pad_token", "[PAD]"),
    ("sep_token", "[SEP]"),
    ("cls_token", "[CLS]"),
):
    setattr(tokenizer, attr_name, token_str)

In [12]:
%%time
from transformers import LineByLineTextDataset

train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./smiles/canonical_train_scaffold.txt",
    block_size=128,
)

test_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./smiles/canonical_test.txt",
    block_size=128,
)

CPU times: user 5.13 ms, sys: 2.54 ms, total: 7.66 ms
Wall time: 3.31 ms




In [13]:
# Collator for the masked-language-modelling objective, with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [14]:
# Build a BERT masked-LM from a fresh config (no pretrained weights are loaded).
# `pad_token_id` is passed explicitly because BertConfig defaults it to 0, and
# in this vocabulary id 0 should be the first special token given to the
# trainer ("[UNK]"), not "[PAD]". Without this the embedding layer would treat
# the wrong token as padding.
model_config = BertConfig(
    vocab_size=vocab_size,
    max_position_embeddings=max_length,
    pad_token_id=tokenizer.pad_token_id,
)
model = BertForMaskedLM(config=model_config)

In [15]:
training_args = TrainingArguments(
    # --- output / checkpoints ---
    output_dir=model_path,           # directory where checkpoints are written
    overwrite_output_dir=True,
    save_total_limit=1,              # limit on how many checkpoints are retained on disk
    # --- optimisation schedule ---
    num_train_epochs=50,             # number of training epochs, feel free to tweak
    per_device_train_batch_size=10,  # raise as far as GPU memory allows
    gradient_accumulation_steps=8,   # batches accumulated before each weight update
    per_device_eval_batch_size=64,   # evaluation batch size
    # --- logging / evaluation / saving cadence ---
    evaluation_strategy="steps",     # evaluate every `logging_steps` steps
    logging_steps=1,                 # log and evaluate after every optimisation step
    save_steps=1,                    # write a checkpoint after every optimisation step
    load_best_model_at_end=True,     # reload the best checkpoint (in terms of loss) when training ends
)

In [16]:
# Wire the model, training arguments, both datasets and the MLM collator into
# a transformers Trainer.
# NOTE(review): this rebinds `trainer`, which earlier held the WordLevelTrainer
# used by `tokenizer.train`; re-running that earlier cell after this one would
# pick up the wrong object.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

In [17]:
# Train the model. With the arguments above, evaluation and checkpointing run
# after every optimisation step, so the log output below is long.
trainer.train()

***** Running training *****
  Num examples = 100
  Num Epochs = 50
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 80
  Gradient Accumulation steps = 8
  Total optimization steps = 50


Step,Training Loss,Validation Loss
1,4.9044,4.41652
2,5.2762,4.01903
3,5.0245,3.47709
4,5.0466,3.460977
5,4.8478,3.443731
6,4.9653,3.445435
7,4.5971,3.447366
8,5.1386,3.422358
9,4.4136,3.114475
10,4.6833,3.525752


***** Running Evaluation *****
  Num examples = 20
  Batch size = 64
Saving model checkpoint to smiles-bert/checkpoint-1
Configuration saved in smiles-bert/checkpoint-1/config.json
Model weights saved in smiles-bert/checkpoint-1/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 20
  Batch size = 64
Saving model checkpoint to smiles-bert/checkpoint-2
Configuration saved in smiles-bert/checkpoint-2/config.json
Model weights saved in smiles-bert/checkpoint-2/pytorch_model.bin
Deleting older checkpoint [smiles-bert/checkpoint-1] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 20
  Batch size = 64
Saving model checkpoint to smiles-bert/checkpoint-3
Configuration saved in smiles-bert/checkpoint-3/config.json
Model weights saved in smiles-bert/checkpoint-3/pytorch_model.bin
Deleting older checkpoint [smiles-bert/checkpoint-2] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 20
  Batch size = 64
Saving model checkpoint to

Configuration saved in smiles-bert/checkpoint-26/config.json
Model weights saved in smiles-bert/checkpoint-26/pytorch_model.bin
Deleting older checkpoint [smiles-bert/checkpoint-25] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 20
  Batch size = 64
Saving model checkpoint to smiles-bert/checkpoint-27
Configuration saved in smiles-bert/checkpoint-27/config.json
Model weights saved in smiles-bert/checkpoint-27/pytorch_model.bin
Deleting older checkpoint [smiles-bert/checkpoint-26] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 20
  Batch size = 64
Saving model checkpoint to smiles-bert/checkpoint-28
Configuration saved in smiles-bert/checkpoint-28/config.json
Model weights saved in smiles-bert/checkpoint-28/pytorch_model.bin
Deleting older checkpoint [smiles-bert/checkpoint-27] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 20
  Batch size = 64
Saving model checkpoint to smiles-bert/checkpoint-29
C



Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from smiles-bert/checkpoint-33 (score: 1.544467806816101).


TrainOutput(global_step=50, training_loss=4.1330337762832645, metrics={'train_runtime': 718.4513, 'train_samples_per_second': 6.959, 'train_steps_per_second': 0.07, 'total_flos': 23304882566700.0, 'train_loss': 4.1330337762832645, 'epoch': 49.8})

In [18]:
# Build a fill-mask pipeline around the in-memory `model` and `tokenizer`.
# To predict from a saved checkpoint instead, load it first with
# `BertForMaskedLM.from_pretrained(<checkpoint directory under model_path>)`.
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
)

In [19]:
# Ask the model to fill in the masked fragment and print every candidate
# returned by the pipeline, one per line.
example = "Clc1ccccc1 [MASK] S c1nnco1 CCC N"
predictions = fill_mask(example)
for candidate in predictions:
    print(candidate)

{'score': 0.18074874579906464, 'token': 6, 'token_str': 'CC=O', 'sequence': 'Clc1ccccc1 CC=O S c1nnco1 CCC N'}
{'score': 0.13201986253261566, 'token': 11, 'token_str': 'CCC', 'sequence': 'Clc1ccccc1 CCC S c1nnco1 CCC N'}
{'score': 0.06030702218413353, 'token': 7, 'token_str': 'C=O', 'sequence': 'Clc1ccccc1 C=O S c1nnco1 CCC N'}
{'score': 0.05809904262423515, 'token': 15, 'token_str': 'CCC=O', 'sequence': 'Clc1ccccc1 CCC=O S c1nnco1 CCC N'}
{'score': 0.03378932178020477, 'token': 9, 'token_str': 'c1ccccc1', 'sequence': 'Clc1ccccc1 c1ccccc1 S c1nnco1 CCC N'}
