In [1]:
import json
import os
from itertools import chain

from datasets import load_dataset
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer, pipeline

In [2]:
# Text file(s) used both to build the dataset and to train the WordPiece tokenizer.
files = ["smiles/s_train.txt"]

In [3]:
# Load the file(s) listed in `files` as a single "train" split; the resulting
# dataset exposes one `text` column.
dataset = load_dataset(".", data_files=files, split="train")

Using custom data configuration .-9cc97fad1bfe3436
Reusing dataset text (/Users/manasmahale/.cache/huggingface/datasets/text/.-9cc97fad1bfe3436/0.0.0/d86c40dad297bdddf277b406c6a59f0250b5318c400bf23d420a31aff88c84c4)


In [4]:
# Hold out 10% of the rows for evaluation. The seed is fixed so that
# Restart & Run All always produces the same train/test partition.
d = dataset.train_test_split(test_size=0.1, seed=42)
d["train"], d["test"]

Loading cached split indices for dataset at /Users/manasmahale/.cache/huggingface/datasets/text/.-9cc97fad1bfe3436/0.0.0/d86c40dad297bdddf277b406c6a59f0250b5318c400bf23d420a31aff88c84c4/cache-19c9926bc3014c7b.arrow and /Users/manasmahale/.cache/huggingface/datasets/text/.-9cc97fad1bfe3436/0.0.0/d86c40dad297bdddf277b406c6a59f0250b5318c400bf23d420a31aff88c84c4/cache-9c6cac4364e941e0.arrow


(Dataset({
     features: ['text'],
     num_rows: 225
 }),
 Dataset({
     features: ['text'],
     num_rows: 25
 }))

In [5]:
# Tokens reserved in the vocabulary when the WordPiece tokenizer is trained.
special_tokens = [
    "[PAD]",   # padding token
    "[UNK]",   # unknown token
    "[CLS]",   # classification / sequence-start token
    "[SEP]",   # separator token
    "[MASK]",  # mask token used for masked language modeling
    "<S>",     # custom marker; its meaning is not shown in this notebook
    "<T>",     # custom marker; its meaning is not shown in this notebook
]

In [6]:
# Target WordPiece vocabulary size; also passed to BertConfig as the model's vocab size.
vocab_size = 1_000
# Tokenizer truncation/padding length and the model's max_position_embeddings.
max_length = 256
# True: truncate/pad every sample to max_length.
# False: tokenize without truncation and regroup the tokens into max_length chunks.
truncate_longer_samples = True

In [7]:
# Start from an untrained WordPiece tokenizer using the library's default settings.
# NOTE(review): the tokenizer config written later declares do_lower_case=True —
# confirm that lower-casing is acceptable for SMILES, where letter case may carry meaning.
tokenizer = BertWordPieceTokenizer()

In [8]:
# Learn the WordPiece vocabulary from the training file(s), targeting `vocab_size`
# entries with `special_tokens` reserved.
tokenizer.train(files=files, vocab_size=vocab_size, special_tokens=special_tokens)

In [9]:
# Make this tokenizer object truncate its encodings to at most `max_length` tokens.
tokenizer.enable_truncation(max_length=max_length)

In [10]:
# Directory that receives the tokenizer files and is later used as the Trainer's output_dir.
model_path = "smiles-bert"

# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race of the isdir() + mkdir() pair.
os.makedirs(model_path, exist_ok=True)

In [11]:
# Write the trained vocabulary into model_path; the call returns the list of files it wrote.
tokenizer.save_model(model_path)

['smiles-bert/vocab.txt']

In [12]:
# Tokenizer settings stored as JSON inside model_path.
# NOTE(review): these are tokenizer options but are written to "config.json";
# tokenizer settings are conventionally read from "tokenizer_config.json" —
# confirm that BertTokenizerFast.from_pretrained actually picks them up.
tokenizer_cfg = {
    "do_lower_case": True,
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
    "model_max_length": max_length,
    "max_len": max_length,
}
config_file = os.path.join(model_path, "config.json")
with open(config_file, "w") as f:
    json.dump(tokenizer_cfg, f)

In [13]:
# Once the tokenizer is trained and its files are saved, reload it from model_path as a
# BertTokenizerFast. This rebinds `tokenizer`, replacing the BertWordPieceTokenizer object.
tokenizer = BertTokenizerFast.from_pretrained(model_path)

In [14]:
def encode_with_truncation(examples):
    """Tokenize a batch, truncating and padding every sample to `max_length`.

    Args:
        examples: batch mapping that holds the raw strings under the "text" key.

    Returns:
        Whatever the module-level `tokenizer` returns for the batch; the
        special-tokens mask is requested as part of the output.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_special_tokens_mask=True,
    )

In [15]:
def encode_without_truncation(examples):
    """Tokenize a batch as-is, with no truncation or padding arguments.

    Args:
        examples: batch mapping that holds the raw strings under the "text" key.

    Returns:
        Whatever the module-level `tokenizer` returns for the batch; the
        special-tokens mask is requested as part of the output.
    """
    texts = examples["text"]
    return tokenizer(texts, return_special_tokens_mask=True)

In [16]:
# Select the tokenization function once, driven by truncate_longer_samples.
if truncate_longer_samples:
    encode = encode_with_truncation
else:
    encode = encode_without_truncation

In [17]:
# Read the raw train/test files into lists of whitespace-stripped lines.
# NOTE(review): s_train / s_test are not referenced by any later cell visible here.
with open('smiles/s_train.txt') as train_file:
    s_train = [line.strip() for line in train_file]

with open('smiles/s_test.txt') as test_file:
    s_test = [line.strip() for line in test_file]

In [18]:
# Tokenize the training split; batched=True hands `encode` a batch of examples per call.
train_dataset = d['train'].map(encode, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [19]:
# Tokenize the held-out split with the same `encode` function used for training.
test_dataset = d["test"].map(encode, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [20]:
# Restrict what each dataset returns when indexed:
#  - truncated samples: input_ids and attention_mask as PyTorch tensors;
#  - untruncated samples: also keep special_tokens_mask, in the default format.
for split in (train_dataset, test_dataset):
    if truncate_longer_samples:
        split.set_format(type="torch", columns=["input_ids", "attention_mask"])
    else:
        split.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])
train_dataset, test_dataset

(Dataset({
     features: ['attention_mask', 'input_ids', 'special_tokens_mask', 'text', 'token_type_ids'],
     num_rows: 225
 }),
 Dataset({
     features: ['attention_mask', 'input_ids', 'special_tokens_mask', 'text', 'token_type_ids'],
     num_rows: 25
 }))

In [21]:
# Main data processing function that concatenates all tokenized texts from a batch and splits them into
# chunks of max_length tokens.
def group_texts(examples, block_size=None):
    """Concatenate every column of a tokenized batch and re-cut it into fixed-size chunks.

    Args:
        examples: mapping from column name to a list of per-example lists
            (for instance ``input_ids`` and ``attention_mask``).
        block_size: length of each output chunk. When omitted, the
            notebook-level ``max_length`` is used.

    Returns:
        A dict with the same keys, where each value is a list of chunks of
        ``block_size`` items. If the flattened batch is at least one block long,
        the trailing remainder is dropped; a shorter batch is returned as a
        single short chunk. An empty mapping yields an empty dict.

    Raises:
        ValueError: if ``block_size`` is not a positive integer.
    """
    if block_size is None:
        block_size = max_length
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    # chain.from_iterable flattens in linear time, whereas sum(lists, []) copies
    # the growing accumulator on every addition (quadratic in the batch size).
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}

    # The first column serves as the length reference for all columns;
    # an empty batch simply has length 0 instead of raising IndexError.
    total_length = len(next(iter(concatenated_examples.values()), []))

    # Drop the small remainder so that every chunk is exactly block_size long.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Split each flattened column into consecutive block_size slices.
    return {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

In [22]:
# Note that with `batched=True` and `batch_size=2_000`, this map processes 2,000 texts together, so group_texts
# throws away a remainder for each of those groups of 2,000 texts. You can adjust that batch_size here, but a
# higher value might be slower to preprocess.

# To speed up this part, we use multiprocessing (`num_proc`). See the documentation of the map method for more
# information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

In [23]:
if not truncate_longer_samples:
    # Untruncated samples have variable length, so regroup their tokens into
    # max_length chunks. Both splits share the same settings (the train split
    # previously lacked num_proc and therefore ran single-process).
    grouping_kwargs = dict(
        batched=True,
        batch_size=2_000,
        num_proc=4,
        desc=f"Grouping texts in chunks of {max_length}",
    )
    train_dataset = train_dataset.map(group_texts, **grouping_kwargs)
    test_dataset = test_dataset.map(group_texts, **grouping_kwargs)

# Display the number of evaluation samples after the optional regrouping.
len(test_dataset)

25

In [24]:
# Initialize the model from a config rather than from pretrained weights:
# the vocabulary size and the number of position embeddings reuse the
# notebook-level vocab_size and max_length.
model_config = BertConfig(vocab_size=vocab_size, max_position_embeddings=max_length)
model = BertForMaskedLM(config=model_config)

In [25]:
# Initialize the data collator, which randomly masks 15% of the tokens
# (mlm_probability=0.15) for the Masked Language Modeling (MLM) task.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [26]:
# Training configuration. logging_dir is set explicitly so that the event files
# land in ./logs, the directory the `%tensorboard --logdir logs` cell reads from.
training_args = TrainingArguments(
    output_dir=model_path,          # directory where model checkpoints are saved
    logging_dir="logs",             # TensorBoard event files (matches `--logdir logs`)
    evaluation_strategy="steps",    # evaluate every `logging_steps` steps
    overwrite_output_dir=True,      # output_dir already exists: it holds the tokenizer files
    num_train_epochs=1,             # number of training epochs, feel free to tweak
    per_device_train_batch_size=10, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=8,  # accumulate gradients over 8 batches before updating the weights
    per_device_eval_batch_size=64,  # evaluation batch size
    logging_steps=500,              # evaluate and log every 500 steps
    save_steps=500,                 # save a model checkpoint every 500 steps
    # load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    # save_total_limit=3,           # keep only 3 checkpoints on disk if space is limited
)

In [27]:
# Initialize the trainer: it ties together the model, the training arguments,
# the MLM data collator (which applies masking per batch) and the tokenized
# train/evaluation datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Train the model.
# NOTE(review): the log of this run reports 2 total optimization steps, which is
# below logging_steps/save_steps (500), so no intermediate evaluation, logging or
# checkpoint is expected — confirm this is intended.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: text, special_tokens_mask.
***** Running training *****
  Num examples = 225
  Num Epochs = 1
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 80
  Gradient Accumulation steps = 8
  Total optimization steps = 2


Step,Training Loss,Validation Loss


In [None]:
# Build a fill-mask pipeline from the in-memory model and tokenizer.
# To use a saved checkpoint instead, uncomment and adapt the two lines below:
# model = BertForMaskedLM.from_pretrained(os.path.join(model_path, "checkpoint-10000"))
# tokenizer = BertTokenizerFast.from_pretrained(model_path)
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

In [None]:
# Ask the fill-mask pipeline for candidates for the [MASK] position and print each one.
example = "CCC [MASK] CC"
predictions = fill_mask(example)
for prediction in predictions:
    print(prediction)

In [None]:
# Load the TensorBoard notebook extension so the %tensorboard magic becomes available.
%load_ext tensorboard

In [None]:
# Launch TensorBoard inline, reading event files from ./logs.
# NOTE(review): curves only appear if the Trainer writes its logs to ./logs — confirm.
%tensorboard --logdir logs