In [1]:
# https://github.com/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch.ipynb
# https://github.com/huggingface/transformers/tree/master/notebooks
# https://huggingface.co/transformers/model_doc/xlnet.html#transformers.XLNetTokenizer 

In [2]:
import os
os.environ['HF_HOME'] = os.path.join(os.getcwd(), 'hf_cache')

from transformers import XLNetConfig, XLNetModel, XLNetTokenizer, XLNetLMHeadModel 
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
from tqdm import tqdm

In [3]:
tokenizer = XLNetTokenizer(vocab_file='models/smiles_sp.model',
                           do_lower_case=False,
                           keep_accents=True
                           )

In [4]:
dataset = load_dataset('text', data_files=['data/proc_zinc/all.txt'])

Using custom data configuration default-ce80c9ae12ee94b9
Reusing dataset text (e:\molnlp\mol-prop\hf_cache\datasets\text\default-ce80c9ae12ee94b9\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)
100%|██████████| 1/1 [00:00<00:00,  4.59it/s]


In [5]:
print(dataset['train'][1]['text'])
print(tokenizer(dataset['train'][1]['text'], padding='max_length', max_length=100))

CC(C)(C)OC(=O)N[C@H]1CONC1=O
{'input_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 3, 11, 3, 4, 11, 3, 4, 5, 3, 10, 5, 4, 9, 7, 3, 12, 6, 15, 3, 5, 9, 3, 39, 5, 0, 0], 'token_type_ids': [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [6]:
max_len_path = 'data/proc_zinc/max_len.txt'
if os.path.exists(max_len_path):
    max_len = int(open(max_len_path).read())
else:
    max_len = 0
    with tqdm(total=len(dataset['train'])) as pbar:
        for data in dataset['train']:
            max_len = max(len(data['text']), max_len)
            pbar.update(1)
    with open(max_len_path, 'w') as f:
        f.write(str(max_len))

In [7]:
def tokenize_function(examples):
    out_dict = tokenizer(examples["text"], padding='max_length', max_length=max_len)
    out_dict['labels'] = out_dict['input_ids'].copy()
    return out_dict


tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Loading cached processed dataset at e:\molnlp\mol-prop\hf_cache\datasets\text\default-ce80c9ae12ee94b9\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5\cache-151da96b9fb5f3c8.arrow


In [8]:
model_config = XLNetConfig(
    vocab_size=tokenizer.vocab_size,
    n_layer=12
)
model = XLNetLMHeadModel(model_config)

In [9]:
max_samples = 200000
batch_size = 10
max_steps = max_samples // batch_size
save_steps = max_steps // 5
training_args = TrainingArguments(
    f"models/xlnet-smiles-bs-{batch_size}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    num_train_epochs=1,
    max_steps = max_steps,
    #fp16=True,
    save_steps=save_steps,
)

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
)

max_steps is given, it will override any value given in num_train_epochs


In [11]:
trainer.train()

***** Running training *****
  Num examples = 6072715
  Num Epochs = 1
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 20000
  2%|▎         | 500/20000 [04:06<2:27:56,  2.20it/s]

{'loss': 0.0166, 'learning_rate': 1.95e-05, 'epoch': 0.0}


  5%|▌         | 1000/20000 [08:07<2:55:36,  1.80it/s]

{'loss': 0.0005, 'learning_rate': 1.9e-05, 'epoch': 0.0}


  8%|▊         | 1500/20000 [12:35<2:51:46,  1.79it/s]

{'loss': 0.0003, 'learning_rate': 1.8500000000000002e-05, 'epoch': 0.0}


 10%|█         | 2000/20000 [17:14<2:49:06,  1.77it/s]

{'loss': 0.0002, 'learning_rate': 1.8e-05, 'epoch': 0.0}


 12%|█▎        | 2500/20000 [21:43<2:30:25,  1.94it/s]

{'loss': 0.0001, 'learning_rate': 1.7500000000000002e-05, 'epoch': 0.0}


 15%|█▌        | 3000/20000 [26:04<2:30:32,  1.88it/s]

{'loss': 0.0001, 'learning_rate': 1.7e-05, 'epoch': 0.0}


 18%|█▊        | 3500/20000 [30:28<2:27:49,  1.86it/s]

{'loss': 0.0001, 'learning_rate': 1.65e-05, 'epoch': 0.01}


 20%|██        | 4000/20000 [35:03<2:18:30,  1.93it/s]Saving model checkpoint to models/xlnet-smiles\checkpoint-4000
Configuration saved in models/xlnet-smiles\checkpoint-4000\config.json


{'loss': 0.0001, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.01}


Model weights saved in models/xlnet-smiles\checkpoint-4000\pytorch_model.bin
 22%|██▎       | 4500/20000 [39:59<2:29:13,  1.73it/s]

{'loss': 0.0, 'learning_rate': 1.55e-05, 'epoch': 0.01}


 25%|██▌       | 5000/20000 [44:28<2:06:05,  1.98it/s]

{'loss': 0.0, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.01}


 28%|██▊       | 5500/20000 [48:50<2:06:20,  1.91it/s]

{'loss': 0.0, 'learning_rate': 1.45e-05, 'epoch': 0.01}


 30%|███       | 6000/20000 [53:08<2:01:11,  1.93it/s]

{'loss': 0.0, 'learning_rate': 1.4e-05, 'epoch': 0.01}


 32%|███▎      | 6500/20000 [57:25<2:03:12,  1.83it/s]

{'loss': 0.0, 'learning_rate': 1.3500000000000001e-05, 'epoch': 0.01}


 35%|███▌      | 7000/20000 [1:01:44<1:50:55,  1.95it/s]

{'loss': 0.0, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.01}


 38%|███▊      | 7500/20000 [1:06:00<1:45:13,  1.98it/s]

{'loss': 0.0, 'learning_rate': 1.25e-05, 'epoch': 0.01}


 40%|████      | 8000/20000 [1:10:15<1:40:13,  2.00it/s]Saving model checkpoint to models/xlnet-smiles\checkpoint-8000
Configuration saved in models/xlnet-smiles\checkpoint-8000\config.json


{'loss': 0.0, 'learning_rate': 1.2e-05, 'epoch': 0.01}


Model weights saved in models/xlnet-smiles\checkpoint-8000\pytorch_model.bin
 42%|████▎     | 8500/20000 [1:14:46<1:33:05,  2.06it/s]

{'loss': 0.0, 'learning_rate': 1.15e-05, 'epoch': 0.01}


 45%|████▌     | 9000/20000 [1:18:58<1:31:59,  1.99it/s]

{'loss': 0.0, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.01}


 48%|████▊     | 9500/20000 [1:23:06<1:29:27,  1.96it/s]

{'loss': 0.0, 'learning_rate': 1.0500000000000001e-05, 'epoch': 0.02}


 50%|█████     | 10000/20000 [1:27:13<1:24:28,  1.97it/s]

{'loss': 0.0, 'learning_rate': 1e-05, 'epoch': 0.02}


 52%|█████▎    | 10500/20000 [1:31:22<1:18:51,  2.01it/s]

{'loss': 0.0, 'learning_rate': 9.5e-06, 'epoch': 0.02}


 55%|█████▌    | 11000/20000 [1:35:28<1:13:26,  2.04it/s]

{'loss': 0.0, 'learning_rate': 9e-06, 'epoch': 0.02}


 57%|█████▊    | 11500/20000 [1:39:37<1:10:22,  2.01it/s]

{'loss': 0.0, 'learning_rate': 8.5e-06, 'epoch': 0.02}


 60%|██████    | 12000/20000 [1:43:44<1:06:32,  2.00it/s]Saving model checkpoint to models/xlnet-smiles\checkpoint-12000
Configuration saved in models/xlnet-smiles\checkpoint-12000\config.json


{'loss': 0.0, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.02}


Model weights saved in models/xlnet-smiles\checkpoint-12000\pytorch_model.bin
 62%|██████▎   | 12500/20000 [1:48:08<1:00:13,  2.08it/s]

{'loss': 0.0, 'learning_rate': 7.500000000000001e-06, 'epoch': 0.02}


 65%|██████▌   | 13000/20000 [1:52:15<57:49,  2.02it/s]

{'loss': 0.0, 'learning_rate': 7e-06, 'epoch': 0.02}


 68%|██████▊   | 13500/20000 [1:56:21<51:36,  2.10it/s]

{'loss': 0.0, 'learning_rate': 6.5000000000000004e-06, 'epoch': 0.02}


 70%|███████   | 14000/20000 [2:00:23<48:26,  2.06it/s]

{'loss': 0.0, 'learning_rate': 6e-06, 'epoch': 0.02}


 72%|███████▎  | 14500/20000 [2:04:21<40:34,  2.26it/s]

{'loss': 0.0, 'learning_rate': 5.500000000000001e-06, 'epoch': 0.02}


 75%|███████▌  | 15000/20000 [2:08:18<39:16,  2.12it/s]

{'loss': 0.0, 'learning_rate': 5e-06, 'epoch': 0.02}


 78%|███████▊  | 15500/20000 [2:12:14<34:19,  2.18it/s]

{'loss': 0.0, 'learning_rate': 4.5e-06, 'epoch': 0.03}


 80%|████████  | 16000/20000 [2:16:12<30:55,  2.16it/s]Saving model checkpoint to models/xlnet-smiles\checkpoint-16000
Configuration saved in models/xlnet-smiles\checkpoint-16000\config.json


{'loss': 0.0, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.03}


Model weights saved in models/xlnet-smiles\checkpoint-16000\pytorch_model.bin
 82%|████████▎ | 16500/20000 [2:20:28<26:52,  2.17it/s]

{'loss': 0.0, 'learning_rate': 3.5e-06, 'epoch': 0.03}


 85%|████████▌ | 17000/20000 [2:24:23<22:20,  2.24it/s]

{'loss': 0.0, 'learning_rate': 3e-06, 'epoch': 0.03}


 88%|████████▊ | 17500/20000 [2:28:18<20:28,  2.04it/s]

{'loss': 0.0, 'learning_rate': 2.5e-06, 'epoch': 0.03}


 90%|█████████ | 18000/20000 [2:32:13<16:00,  2.08it/s]

{'loss': 0.0, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.03}


 92%|█████████▎| 18500/20000 [2:36:05<12:02,  2.07it/s]

{'loss': 0.0, 'learning_rate': 1.5e-06, 'epoch': 0.03}


 95%|█████████▌| 19000/20000 [2:39:59<07:54,  2.11it/s]

{'loss': 0.0, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.03}


 98%|█████████▊| 19500/20000 [2:43:52<03:41,  2.25it/s]

{'loss': 0.0, 'learning_rate': 5.000000000000001e-07, 'epoch': 0.03}


100%|██████████| 20000/20000 [2:47:43<00:00,  2.19it/s]Saving model checkpoint to models/xlnet-smiles\checkpoint-20000
Configuration saved in models/xlnet-smiles\checkpoint-20000\config.json


{'loss': 0.0, 'learning_rate': 0.0, 'epoch': 0.03}


Model weights saved in models/xlnet-smiles\checkpoint-20000\pytorch_model.bin


ValueError: Trainer: evaluation requires an eval_dataset.