In [4]:
# https://github.com/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch.ipynb
# https://github.com/huggingface/transformers/tree/master/notebooks
# https://huggingface.co/transformers/model_doc/xlnet.html#transformers.XLNetTokenizer 
# https://colab.research.google.com/github/gmihaila/ml_things/blob/master/notebooks/pytorch/pretrain_transformers_pytorch.ipynb#scrollTo=VE2MRZZhd5uM 

In [5]:
import os
os.environ['HF_HOME'] = os.path.join(os.getcwd(), 'hf_cache')

from transformers import XLNetConfig, XLNetModel, XLNetTokenizer, XLNetLMHeadModel 
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
from transformers.data.data_collator import DataCollatorForPermutationLanguageModeling
from tqdm import tqdm

In [6]:
tokenizer = XLNetTokenizer(vocab_file='models/smiles_hiv_sp.model',
                           do_lower_case=False,
                           keep_accents=True
                           )

In [10]:
train_raw = load_dataset('csv', data_files=['data/ogb_molhiv/train_hiv.csv'])
test_raw = load_dataset('csv', data_files=['data/ogb_molhiv/test_hiv.csv'])
valid_raw = load_dataset('csv', data_files=['data/ogb_molhiv/valid_hiv.csv'])

Using custom data configuration default-0250d14726bc71f8
Reusing dataset csv (e:\molnlp\mol-prop\hf_cache\datasets\csv\default-0250d14726bc71f8\0.0.0\bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)
100%|██████████| 1/1 [00:00<00:00, 76.95it/s]
Using custom data configuration default-d85d5e570e46467b
Reusing dataset csv (e:\molnlp\mol-prop\hf_cache\datasets\csv\default-d85d5e570e46467b\0.0.0\bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)
100%|██████████| 1/1 [00:00<00:00, 71.45it/s]
Using custom data configuration default-e9711672e6359f2b
Reusing dataset csv (e:\molnlp\mol-prop\hf_cache\datasets\csv\default-e9711672e6359f2b\0.0.0\bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)
100%|██████████| 1/1 [00:00<00:00, 62.49it/s]


In [27]:
tokenizer.special_tokens_map
#print(tokenizer('Clc1ccnc2c1ccc1c(Cl)ccnc12'))

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '<sep>',
 'pad_token': '<pad>',
 'cls_token': '<cls>',
 'mask_token': '<mask>',
 'additional_special_tokens': ['<eop>', '<eod>']}

In [32]:
tokenizer

<bound method XLNetTokenizer.get_special_tokens_mask of PreTrainedTokenizer(name_or_path='', vocab_size=1092, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='left', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '<sep>', 'pad_token': '<pad>', 'cls_token': '<cls>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True), 'additional_special_tokens': ['<eop>', '<eod>']})>

In [15]:
test_id = 52
print(train_raw['train'][test_id]['smiles'])
#input_ids =tokenizer(train_raw['train'][test_id]['smiles'], padding='max_length', max_length=100)['input_ids'] 
input_ids =tokenizer(train_raw['train'][test_id]['smiles'])['input_ids'] 
print(input_ids)
print(tokenizer.convert_ids_to_tokens(input_ids))

CCOC(=O)C(=O)C1CCCCC1=O
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 53, 10, 7, 4, 3, 10, 7, 4, 3, 8, 70, 35, 7, 0, 0]
['<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<u

In [19]:

print(tokenizer.pad_token_id)
print(tokenizer.unk_token_id)

0
0


In [11]:
def tokenize_function_hiv(examples):
    out_dict = tokenizer(examples["smiles"])
    return out_dict

train_ds = train_raw.map(tokenize_function_hiv, batched=True, remove_columns=["smiles","HIV_active", "mol_id"])['train']
test_ds = test_raw.map(tokenize_function_hiv, batched=True, remove_columns=["smiles","HIV_active", "mol_id"])['train']
valid_ds = valid_raw.map(tokenize_function_hiv, batched=True, remove_columns=["smiles","HIV_active", "mol_id"])['train']

100%|██████████| 33/33 [00:04<00:00,  6.94ba/s]
100%|██████████| 5/5 [00:00<00:00,  8.72ba/s]
100%|██████████| 5/5 [00:00<00:00,  7.65ba/s]


In [7]:
n_layer = 4
model_config = XLNetConfig(
    vocab_size=tokenizer.vocab_size,
    n_layer=4,
    bi_data=True
)
model = XLNetLMHeadModel(model_config)

data_collator = DataCollatorForPermutationLanguageModeling(tokenizer=tokenizer)

In [8]:
max_samples = 2000000
batch_size = 70 
max_steps = max_samples // batch_size
save_steps = max_steps // 5
training_args = TrainingArguments(
    f"models/xlnet-smiles-s{max_samples}-b{batch_size}-l{n_layer}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    num_train_epochs=1,
    max_steps = max_steps,
    save_steps=save_steps,
)

In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    data_collator=data_collator
)

max_steps is given, it will override any value given in num_train_epochs


In [10]:
trainer.train()

***** Running training *****
  Num examples = 6072715
  Num Epochs = 1
  Instantaneous batch size per device = 70
  Total train batch size (w. parallel, distributed & accumulation) = 70
  Gradient Accumulation steps = 1
  Total optimization steps = 28571
  2%|▏         | 500/28571 [08:45<8:10:39,  1.05s/it]

{'loss': 2.3886, 'learning_rate': 1.964999474992125e-05, 'epoch': 0.01}


  4%|▎         | 1000/28571 [17:30<8:02:43,  1.05s/it]

{'loss': 1.5815, 'learning_rate': 1.92999894998425e-05, 'epoch': 0.01}


  5%|▌         | 1500/28571 [25:48<7:23:28,  1.02it/s]

{'loss': 1.3943, 'learning_rate': 1.8949984249763747e-05, 'epoch': 0.02}


  7%|▋         | 2000/28571 [33:58<7:13:48,  1.02it/s]

{'loss': 1.2685, 'learning_rate': 1.8599978999685e-05, 'epoch': 0.02}


  9%|▉         | 2500/28571 [42:08<7:06:48,  1.02it/s]

{'loss': 1.1968, 'learning_rate': 1.8249973749606244e-05, 'epoch': 0.03}


 11%|█         | 3000/28571 [50:18<6:57:30,  1.02it/s]

{'loss': 1.1427, 'learning_rate': 1.7899968499527493e-05, 'epoch': 0.03}


 12%|█▏        | 3500/28571 [58:28<6:49:55,  1.02it/s]

{'loss': 1.0905, 'learning_rate': 1.754996324944874e-05, 'epoch': 0.04}


 14%|█▍        | 4000/28571 [1:06:39<6:41:49,  1.02it/s]

{'loss': 1.0507, 'learning_rate': 1.719995799936999e-05, 'epoch': 0.05}


 16%|█▌        | 4500/28571 [1:14:49<6:33:37,  1.02it/s]

{'loss': 1.021, 'learning_rate': 1.6849952749291242e-05, 'epoch': 0.05}


 18%|█▊        | 5000/28571 [1:22:59<6:25:00,  1.02it/s]

{'loss': 0.9964, 'learning_rate': 1.649994749921249e-05, 'epoch': 0.06}


 19%|█▉        | 5500/28571 [1:31:09<6:17:59,  1.02it/s]

{'loss': 0.9656, 'learning_rate': 1.614994224913374e-05, 'epoch': 0.06}


 20%|█▉        | 5714/28571 [1:34:39<6:13:41,  1.02it/s]Saving model checkpoint to models/xlnet-smiles-s2000000-b70-l4\checkpoint-5714
Configuration saved in models/xlnet-smiles-s2000000-b70-l4\checkpoint-5714\config.json
Model weights saved in models/xlnet-smiles-s2000000-b70-l4\checkpoint-5714\pytorch_model.bin
 21%|██        | 6000/28571 [1:39:40<6:28:01,  1.03s/it]

{'loss': 0.9298, 'learning_rate': 1.5799936999054988e-05, 'epoch': 0.07}


 23%|██▎       | 6500/28571 [1:48:15<6:16:32,  1.02s/it]

{'loss': 0.9051, 'learning_rate': 1.5449931748976236e-05, 'epoch': 0.07}


 25%|██▍       | 7000/28571 [1:56:44<6:04:18,  1.01s/it]

{'loss': 0.8928, 'learning_rate': 1.5099926498897483e-05, 'epoch': 0.08}


 26%|██▋       | 7500/28571 [2:05:07<5:56:33,  1.02s/it]

{'loss': 0.878, 'learning_rate': 1.4749921248818732e-05, 'epoch': 0.09}


 28%|██▊       | 8000/28571 [2:13:26<5:41:20,  1.00it/s]

{'loss': 0.8622, 'learning_rate': 1.4399915998739982e-05, 'epoch': 0.09}


 30%|██▉       | 8500/28571 [2:21:44<5:31:05,  1.01it/s]

{'loss': 0.8508, 'learning_rate': 1.404991074866123e-05, 'epoch': 0.1}


 32%|███▏      | 9000/28571 [2:30:01<5:25:35,  1.00it/s]

{'loss': 0.8251, 'learning_rate': 1.369990549858248e-05, 'epoch': 0.1}


 33%|███▎      | 9500/28571 [2:38:16<5:16:36,  1.00it/s]

{'loss': 0.8249, 'learning_rate': 1.334990024850373e-05, 'epoch': 0.11}


 35%|███▌      | 10000/28571 [2:46:33<5:05:15,  1.01it/s]

{'loss': 0.8078, 'learning_rate': 1.2999894998424978e-05, 'epoch': 0.12}


 37%|███▋      | 10500/28571 [2:54:48<5:00:11,  1.00it/s]

{'loss': 0.8002, 'learning_rate': 1.2649889748346227e-05, 'epoch': 0.12}


 39%|███▊      | 11000/28571 [3:03:06<4:51:47,  1.00it/s]

{'loss': 0.791, 'learning_rate': 1.2299884498267475e-05, 'epoch': 0.13}


 40%|███▉      | 11428/28571 [3:10:13<4:46:27,  1.00s/it]Saving model checkpoint to models/xlnet-smiles-s2000000-b70-l4\checkpoint-11428
Configuration saved in models/xlnet-smiles-s2000000-b70-l4\checkpoint-11428\config.json
Model weights saved in models/xlnet-smiles-s2000000-b70-l4\checkpoint-11428\pytorch_model.bin
 40%|████      | 11500/28571 [3:11:28<4:43:08,  1.00it/s]

{'loss': 0.7793, 'learning_rate': 1.1949879248188726e-05, 'epoch': 0.13}


 42%|████▏     | 12000/28571 [3:19:48<4:36:14,  1.00s/it]

{'loss': 0.7707, 'learning_rate': 1.1599873998109972e-05, 'epoch': 0.14}


 44%|████▍     | 12500/28571 [3:28:06<4:27:11,  1.00it/s]

{'loss': 0.7669, 'learning_rate': 1.1249868748031221e-05, 'epoch': 0.14}


 46%|████▌     | 13000/28571 [3:36:23<4:18:49,  1.00it/s]

{'loss': 0.7555, 'learning_rate': 1.089986349795247e-05, 'epoch': 0.15}


 47%|████▋     | 13500/28571 [3:44:41<4:09:30,  1.01it/s]

{'loss': 0.7583, 'learning_rate': 1.0549858247873718e-05, 'epoch': 0.16}


 49%|████▉     | 14000/28571 [3:52:58<4:04:53,  1.01s/it]

{'loss': 0.7494, 'learning_rate': 1.0199852997794968e-05, 'epoch': 0.16}


 51%|█████     | 14500/28571 [4:01:15<3:52:12,  1.01it/s]

{'loss': 0.7288, 'learning_rate': 9.849847747716217e-06, 'epoch': 0.17}


 53%|█████▎    | 15000/28571 [4:09:30<3:42:33,  1.02it/s]

{'loss': 0.7338, 'learning_rate': 9.499842497637466e-06, 'epoch': 0.17}


 54%|█████▍    | 15500/28571 [4:17:45<3:35:34,  1.01it/s]

{'loss': 0.7374, 'learning_rate': 9.149837247558714e-06, 'epoch': 0.18}


 56%|█████▌    | 16000/28571 [4:25:59<3:28:08,  1.01it/s]

{'loss': 0.7096, 'learning_rate': 8.799831997479963e-06, 'epoch': 0.18}


 58%|█████▊    | 16500/28571 [4:34:12<3:18:21,  1.01it/s]

{'loss': 0.7157, 'learning_rate': 8.449826747401211e-06, 'epoch': 0.19}


 60%|█████▉    | 17000/28571 [4:42:25<3:09:39,  1.02it/s]

{'loss': 0.7072, 'learning_rate': 8.099821497322462e-06, 'epoch': 0.2}


 60%|█████▉    | 17142/28571 [4:44:45<3:06:29,  1.02it/s]Saving model checkpoint to models/xlnet-smiles-s2000000-b70-l4\checkpoint-17142
Configuration saved in models/xlnet-smiles-s2000000-b70-l4\checkpoint-17142\config.json
Model weights saved in models/xlnet-smiles-s2000000-b70-l4\checkpoint-17142\pytorch_model.bin
 61%|██████▏   | 17500/28571 [4:50:41<3:02:02,  1.01it/s]

{'loss': 0.7022, 'learning_rate': 7.749816247243708e-06, 'epoch': 0.2}


 63%|██████▎   | 18000/28571 [4:58:54<2:54:46,  1.01it/s]

{'loss': 0.6951, 'learning_rate': 7.399810997164958e-06, 'epoch': 0.21}


 65%|██████▍   | 18500/28571 [5:07:07<2:45:57,  1.01it/s]

{'loss': 0.6934, 'learning_rate': 7.049805747086207e-06, 'epoch': 0.21}


 67%|██████▋   | 19000/28571 [5:15:20<2:37:36,  1.01it/s]

{'loss': 0.6933, 'learning_rate': 6.699800497007456e-06, 'epoch': 0.22}


 68%|██████▊   | 19500/28571 [5:23:33<2:28:36,  1.02it/s]

{'loss': 0.6825, 'learning_rate': 6.349795246928705e-06, 'epoch': 0.22}


 70%|███████   | 20000/28571 [5:31:45<2:20:41,  1.02it/s]

{'loss': 0.6789, 'learning_rate': 5.999789996849953e-06, 'epoch': 0.23}


 72%|███████▏  | 20500/28571 [5:39:58<2:12:36,  1.01it/s]

{'loss': 0.6728, 'learning_rate': 5.6497847467712015e-06, 'epoch': 0.24}


 74%|███████▎  | 21000/28571 [5:48:09<2:04:01,  1.02it/s]

{'loss': 0.6713, 'learning_rate': 5.299779496692451e-06, 'epoch': 0.24}


 75%|███████▌  | 21500/28571 [5:56:22<1:55:48,  1.02it/s]

{'loss': 0.6658, 'learning_rate': 4.9497742466136995e-06, 'epoch': 0.25}


 77%|███████▋  | 22000/28571 [6:04:33<1:47:46,  1.02it/s]

{'loss': 0.6677, 'learning_rate': 4.599768996534948e-06, 'epoch': 0.25}


 79%|███████▉  | 22500/28571 [6:12:44<1:39:14,  1.02it/s]

{'loss': 0.6641, 'learning_rate': 4.249763746456197e-06, 'epoch': 0.26}


 80%|███████▉  | 22856/28571 [6:18:34<1:33:20,  1.02it/s]Saving model checkpoint to models/xlnet-smiles-s2000000-b70-l4\checkpoint-22856
Configuration saved in models/xlnet-smiles-s2000000-b70-l4\checkpoint-22856\config.json
Model weights saved in models/xlnet-smiles-s2000000-b70-l4\checkpoint-22856\pytorch_model.bin
 81%|████████  | 23000/28571 [6:20:57<1:31:02,  1.02it/s]

{'loss': 0.661, 'learning_rate': 3.899758496377446e-06, 'epoch': 0.27}


 82%|████████▏ | 23500/28571 [6:29:08<1:23:19,  1.01it/s]

{'loss': 0.6632, 'learning_rate': 3.549753246298695e-06, 'epoch': 0.27}


 84%|████████▍ | 24000/28571 [6:37:21<1:14:40,  1.02it/s]

{'loss': 0.6549, 'learning_rate': 3.1997479962199436e-06, 'epoch': 0.28}


 86%|████████▌ | 24500/28571 [6:45:32<1:06:34,  1.02it/s]

{'loss': 0.6574, 'learning_rate': 2.8497427461411926e-06, 'epoch': 0.28}


 88%|████████▊ | 25000/28571 [6:53:43<58:23,  1.02it/s]

{'loss': 0.6446, 'learning_rate': 2.499737496062441e-06, 'epoch': 0.29}


 89%|████████▉ | 25500/28571 [7:01:54<50:14,  1.02it/s]

{'loss': 0.6556, 'learning_rate': 2.14973224598369e-06, 'epoch': 0.29}


 91%|█████████ | 26000/28571 [7:10:05<42:05,  1.02it/s]

{'loss': 0.6485, 'learning_rate': 1.7997269959049388e-06, 'epoch': 0.3}


 93%|█████████▎| 26500/28571 [7:18:16<33:55,  1.02it/s]

{'loss': 0.6531, 'learning_rate': 1.4497217458261875e-06, 'epoch': 0.31}


 95%|█████████▍| 27000/28571 [7:26:27<25:37,  1.02it/s]

{'loss': 0.6426, 'learning_rate': 1.0997164957474363e-06, 'epoch': 0.31}


 96%|█████████▋| 27500/28571 [7:34:37<17:27,  1.02it/s]

{'loss': 0.6497, 'learning_rate': 7.49711245668685e-07, 'epoch': 0.32}


 98%|█████████▊| 28000/28571 [7:42:48<09:18,  1.02it/s]

{'loss': 0.641, 'learning_rate': 3.9970599558993383e-07, 'epoch': 0.32}


100%|█████████▉| 28500/28571 [7:50:58<01:09,  1.02it/s]

{'loss': 0.6445, 'learning_rate': 4.9700745511182674e-08, 'epoch': 0.33}


100%|█████████▉| 28570/28571 [7:52:07<00:00,  1.02it/s]Saving model checkpoint to models/xlnet-smiles-s2000000-b70-l4\checkpoint-28570
Configuration saved in models/xlnet-smiles-s2000000-b70-l4\checkpoint-28570\config.json
Model weights saved in models/xlnet-smiles-s2000000-b70-l4\checkpoint-28570\pytorch_model.bin
100%|██████████| 28571/28571 [7:52:09<00:00,  1.29s/it]

ValueError: Trainer: evaluation requires an eval_dataset.