In [1]:
import os
os.environ['HF_HOME'] = os.path.join(os.getcwd(), 'hf_cache')
from transformers import XLNetConfig, XLNetModel, XLNetTokenizer, XLNetLMHeadModel, XLNetForSequenceClassification
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
import utils

In [2]:
tokenizer = XLNetTokenizer(vocab_file='models/smiles_sp.model',
                           do_lower_case=False,
                           keep_accents=True
                           )

In [3]:
dataset = load_dataset('csv', data_files=['data/hiv/HIV.csv'])

Using custom data configuration default-26cac1113502f8a4
Reusing dataset csv (e:\molnlp\mol-prop\hf_cache\datasets\csv\default-26cac1113502f8a4\0.0.0\bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)
100%|██████████| 1/1 [00:00<00:00, 500.63it/s]


In [4]:
max_len = utils.compute_max_length(dataset, 'smiles', tokenizer, 'data/hiv/hiv_len.txt')

def tokenize_function_hiv(examples):
    out_dict = tokenizer(examples["smiles"], padding='max_length', max_length=max_len)
    out_dict['labels'] = [int(x) for x in examples['HIV_active']]
    return out_dict

tokenized_datasets = dataset.map(tokenize_function_hiv, batched=True, remove_columns=["smiles","activity", "HIV_active"])

Loading cached processed dataset at e:\molnlp\mol-prop\hf_cache\datasets\csv\default-26cac1113502f8a4\0.0.0\bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a\cache-6e0c8b7c166cf7b4.arrow


In [5]:
ds_split = tokenized_datasets['train'].train_test_split(0.1, shuffle=True, seed=42)

Loading cached split indices for dataset at e:\molnlp\mol-prop\hf_cache\datasets\csv\default-26cac1113502f8a4\0.0.0\bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a\cache-92405330aa29f2fd.arrow and e:\molnlp\mol-prop\hf_cache\datasets\csv\default-26cac1113502f8a4\0.0.0\bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a\cache-10ead6045a503917.arrow


In [6]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 41127
    })
})

In [7]:

model_path = "models/xlnet-smiles-600000-10/checkpoint-60000"
model = XLNetForSequenceClassification.from_pretrained(model_path, config=model_path, num_labels=2, n_layer=12)
"""
model_config = XLNetConfig(
    vocab_size=tokenizer.vocab_size,
    n_layer=6,
    bi_data=True,
    num_labels=2
)
model = XLNetForSequenceClassification(model_config)
"""

Some weights of the model checkpoint at models/xlnet-smiles-600000-10/checkpoint-60000 were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at models/xlnet-smiles-600000-10/checkpoint-60000 and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model 

'\nmodel_config = XLNetConfig(\n    vocab_size=tokenizer.vocab_size,\n    n_layer=6,\n    bi_data=True,\n    num_labels=2\n)\nmodel = XLNetForSequenceClassification(model_config)\n'

In [8]:
training_args = TrainingArguments(
    f"models/xlnet-hiv",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    num_train_epochs=10,
    save_strategy='epoch',
    eval_accumulation_steps=1,
)

In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_split['train'],
    eval_dataset=ds_split['test']
)

In [10]:
trainer.train()

***** Running training *****
  Num examples = 37014
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 185070
  0%|          | 1/185070 [00:01<64:06:20,  1.25s/it]

RuntimeError: CUDA out of memory. Tried to allocate 26.00 MiB (GPU 0; 6.00 GiB total capacity; 4.45 GiB already allocated; 0 bytes free; 4.62 GiB reserved in total by PyTorch)