In [2]:
import os
os.environ['HF_HOME'] = os.path.join(os.getcwd(), 'hf_cache')
from transformers import XLNetConfig, XLNetModel, XLNetTokenizer, XLNetLMHeadModel, XLNetForSequenceClassification
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
import utils
from datasets import load_metric
from transformers.data.data_collator import DataCollatorWithPadding
import numpy as np
from tqdm import tqdm

In [3]:
tokenizer = XLNetTokenizer(vocab_file='models/smiles_sp.model',
                           do_lower_case=False,
                           keep_accents=True
                           )

In [4]:
dataset = load_dataset('csv', data_files=['data/hiv/HIV.csv'])

Using custom data configuration default-61b52d1e11be2e70
Reusing dataset csv (e:\molnlp\mol-prop\hf_cache\datasets\csv\default-61b52d1e11be2e70\0.0.0\bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)
100%|██████████| 1/1 [00:00<00:00, 499.62it/s]


In [5]:
max_len = utils.compute_max_length(dataset, 'smiles', tokenizer, 'data/hiv/hiv_len.txt')

def tokenize_function_hiv(examples):
    out_dict = tokenizer(examples["smiles"])
    out_dict['label'] = [int(x) for x in examples['HIV_active']]
    return out_dict

tokenized_datasets = dataset.map(tokenize_function_hiv, batched=True, remove_columns=["smiles","activity", "HIV_active"])

Loading cached processed dataset at e:\molnlp\mol-prop\hf_cache\datasets\csv\default-61b52d1e11be2e70\0.0.0\bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a\cache-afee11334f9dac35.arrow


In [6]:
# Seems like trainer errors on odd splits. 
test_num = len(tokenized_datasets['train']) // 10
if test_num % 2 == 1:
    test_num -= 1
ds_split = tokenized_datasets['train'].train_test_split(test_num, shuffle=True, seed=42)
#ds_split = tokenized_datasets['train'].shard(300,1).train_test_split(0.1, shuffle=True, seed=42)

Loading cached split indices for dataset at e:\molnlp\mol-prop\hf_cache\datasets\csv\default-61b52d1e11be2e70\0.0.0\bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a\cache-1d7a3e6967112968.arrow and e:\molnlp\mol-prop\hf_cache\datasets\csv\default-61b52d1e11be2e70\0.0.0\bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a\cache-1bed92519eac4f6d.arrow


In [7]:
ds_split

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'token_type_ids'],
        num_rows: 37014
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'token_type_ids'],
        num_rows: 4112
    })
})

In [8]:

model_path = "models/xlnet-smiles-s2000000-b70-l4/checkpoint-5714"
model = XLNetForSequenceClassification.from_pretrained(model_path, config=model_path, num_labels=2)
"""
model_config = XLNetConfig(
    vocab_size=tokenizer.vocab_size,
    n_layer=6,
    bi_data=True,
    num_labels=2
)
model = XLNetForSequenceClassification(model_config)
"""

Some weights of the model checkpoint at models/xlnet-smiles-s2000000-b70-l4/checkpoint-5714 were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at models/xlnet-smiles-s2000000-b70-l4/checkpoint-5714 and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight']
You should probably TRAIN t

'\nmodel_config = XLNetConfig(\n    vocab_size=tokenizer.vocab_size,\n    n_layer=6,\n    bi_data=True,\n    num_labels=2\n)\nmodel = XLNetForSequenceClassification(model_config)\n'

In [9]:
training_args = TrainingArguments(
    f"models/xlnet-hiv",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8, # has to be even?
    per_device_eval_batch_size=30,
    num_train_epochs=10,
    save_strategy='epoch',
)

In [10]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [11]:
collator = DataCollatorWithPadding(tokenizer)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_split['train'],
    eval_dataset=ds_split['test'],
    compute_metrics=compute_metrics,
    data_collator=collator,
)

In [13]:
print(trainer.evaluate())

***** Running Evaluation *****
  Num examples = 4112
  Batch size = 30
100%|██████████| 138/138 [00:28<00:00,  4.89it/s]

{'eval_loss': 1.0527033805847168, 'eval_accuracy': 0.03526264591439689, 'eval_runtime': 28.8697, 'eval_samples_per_second': 142.433, 'eval_steps_per_second': 4.78}





In [14]:
trainer.train()

***** Running training *****
  Num examples = 37014
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 46270
  1%|          | 500/46270 [00:51<1:12:43, 10.49it/s]

{'loss': 0.1744, 'learning_rate': 1.978387724227361e-05, 'epoch': 0.11}


  2%|▏         | 1001/46270 [01:45<1:24:36,  8.92it/s]

{'loss': 0.1688, 'learning_rate': 1.9567754484547224e-05, 'epoch': 0.22}


  3%|▎         | 1243/46270 [02:09<1:10:25, 10.66it/s]