In [1]:
import torch
import numpy as np
from datasets import list_datasets, load_dataset, list_metrics, load_metric
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import os

In [2]:
def target_offset(examples):
    """Shift every entry of ``examples["label"]`` down by one.

    The mapping is modified in place (its ``"label"`` entry is replaced by a
    new list) and the same mapping object is returned.
    """
    shifted = [label - 1 for label in examples["label"]]
    examples["label"] = shifted
    return examples

def load_data(name):
    """Load a named dataset and report how many labels it has.

    Args:
        name: Dataset identifier. Only ``"imdb"`` is handled.

    Returns:
        A ``(dataset, num_labels)`` tuple: the dataset returned by
        ``load_dataset`` after ``shuffle(seed=0)``, and the number of labels
        (2 for ``"imdb"``).

    Raises:
        ValueError: If ``name`` is not a supported dataset. Previously this
            fell through and crashed with an ``UnboundLocalError``.
    """
    if name == "imdb":
        dataset = load_dataset("imdb", ignore_verifications=True)
        num_labels = 2
    else:
        raise ValueError(
            "unsupported dataset %r; supported datasets: 'imdb'" % (name,)
        )

    # Fixed seed so the shuffled order is the same on every run.
    dataset = dataset.shuffle(seed=0)

    return dataset, num_labels

In [3]:
# Lower-case spellings accepted by bool_flag for False / True respectively.
FALSY_STRINGS = {'off', 'false', '0'}
TRUTHY_STRINGS = {'on', 'true', '1'}

def bool_flag(s):
    """
    Parse boolean arguments from the command line.

    Matching is case-insensitive against FALSY_STRINGS / TRUTHY_STRINGS.

    Args:
        s: The string to interpret.

    Returns:
        False if ``s`` is a falsy spelling, True if it is a truthy spelling.

    Raises:
        argparse.ArgumentTypeError: If ``s`` matches neither set.
    """
    # Local import: argparse is not otherwise in scope in the visible cells,
    # so without it the error branch below raised NameError instead.
    import argparse

    value = s.lower()
    if value in FALSY_STRINGS:
        return False
    if value in TRUTHY_STRINGS:
        return True
    raise argparse.ArgumentTypeError("invalid value for a boolean flag")

In [4]:
def compute_metrics(eval_pred):
    """Compute classification accuracy from a ``(predictions, labels)`` pair.

    If the predictions are wrapped in a plain tuple, only its first element is
    used. The predicted class is the argmax along axis 1, and the accuracy is
    the mean of the element-wise comparison with ``labels``.

    Returns:
        A dict with the single key ``'accuracy'``.
    """
    scores, labels = eval_pred
    # Exact-type check: only a plain tuple is unwrapped.
    if type(scores) == tuple:
        scores = scores[0]
    predicted_classes = np.argmax(scores, axis=1)
    hits = predicted_classes == labels
    return {'accuracy': np.mean(hits)}

In [5]:
# Load the 'imdb' dataset via load_data, which also returns its label count.
dataset, num_labels = load_data('imdb')



In [6]:
# Load the pretrained 'gpt2' tokenizer (fast implementation requested) and a
# sequence-classification model built from the same checkpoint, configured
# with num_labels=num_labels.
tokenizer = AutoTokenizer.from_pretrained('gpt2', use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained('gpt2', num_labels=num_labels)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Pad on the right and reuse the tokenizer's EOS token as its padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
# Mirror that choice on the model side: the config's pad id becomes its own
# EOS id (presumably the same id the tokenizer uses -- TODO confirm).
model.config.pad_token_id = model.config.eos_token_id

In [8]:
# Column holding the raw text, and the split used for evaluation.
text_key = 'text'
testset_key = 'test'


def preprocess_function(examples):
    """Tokenize ``examples[text_key]`` with truncation enabled and max_length=256."""
    return tokenizer(examples[text_key], truncation=True, max_length=256)

In [9]:
# Apply preprocess_function to `dataset` in batched mode to obtain the
# tokenized version used for training and evaluation.
encoded_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [14]:
 # Training configuration: 5 epochs, batch size 16 per device for both
 # training and evaluation, evaluation after every epoch, checkpoints written
 # under ./checkpoint/.
 # NOTE(review): load_best_model_at_end is not enabled, so the model left in
 # memory after training is the one from the last step; metric_for_best_model
 # presumably has no effect without that flag -- confirm against the
 # TrainingArguments documentation.
 train_args = TrainingArguments(
     output_dir="./checkpoint/",
     num_train_epochs=5,
     per_device_train_batch_size=16,
     per_device_eval_batch_size=16,
     learning_rate=2e-5,
     weight_decay=0.01,
     evaluation_strategy="epoch",
     metric_for_best_model="accuracy",
     disable_tqdm=False,
 )

In [15]:
# Wire the model, the training configuration, the tokenized train/eval splits
# and the accuracy metric into a single Trainer.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[testset_key],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [16]:
# Run fine-tuning; the returned TrainOutput is shown as the cell's output.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.2422,0.237511,0.91468
2,0.1952,0.230315,0.92164
3,0.161,0.2598,0.92344
4,0.1319,0.30377,0.92412
5,0.108,0.332518,0.92452


TrainOutput(global_step=7815, training_loss=0.16937652290134345, metrics={'train_runtime': 2083.2251, 'train_samples_per_second': 60.003, 'train_steps_per_second': 3.751, 'total_flos': 1.6330916263624704e+16, 'train_loss': 0.16937652290134345, 'epoch': 5.0})

In [None]:
# Evaluate with the trainer and display the returned value.
# NOTE(review): this cell has no execution count, so it appears not to have
# been run in the saved notebook.
trainer.evaluate()

In [21]:
# Save the fine-tuned weights (state_dict only) to
# ./results/gpt2_imdb_finetune.pth.
results_dir = "./results/"
suffix = '_finetune'
checkpoint_name = "%s_%s%s.pth" % ("gpt2".replace('/', '-'), "imdb", suffix)

# torch.save does not create missing parent directories, so create the
# results directory first rather than failing after training has finished.
os.makedirs(results_dir, exist_ok=True)
torch.save(model.state_dict(), os.path.join(results_dir, checkpoint_name))