In [13]:
import wandb

# Start the Weights & Biases run for this experiment. The call stays the last
# expression of the cell so the notebook still renders the returned run handle.
wandb_run_kwargs = {
    "project": "advanced_ai_imdb_dataset",
    "name": "alberta-test-run",
}
wandb.init(**wandb_run_kwargs)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\Asus\.netrc


In [14]:
from datasets import load_dataset


# Official IMDB "train" split (25,000 labelled reviews) used for fine-tuning.
train_set = load_dataset("imdb", split="train")

In [15]:
# Re-divide the official IMDB "test" split with a fixed seed and label
# stratification: 20% ends up in val_test["train"] (used as validation data
# further down) and the remaining 80% in val_test["test"] (held-out test data).
test_set = load_dataset("imdb", split="test")
split_options = dict(test_size=0.8, seed=42, stratify_by_column="label")
val_test = test_set.train_test_split(**split_options)

In [16]:
# Show the features and row counts of the training data and of the
# validation/test split, in that order.
for split_summary in (train_set, val_test):
    print(split_summary)

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
})


In [17]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and classifier are loaded from the same checkpoint, so the name is
# written once. The sequence-classification head is newly initialised (see the
# warning printed below) and only becomes useful after fine-tuning.
checkpoint_name = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
def preprocess_token(example):
    """Tokenize the ``text`` field of one example or a batch of examples.

    Written to be passed to ``Dataset.map(..., batched=True)``, in which case
    ``example["text"]`` is a list of review strings.

    Args:
        example: Mapping with a ``"text"`` entry (a string or a list of strings).

    Returns:
        The tokenizer output for the given text, truncated to the tokenizer's
        maximum length and left unpadded.
    """
    # Only truncate here. Padding every review to the model maximum up front
    # makes each batch as expensive as the longest possible input; the
    # DataCollatorWithPadding handed to the Trainer pads each batch to its own
    # longest sequence instead.
    # No ``return_tensors``: ``Dataset.map`` stores plain lists, and unpadded
    # (ragged) batches cannot be stacked into a single tensor anyway.
    return tokenizer(example["text"], truncation=True)

In [19]:
# Apply the same batched tokenization to all three splits: the training data,
# the validation part (val_test["train"]) and the test part (val_test["test"]).
tokenized_train, tokenized_val, tokenized_test = (
    raw_split.map(preprocess_token, batched=True)
    for raw_split in (train_set, val_test["train"], val_test["test"])
)

Map: 100%|██████████| 25000/25000 [00:12<00:00, 2070.83 examples/s]
Map: 100%|██████████| 5000/5000 [00:02<00:00, 1778.26 examples/s]
Map: 100%|██████████| 20000/20000 [00:11<00:00, 1688.44 examples/s]


In [20]:
from transformers import DataCollatorWithPadding

# Collator built around the tokenizer; it is handed to the Trainer below as
# ``data_collator`` to assemble (and pad) each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
import evaluate

# Load the four classification metrics reported during evaluation. They are
# loaded in the same order as they are named on the left-hand side.
accuracy, f1, precision, recall = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

Downloading builder script: 100%|██████████| 6.77k/6.77k [00:00<?, ?B/s]
Downloading builder script: 100%|██████████| 7.55k/7.55k [00:00<00:00, 14.4MB/s]
Downloading builder script: 100%|██████████| 7.36k/7.36k [00:00<00:00, 7.42MB/s]


In [22]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` as supplied by the Trainer.
            ``predictions`` holds one row of class scores per example and
            ``labels`` the reference class ids.

    Returns:
        Flat dict mapping ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"`` to scalar scores.
    """
    predictions, labels = eval_pred
    # Turn per-class scores into the predicted class id for each example.
    predictions = np.argmax(predictions, axis=1)

    metrics_by_name = {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }
    # Each ``evaluate`` metric returns a one-entry dict such as
    # {"accuracy": 0.93}. Pull the scalar out so the Trainer logs plain
    # numbers (eval_accuracy, eval_f1, ...) rather than nested dicts.
    return {
        name: metric.compute(predictions=predictions, references=labels)[name]
        for name, metric in metrics_by_name.items()
    }

In [23]:
# Human-readable names for the two sentiment classes of the IMDB labels.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: class_id for class_id, name in id2label.items()}

# Attach the mappings to the model configuration. Without this they are never
# used, and the fine-tuned checkpoint keeps its generic default label names.
model.config.id2label = id2label
model.config.label2id = label2id

In [24]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Evaluation and checkpointing share one cadence. ``load_best_model_at_end``
# needs a checkpoint at every evaluation step, so both settings are driven by
# a single constant instead of relying on the ``save_steps`` default happening
# to match ``eval_steps``.
EVAL_SAVE_STEPS = 500

training_args = TrainingArguments(
    output_dir="albert_imdb",
    overwrite_output_dir=True,
    # Optimisation
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,  # upper bound; early stopping may end training sooner
    weight_decay=0.01,
    # Evaluation / checkpoint schedule
    # NOTE(review): newer transformers releases call this ``eval_strategy`` --
    # confirm against the installed version before renaming.
    evaluation_strategy="steps",
    eval_steps=EVAL_SAVE_STEPS,
    save_strategy="steps",
    save_steps=EVAL_SAVE_STEPS,
    save_total_limit=1,
    # Best-model selection (also the metric watched by EarlyStoppingCallback).
    # Stated explicitly so the selection criterion is visible in the notebook.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="wandb",
    # push_to_hub=True,
)

# Stop training once the monitored evaluation metric has failed to improve
# for three consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Fine-tune on the training split and evaluate on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    # data
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # batching
    tokenizer=tokenizer,
    data_collator=data_collator,
    # reporting and control
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [25]:
# Run fine-tuning. Evaluation happens every ``eval_steps`` steps, and the
# early-stopping callback may end training before ``num_train_epochs``.
trainer.train()



In [None]:
# Mark the W&B run opened by ``wandb.init(...)`` as finished.
wandb.finish()