In [None]:
%pip install transformers datasets safetensors wandb
for pkg in ["transformers", "datasets", "wandb", "scikit-learn", "tqdm"]:
    %pip install -U {pkg}
%pwd


In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DataCollatorWithPadding, AutoConfig
from datasets import load_dataset

In [None]:
# settings
checkpoint = "distilbert-base-uncased"

# tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)

# Config attributes passed as keyword arguments override the values loaded
# from the checkpoint's own configuration.
# NOTE(review): if the checkpoint ships fewer than 18 transformer layers, the
# additional layers presumably start from random initialisation — confirm this
# is the intended experiment.
model_config = AutoConfig.from_pretrained(
    checkpoint,
    num_labels=50,
    dropout=0.1,
    n_layers=18,
)
model = DistilBertForSequenceClassification.from_pretrained(
    checkpoint,
    config=model_config,
)



# prepare datasets
## retrieve raw data: keep only the fine-grained label, exposed as "labels"
print("\n====> Loading raw dataset from huggingface...\n")
raw_datasets = (
    load_dataset("trec")
    .rename_column("fine_label", "labels")
    .remove_columns("coarse_label")
)
# Preview the splits, the train schema and the first train example.
for preview in (raw_datasets, raw_datasets["train"].features, raw_datasets["train"][0]):
    print(preview)
## tokenize
print("\n====> Tokenizing...\n")
def tokenizer_function(examples):
    """Tokenize one batch of examples for `Dataset.map(..., batched=True)`.

    Args:
        examples: batch mapping that contains a "text" column.

    Returns:
        The tokenizer output for the batch, truncated but NOT padded.
    """
    # Padding is deliberately left to the DataCollatorWithPadding handed to the
    # Trainer, which pads each batch only up to its longest sequence. Padding
    # every example to max_length here would turn that dynamic padding into a
    # no-op and make every batch as long as the model's maximum input.
    return tokenizer(examples["text"], truncation=True)
tokenized_datasets = raw_datasets.map(tokenizer_function, batched=True)
print(tokenized_datasets)
print(tokenized_datasets['train'].features)
print(tokenized_datasets['train'][0])
## init dynamic padding data collator
# The progress message used to repeat "Tokenizing..." although this step only
# builds the collator that pads each batch at collation time.
print("\n====> Initializing data collator...\n")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
# Display the configuration of the instantiated model as this cell's output,
# so the overrides applied when building the model can be checked by eye.
model.config

In [None]:
# training settings
from torch import dropout
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
import wandb

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 / precision / recall.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class of each row
            is the argmax of ``logits`` over its last axis.

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall", each a
        plain Python float (rounded through float32).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    def _as_float(value):
        # np.asarray accepts NumPy scalars/arrays as well as plain Python
        # floats, so this does not depend on which scalar type the metric
        # function returns (a Python float has no `.astype`).
        return np.asarray(value, dtype=np.float32).mean().item()

    scorers = (("f1", f1_score), ("precision", precision_score), ("recall", recall_score))
    macro_scores = {
        # zero_division=0 yields the same value as the default "warn" setting
        # but without emitting a warning for every class that is never
        # predicted / never present at each evaluation.
        name: _as_float(
            scorer(y_true=labels, y_pred=predictions, average="macro", zero_division=0)
        )
        for name, scorer in scorers
    }
    return {"accuracy": _as_float(predictions == labels), **macro_scores}

# The evaluation-schedule keyword is spelled `eval_strategy` in newer
# Transformers releases and `evaluation_strategy` in older ones. The notebook
# installs the latest release unpinned, so pick whichever spelling the
# installed TrainingArguments dataclass actually declares.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# To evaluate/log every 600 steps instead of once per epoch, switch both
# strategies to "steps" and add eval_steps=600, logging_steps=600.
train_args = TrainingArguments(
    run_name="hl18_lr2e-5_bs16_dr0.1",  # name under which the run is reported
    num_train_epochs=5.0,
    learning_rate=2e-5,
    weight_decay=0,
    per_device_train_batch_size=16,  # training batch size
    per_device_eval_batch_size=16,  # evaluation batch size
    optim="adamw_hf",
    save_safetensors=False,
    data_seed=417,
    seed=417,
    save_strategy="no",  # no checkpoints are written during training
    output_dir="./outputs",
    logging_strategy="epoch",
    report_to="wandb",
    **{eval_strategy_key: "epoch"},
)

# Wire the model, training arguments, datasets, collator and metric function
# into a Trainer.
# NOTE(review): recent Transformers releases appear to rename the `tokenizer`
# argument to `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Optional, kept disabled:
# wandb.login()
# !wandb sync --clean

# Release cached CUDA memory before the training run starts.
torch.cuda.empty_cache()
trainer.train()



In [None]:
# Close the active Weights & Biases run before the final evaluation pass.
wandb.finish()

# evaluation: run the trained model over the test split
test_output = trainer.predict(tokenized_datasets["test"])
preds = test_output.predictions
labels = test_output.label_ids
metrics = test_output.metrics
print(preds.shape, labels.shape)

# Recompute the metrics directly from the raw predictions and print them.
test_scores = compute_metrics((preds, labels))
print(test_scores)

# %rm -rf wandb