In [5]:
%pip install -U "transformers>=4.30" "accelerate>=0.26" torch datasets scikit-learn numpy pandas

Note: you may need to restart the kernel to use updated packages.


In [6]:
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict


In [7]:
name = "DeepPavlov/rubert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(name,
        id2label = { 0: "0", 1: "1", 2: "2", 3: "3"},
        label2id = { "0": 0, "1": 1, "2": 2, "3": 3},           
    )
tok = AutoTokenizer.from_pretrained(name, model_max_length=512)
train = pd.read_csv("../data/output/train.csv")
train = train.rename(columns={"type": "labels"})
test = pd.read_csv("../data/output/test.csv")
test = train.rename(columns={"type": "labels"})
data = DatasetDict({
    "train":  Dataset.from_pandas(train[["text","labels"]]),
    "test": Dataset.from_pandas(test[["text","labels"]])
})

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def preprocessing(raw):
    batch = tok (raw["text"],
        truncation=True,
        max_length=512,
        padding=False 
    )
    return batch

In [9]:
tokenized_dataset = data.map(preprocessing, batched = True)

Map: 100%|██████████| 14962/14962 [00:01<00:00, 8881.15 examples/s]
Map: 100%|██████████| 14962/14962 [00:00<00:00, 24642.97 examples/s]


In [10]:
data_collator = DataCollatorWithPadding(tokenizer=tok)

In [11]:
import transformers
print("Transformers version:", transformers.__version__)

Transformers version: 4.57.1


In [19]:
args = TrainingArguments(
    output_dir = "../data/output",
    eval_strategy = "epoch",
    save_strategy="epoch",
    logging_strategy="steps", 
    logging_steps=20, 
    logging_first_step = True,
    learning_rate = 2e-5,
    per_device_train_batch_size = 8,
    num_train_epochs = 3,
    weight_decay = 0.01,
    load_best_model_at_end = True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

In [20]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis = -1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }

In [21]:
trainer = Trainer(
    model = model,
    args = args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["test"],
    tokenizer = tok,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
preds = trainer.predict(tokenized_dataset["test"])
pred_labels = preds.predictions.argmax(axis = -1)
pred_labels