In [1]:
import torch
from torch import nn
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import load_dataset
import mlflow
from os import environ

In [2]:
# Checkpoint to fine-tune and the maximum sequence length the tokenizer is configured with.
model_name = "sergeyzh/rubert-tiny-turbo"
model_max_length = 50   # reasonable length for my dataset

# Binary label vocabulary; the reverse mapping is derived from the forward one
# so the two dictionaries cannot drift apart.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: idx for idx, name in id2label.items()}

# Load the pre-trained tokenizer and the BERT model with a sequence-classification head
tokenizer = BertTokenizer.from_pretrained(model_name, model_max_length=model_max_length)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

config.json:   0%|          | 0.00/712 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/117M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sergeyzh/rubert-tiny-turbo and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Load the JSON dataset and hold out 20% of the rows for evaluation.
# The shuffle seed is fixed so that Restart Kernel -> Run All reproduces the
# same train/test split; without it every run evaluates on different rows and
# metrics are not comparable between runs.
dataset = (
    load_dataset("json", data_files="../datasets/author.json")[
        "train"
    ]
    .train_test_split(test_size=0.2, seed=42)
)

def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Truncation is enabled, so over-long inputs are cut by the tokenizer.
    Padding is not applied here.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize every split, passing examples to the tokenizer in batches.
tokenized_datasets = dataset.map(function=preprocess_function, batched=True)

# Sequences are left unpadded above; this collator pads the examples of each
# batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/113483 [00:00<?, ? examples/s]

Map:   0%|          | 0/28371 [00:00<?, ? examples/s]

In [4]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Turn a prediction object into a dict of evaluation scores.

    ``pred.predictions`` is reduced to class ids with an argmax over the last
    axis and compared against ``pred.label_ids``. Precision, recall and F1 are
    computed with ``average='weighted'``.

    Returns a dict with the keys ``accuracy``, ``f1``, ``precision`` and
    ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    # The fourth element (support) is not reported.
    precision, recall, f1 = weighted_scores[:3]
    # Note: support-weighted recall is mathematically equal to accuracy, so the
    # two values below always coincide.
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
    )

In [5]:
import copy

class CustomTrainer(Trainer):
    """Trainer variant that uses a class-weighted cross-entropy loss.

    The weights live in ``class_weights`` so they can be overridden on a
    subclass or instance for imbalanced data. The default equal weights give
    the same value as the plain, unweighted mean cross-entropy.
    """

    # Per-class loss weights, indexed by label id.
    class_weights = (0.5, 0.5)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: the model to run the forward pass with.
            inputs: batch dict; must contain ``labels`` next to the model inputs.
            return_outputs: when true, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: accepted (and ignored) because newer Trainer
                versions pass it as a keyword argument.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs`` is set.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # for using with imbalanced classes https://discuss.huggingface.co/t/how-can-i-use-class-weights-when-training/1067
        # The weight tensor must live on the same device as the logits,
        # otherwise the loss raises a device-mismatch error on GPU.
        weight = torch.tensor(self.class_weights, dtype=torch.float, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        # Compute the loss in float32 so the dtype matches the weights even
        # when the model produces half-precision logits.
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels).float(), labels.view(-1)
        )
        return (loss, outputs) if return_outputs else loss

    def on_epoch_end(self, args, state, control, **kwargs):
        """Evaluate on the training set, reporting metrics under the ``train`` prefix.

        NOTE: in the transformers API ``on_epoch_end`` is a ``TrainerCallback``
        event. ``Trainer`` does not call a method of this name on itself, so
        this hook only runs when invoked explicitly (for example from a
        registered callback that forwards the event to this trainer).

        Returns a copy of ``control`` taken before the evaluation when
        ``control.should_evaluate`` is set, otherwise ``None``.
        """
        if control.should_evaluate:
            # Snapshot the control flags before evaluate() runs.
            control_copy = copy.deepcopy(control)
            # The trainer evaluates itself; there is no ``_trainer`` attribute
            # on a Trainer instance.
            self.evaluate(
                eval_dataset=self.train_dataset, metric_key_prefix="train"
            )
            return control_copy

In [6]:
# Configure the Hugging Face -> MLflow integration through environment variables.
environ["MLFLOW_EXPERIMENT_NAME"] = "rubert-santilla-oversampled-definit-min3tokens"
# The variable name must match exactly: a trailing space inside the key
# creates a different variable that the integration never reads.
environ["MLFLOW_FLATTEN_PARAMS"] = "True"
# environ["HF_MLFLOW_LOG_ARTIFACTS"] = "False"

In [7]:
# Release cached GPU memory left over from earlier runs in this kernel.
torch.cuda.empty_cache()

training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate, log and checkpoint once per epoch.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=72,
    per_device_eval_batch_size=72,
    learning_rate=1e-4,
    num_train_epochs=7,
    weight_decay=0.05,
    warmup_ratio=0.1,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Close the MLflow run even when training fails or is interrupted; otherwise
# the run stays active in the kernel and the next training attempt would log
# into it.
try:
    trainer.train()
finally:
    mlflow.end_run()

  0%|          | 0/11039 [00:00<?, ?it/s]

{'loss': 0.6105, 'grad_norm': 3.525597095489502, 'learning_rate': 9.523905385002517e-05, 'epoch': 1.0}


  0%|          | 0/395 [00:00<?, ?it/s]

{'eval_loss': 0.5300548672676086, 'eval_accuracy': 0.7249656339219626, 'eval_f1': 0.7239315038611096, 'eval_precision': 0.7279685766049728, 'eval_recall': 0.7249656339219626, 'eval_runtime': 4.7987, 'eval_samples_per_second': 5912.253, 'eval_steps_per_second': 82.314, 'epoch': 1.0}
{'loss': 0.4677, 'grad_norm': 7.616848468780518, 'learning_rate': 7.93658782083543e-05, 'epoch': 2.0}


  0%|          | 0/395 [00:00<?, ?it/s]

{'eval_loss': 0.48414382338523865, 'eval_accuracy': 0.762962179690529, 'eval_f1': 0.7620396995758699, 'eval_precision': 0.7666545774546514, 'eval_recall': 0.762962179690529, 'eval_runtime': 4.6793, 'eval_samples_per_second': 6063.068, 'eval_steps_per_second': 84.414, 'epoch': 2.0}
{'loss': 0.3611, 'grad_norm': 10.366071701049805, 'learning_rate': 6.350276799194767e-05, 'epoch': 3.0}


  0%|          | 0/395 [00:00<?, ?it/s]

{'eval_loss': 0.476492315530777, 'eval_accuracy': 0.783299848436784, 'eval_f1': 0.7831589052471983, 'eval_precision': 0.7838623345735832, 'eval_recall': 0.783299848436784, 'eval_runtime': 6.7825, 'eval_samples_per_second': 4182.965, 'eval_steps_per_second': 58.238, 'epoch': 3.0}
{'loss': 0.2876, 'grad_norm': 5.685488700866699, 'learning_rate': 4.76295923502768e-05, 'epoch': 4.0}


  0%|          | 0/395 [00:00<?, ?it/s]

{'eval_loss': 0.5009725689888, 'eval_accuracy': 0.7935215536991999, 'eval_f1': 0.7931633639430523, 'eval_precision': 0.7952655497404355, 'eval_recall': 0.7935215536991999, 'eval_runtime': 4.6507, 'eval_samples_per_second': 6100.392, 'eval_steps_per_second': 84.934, 'epoch': 4.0}
{'loss': 0.2351, 'grad_norm': 7.960174083709717, 'learning_rate': 3.1766482133870155e-05, 'epoch': 5.0}


  0%|          | 0/395 [00:00<?, ?it/s]

{'eval_loss': 0.5475253462791443, 'eval_accuracy': 0.798949631666138, 'eval_f1': 0.7984637843378217, 'eval_precision': 0.8014956208909574, 'eval_recall': 0.798949631666138, 'eval_runtime': 5.755, 'eval_samples_per_second': 4929.806, 'eval_steps_per_second': 68.636, 'epoch': 5.0}
{'loss': 0.1983, 'grad_norm': inf, 'learning_rate': 1.591343734272773e-05, 'epoch': 6.0}


  0%|          | 0/395 [00:00<?, ?it/s]

{'eval_loss': 0.603981614112854, 'eval_accuracy': 0.8036727644425646, 'eval_f1': 0.8030118716983097, 'eval_precision': 0.8073653170324666, 'eval_recall': 0.8036727644425646, 'eval_runtime': 5.7584, 'eval_samples_per_second': 4926.873, 'eval_steps_per_second': 68.595, 'epoch': 6.0}
{'loss': 0.1738, 'grad_norm': 6.061467170715332, 'learning_rate': 4.0261701056869654e-08, 'epoch': 7.0}


  0%|          | 0/395 [00:00<?, ?it/s]

{'eval_loss': 0.6404132843017578, 'eval_accuracy': 0.8033555391068344, 'eval_f1': 0.8027310640973283, 'eval_precision': 0.806821482472322, 'eval_recall': 0.8033555391068344, 'eval_runtime': 4.8606, 'eval_samples_per_second': 5836.965, 'eval_steps_per_second': 81.266, 'epoch': 7.0}
{'train_runtime': 493.9659, 'train_samples_per_second': 1608.17, 'train_steps_per_second': 22.348, 'train_loss': 0.3334335187539986, 'epoch': 7.0}


In [8]:
# Score the trained model on the trainer's evaluation dataset; the resulting
# metrics dict is displayed as the cell output.
eval_metrics = trainer.evaluate()
eval_metrics

  0%|          | 0/395 [00:00<?, ?it/s]

{'eval_loss': 0.6404132843017578,
 'eval_accuracy': 0.8033555391068344,
 'eval_f1': 0.8027310640973283,
 'eval_precision': 0.806821482472322,
 'eval_recall': 0.8033555391068344,
 'eval_runtime': 4.9179,
 'eval_samples_per_second': 5768.914,
 'eval_steps_per_second': 80.319,
 'epoch': 7.0}

In [9]:
example = "если харрис изберут я реально пойду ленина читать"

# Switch to inference mode.
model.eval()

# Tokenize the same way as during training (with truncation) and move the
# tensors to whatever device the model is on, instead of assuming CUDA.
inputs = tokenizer(example, return_tensors="pt", truncation=True).to(model.device)
with torch.no_grad():
    logits = model(**inputs).logits

# Class probabilities for the single example.
print(nn.functional.softmax(logits, dim=-1))

# argmax over the class axis (not over the flattened tensor).
predicted_class_id = logits.argmax(dim=-1).item()
model.config.id2label[predicted_class_id]

tensor([[0.0032, 0.9968]], device='cuda:0')


'POSITIVE'