In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install datasets transformers evaluate



In [3]:
from datasets import load_dataset, Dataset
from typing import List, Tuple
import evaluate
import numpy as np
from sklearn.metrics import classification_report
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from transformers import pipeline

# Checkpoint to fine-tune and the dataset/language configuration to train on.
MODEL_NAME = 'DeepPavlov/rubert-base-cased'
DATASET_NAME = 'Davlan/sib200'
DATASET_LANGUAGE = 'rus_Cyrl'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the three splits in a fixed order: train, validation, test.
train_set, validation_set, test_set = (
    load_dataset(DATASET_NAME, DATASET_LANGUAGE, split=split_name)
    for split_name in ('train', 'validation', 'test')
)
print(train_set)

Dataset({
    features: ['index_id', 'category', 'text'],
    num_rows: 701
})


In [4]:
MINIBATCH_SIZE = 32
MAX_SEQ_LENGTH = 128  # texts longer than this many tokens are truncated


def tokenize_batch(batch):
    """Tokenize the 'text' column of a batch, truncating to MAX_SEQ_LENGTH tokens.

    No padding is applied here. The notebook passes a DataCollatorWithPadding
    to the Trainer, which pads every mini-batch to the length of its own
    longest sequence, so padding each example up to MAX_SEQ_LENGTH in advance
    only adds pad tokens the model has to process.
    """
    return tokenizer(batch['text'], truncation=True, max_length=MAX_SEQ_LENGTH)


# One shared function keeps train and validation preprocessing identical.
tokenized_train_set = train_set.map(tokenize_batch, batched=True)
tokenized_validation_set = validation_set.map(tokenize_batch, batched=True)
print(tokenized_train_set)

Dataset({
    features: ['index_id', 'category', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 701
})


In [5]:
cls_metric = evaluate.load('f1')


def compute_metrics(eval_pred):
    """Return the macro-averaged F1 for one evaluation pass.

    ``eval_pred`` unpacks into a pair: per-class scores and reference label
    ids. The predicted class of each example is the argmax along axis 1.
    """
    scores, reference_ids = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return cls_metric.compute(
        predictions=predicted_ids,
        references=reference_ids,
        average='macro',
    )

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Label vocabulary: every category name seen in any split, in sorted order,
# so the name <-> id mapping does not depend on which split is read first.
list_of_categories = sorted(
    set().union(train_set['category'], validation_set['category'], test_set['category'])
)
n_categories = len(list_of_categories)
indices_of_categories = list(range(n_categories))
id2label = dict(enumerate(list_of_categories))
label2id = {category: index for index, category in id2label.items()}

# Attach the integer class id of each example as a 'label' column.
labeled_train_set = tokenized_train_set.add_column(
    'label',
    list(map(label2id.__getitem__, tokenized_train_set['category'])),
)
labeled_validation_set = tokenized_validation_set.add_column(
    'label',
    list(map(label2id.__getitem__, tokenized_validation_set['category'])),
)
print(labeled_train_set)

Dataset({
    features: ['index_id', 'category', 'text', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 701
})


In [6]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): this runs after the imports cell; if either package was already
# imported, a kernel restart may be needed for the pinned version to take
# effect - consider moving these pins into the first install cell.
%pip install protobuf==4.25.5 wandb==0.18.6



In [7]:
# Load the pretrained encoder with a classification head sized to the label
# set (the head weights are newly initialized, see the load message), then
# move the model to the GPU.
classifier = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=n_categories,
    id2label=id2label,
    label2id=label2id,
)
classifier = classifier.cuda()

# Rebind every parameter to a contiguous version of its tensor.
# NOTE(review): presumably a workaround for checkpoint saving rejecting
# non-contiguous tensors - confirm.
for param in classifier.parameters():
    param.data = param.data.contiguous()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Training configuration: evaluate and checkpoint once per epoch, then restore
# the checkpoint with the best validation macro-F1 when training finishes.
training_args = TrainingArguments(
    output_dir='rubert_sib200',
    learning_rate=5e-5,
    per_device_train_batch_size=MINIBATCH_SIZE,
    per_device_eval_batch_size=MINIBATCH_SIZE,
    num_train_epochs=25,  # increased
    weight_decay=1e-2,
    eval_strategy='epoch',  # current name of the deprecated `evaluation_strategy`
    save_strategy='epoch',
    # Each checkpoint is about 2 GB (see the upload log), and one is written per
    # epoch. Keep only the most recent ones; the best checkpoint is still
    # retained because load_best_model_at_end is enabled.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,  # added: a higher F1 means a better model
)



In [9]:
# Model used to produce pseudo-labels.
# NOTE(review): the load message for this checkpoint reports that all
# classification_head.* weights are newly initialized, i.e. the head is
# untrained, so its scores carry no information about the categories.
# Presumably its label names also will not match the keys of label2id -
# confirm, and consider pseudo-labeling with a fine-tuned classifier instead.
pseudo_labeling_model_name = "cointegrated/rut5-base"
pseudo_labeling_model = AutoModelForSequenceClassification.from_pretrained(
    pseudo_labeling_model_name
)
pseudo_labeling_model = pseudo_labeling_model.cuda()
pseudo_labeling_tokenizer = AutoTokenizer.from_pretrained(pseudo_labeling_model_name)

def apply_pseudo_labeling(unlabeled_dataset, threshold=0.8, model=None, tokenizer=None):
    """Label a dataset with confident model predictions and drop the rest.

    Every text is prefixed with "текст: " and classified. An example is kept
    only when the score of its predicted label is at least ``threshold`` and
    the predicted label name is a key of the module-level ``label2id``.

    Args:
        unlabeled_dataset: dataset with a 'text' column; a 'label' column is
            added to it.
        threshold: minimum prediction score for a pseudo-label to be accepted.
        model: sequence-classification model to predict with. Defaults to the
            module-level ``pseudo_labeling_model``.
        tokenizer: tokenizer matching ``model``. Defaults to the module-level
            ``pseudo_labeling_tokenizer``.

    Returns:
        The dataset with an integer 'label' column, restricted to the examples
        that received an accepted pseudo-label.
    """
    if model is None:
        model = pseudo_labeling_model
    if tokenizer is None:
        tokenizer = pseudo_labeling_tokenizer

    classification_pipeline = pipeline(
        'text-classification',
        model=model,
        tokenizer=tokenizer,
        # Run wherever the model already lives instead of assuming GPU 0.
        device=model.device,
    )

    texts = [f"текст: {text}" for text in unlabeled_dataset['text']]
    predictions = classification_pipeline(texts)

    # -1 marks "no usable pseudo-label": either the score is below the
    # threshold or the predicted label name is unknown to label2id.
    pseudo_labels = [
        label2id.get(prediction['label'], -1) if prediction['score'] >= threshold else -1
        for prediction in predictions
    ]

    pseudo_labeled_dataset = unlabeled_dataset.add_column('label', pseudo_labels)
    return pseudo_labeled_dataset.filter(lambda example: example['label'] != -1)

# Pseudo-label the test split.
# NOTE(review): no later cell visible here uses the result. If it is ever
# added to the training data, test examples would leak into training -
# confirm the intent.
pseudo_labeled_dataset = apply_pseudo_labeling(test_set)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at cointegrated/rut5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
# Trainer over the labeled train/validation splits; macro-F1 is computed on
# the validation split at every evaluation.
trainer = Trainer(
    model=classifier,
    args=training_args,
    train_dataset=labeled_train_set,
    eval_dataset=labeled_validation_set,
    # `processing_class` is the current name of the deprecated `tokenizer`
    # argument of Trainer.__init__.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Run fine-tuning with the Trainer configured above; per-epoch validation loss
# and F1 are reported in the cell output.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msnytkinaleksandr2[0m ([33msnytkinaleksandr2-ss[0m). Use [1m`wandb login --relogin`[0m to force relogin


ClearML Task: created new task id=a38fa820031147138eee31e1bafef7cd
2024-11-18 20:22:49,037 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/9e8c4b01a416417dbbc804834e1e6868/experiments/a38fa820031147138eee31e1bafef7cd/output/log


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss,F1
1,No log,0.765076,0.796836
2,No log,0.528188,0.818139
3,No log,0.558542,0.839518


2024-11-18 20:23:30,870 - clearml.storage - INFO - Starting upload: C:\Users\snytk\AppData\Local\Temp\model_package.al1irx4q.zip => https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-22.zip
2024-11-18 20:23:40,907 - clearml.storage - INFO - Uploading: 2040.62MB to C:\Users\snytk\AppData\Local\Temp\model_package.al1irx4q.zip


▏                             1% | 16.56/2040.62 MB [00:40<1:29:17,  2.65s/MB]: 

2024-11-18 20:24:24,283 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


▎                             1% | 26.84/2040.62 MB [01:10<1:36:16,  2.87s/MB]: 

2024-11-18 20:24:54,287 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


▍                             2% | 37.45/2040.62 MB [01:40<1:35:04,  2.85s/MB]: 

2024-11-18 20:25:24,289 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


▋                             2% | 47.48/2040.62 MB [02:10<1:37:42,  2.94s/MB]: 

2024-11-18 20:25:54,291 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


▊                             3% | 57.88/2040.62 MB [02:40<1:35:39,  2.89s/MB]: 

2024-11-18 20:26:24,292 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


▉                             3% | 68.05/2040.62 MB [03:10<1:35:31,  2.91s/MB]: 

2024-11-18 20:26:54,297 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


█                             4% | 78.69/2040.62 MB [03:41<1:33:06,  2.85s/MB]: 

2024-11-18 20:27:24,301 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


█▏                            4% | 89.05/2040.62 MB [04:11<1:33:01,  2.86s/MB]: 

2024-11-18 20:27:54,305 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


█▎                            5% | 99.89/2040.62 MB [04:41<1:31:09,  2.82s/MB]: 

2024-11-18 20:28:24,307 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


█▍                           5% | 110.27/2040.62 MB [05:11<1:32:23,  2.87s/MB]: 

2024-11-18 20:28:54,309 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


█▌                           6% | 120.34/2040.62 MB [05:41<1:34:35,  2.96s/MB]: 

2024-11-18 20:29:24,311 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


█▋                           6% | 131.05/2040.62 MB [06:11<1:30:37,  2.85s/MB]: 

2024-11-18 20:29:54,314 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


█▊                           7% | 141.95/2040.62 MB [06:42<1:28:24,  2.79s/MB]: 

2024-11-18 20:30:24,319 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


█▉                           7% | 152.78/2040.62 MB [07:12<1:27:44,  2.79s/MB]: 

2024-11-18 20:30:54,335 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


██                           8% | 163.47/2040.62 MB [07:42<1:27:51,  2.81s/MB]: 

2024-11-18 20:31:24,338 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


██▏                          9% | 174.00/2040.62 MB [08:12<1:28:46,  2.85s/MB]: 

2024-11-18 20:31:54,341 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


██▎                          9% | 179.39/2040.62 MB [08:33<1:45:37,  3.40s/MB]: 

2024-11-18 20:32:24,343 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, https://files.clear.ml/HuggingFace Transformers/Trainer.a38fa820031147138eee31e1bafef7cd/models/checkpoint-66.zip)


██▎                          9% | 179.47/2040.62 MB [08:43<2:28:37,  4.79s/MB]: 

In [None]:
# Inference pipeline around the fine-tuned classifier. The identifier is
# spelled with ASCII letters only, so it can be typed and searched reliably.
classification_pipeline = pipeline('text-classification', model=classifier, tokenizer=tokenizer, device=0)

# Print one classification report per held-out split: validation first, then
# test. The pipeline returns label names, which are compared directly with the
# 'category' column.
for evaluation_split in (validation_set, test_set):
    y_pred = [prediction['label'] for prediction in classification_pipeline(evaluation_split['text'])]
    y_true = evaluation_split['category']
    print(classification_report(y_true=y_true, y_pred=y_pred))