In [None]:
%pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
import os
from torch import nn
from torch.utils.data import DataLoader, WeightedRandomSampler
from transformers import AutoTokenizer, AutoModel, BertForSequenceClassification
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from peft import get_peft_model
from peft import LoraConfig, TaskType
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import evaluate
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from datasets import DatasetDict, Dataset, load_dataset

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Show which device was selected.
device

'cuda'

In [None]:
# Read `rusentiment_test.csv` from the current working directory.
sentiment_df = pd.read_csv("rusentiment_test.csv")

In [None]:
# Keep only the three target classes. `.copy()` makes the result an independent
# frame, so assigning to its columns later does not raise SettingWithCopyWarning.
sentiment_df = sentiment_df[
    sentiment_df["label"].isin(["positive", "negative", "neutral"])
].copy()

In [None]:
# File names of the dataset's splits on the Hugging Face Hub.
splits = {"train": "train.csv", "validation": "valid.csv"}
# Only the training split is read here.
df = pd.read_csv(f"hf://datasets/MonoHime/ru_sentiment_dataset/{splits['train']}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Drop the index column saved into the CSV. The non-inplace form with
# errors="ignore" keeps this cell re-runnable (no KeyError on a second run).
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Rename the two remaining columns positionally, then preview the first rows.
df.columns = ["text", "label"]
df.head()

Unnamed: 0,text,label
0,".с.,и спросил его: о Посланник Аллаха!Ты пори...",1
1,Роднее всех родных Попала я в ГКБ №8 еще в дек...,1
2,Непорядочное отношение к своим работникам Рабо...,2
3,"). Отсутствуют нормативы, Госты и прочее, что ...",1
4,У меня машина в руках 5 лет и это п...,1


In [None]:
# Keep the fitted encoder so integer ids can be mapped back to class names
# (label_encoder.classes_ / label_encoder.inverse_transform) when reading predictions.
label_encoder = LabelEncoder()
sentiment_df["label"] = label_encoder.fit_transform(sentiment_df["label"])

In [None]:
# Fixed seed so the train/validation split is the same on every run.
train_df, val_df = train_test_split(sentiment_df, random_state=42)

In [None]:
# preserve_index=False stops the shuffled pandas index from being stored as an
# extra `__index_level_0__` column in the resulting datasets.
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "test": Dataset.from_pandas(val_df, preserve_index=False),
})

In [None]:
# Load the tokenizer published with the `ai-forever/rubert-large` checkpoint.
tokenizer = AutoTokenizer.from_pretrained("ai-forever/rubert-large")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/591 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

In [None]:
def tokenize_function(examples):
  """Tokenize a batch of examples for the classifier.

  Args:
    examples: batch mapping whose "text" entry holds the raw strings
      (the form `Dataset.map(..., batched=True)` passes in).

  Returns:
    Whatever the module-level `tokenizer` returns for those texts, truncated
    to at most 512 tokens per text.
  """
  # No padding here: the DataCollatorWithPadding used by the Trainer pads each
  # batch to its own longest sequence. Padding every row to 512 at this stage
  # would make that dynamic padding a no-op and slow training on short texts.
  return tokenizer(examples["text"], max_length=512, truncation=True)

In [None]:
# Tokenize every split; batched=True hands tokenize_function whole batches of rows.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

NameError: name 'dataset' is not defined

In [None]:
# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest",  # Pad to the longest sequence in the batch
)

In [None]:
# Pretrained encoder with a freshly initialised 3-way classification head.
model = BertForSequenceClassification.from_pretrained("ai-forever/rubert-large", num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/rubert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# LoRA adapter settings for a sequence-classification task.
lora_config = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.1, task_type=TaskType.SEQ_CLS)

In [None]:
# Wrap the base model with the adapters described by `lora_config`.
# NOTE: `model` is rebound here, so re-running this cell alone wraps an already
# wrapped model; re-create `model` first.
model = get_peft_model(model, lora_config)

In [None]:
# Load the "precision" metric through the `evaluate` library.
metric = evaluate.load("precision")

In [None]:
def compute_metrics(eval_pred):
  """Score one evaluation pass with the module-level `metric`.

  Args:
    eval_pred: pair `(logits, labels)`.

  Returns:
    The result of `metric.compute` with macro averaging, using the argmax of
    the logits over the last axis as the predicted class.
  """
  scores, references = eval_pred
  predicted_ids = np.argmax(scores, axis=-1)
  return metric.compute(
      average="macro",
      references=references,
      predictions=predicted_ids,
  )

In [None]:
# Hyper-parameters for the first fine-tuning run.
training_args = TrainingArguments(
    output_dir="test_trainer",
    eval_strategy="epoch",  # evaluate once per epoch (`evaluation_strategy` is the deprecated spelling)
    num_train_epochs=50,
    weight_decay=0.01,
    warmup_ratio=0.1,  # warm up over the first 10% of steps
    fp16=torch.cuda.is_available(),  # mixed precision saves memory but needs a GPU
    logging_steps=50,
    per_device_train_batch_size=8,
    learning_rate=2e-5
)



In [None]:
# Wire the model, data, collator and metric function into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

In [None]:
# Inspect the tokenized training split (columns and row count).
tokenized_datasets["train"]

Dataset({
    features: ['label', 'text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1660
})

In [None]:
# Start fine-tuning with the settings in `training_args`.
trainer.train()

Epoch,Training Loss,Validation Loss


## Обучение на нормальных датасетах

In [None]:
# Read five parquet files from the Hugging Face Hub (one per dataset repository).
first_dataset = pd.read_parquet("hf://datasets/Megnis/RuSentimentUnion-9000/data/train-00000-of-00001.parquet")
second_dataset = pd.read_parquet("hf://datasets/Megnis/ru_sentiment_dataset-20000/data/train-00000-of-00001.parquet")
third_dataset = pd.read_parquet("hf://datasets/Megnis/AI-Learning-Lab-dataset-clean-800/data/train-00000-of-00001.parquet")
# The two frames below are used only for evaluation further down.
val_dataset = pd.read_parquet("hf://datasets/Megnis/AI-Learning-Lab-dataset-clean-35/data/train-00000-of-00001.parquet")
val_100_dataset = pd.read_parquet("hf://datasets/Megnis/AI-Learning-Lab-dataset-clean-100/data/train-00000-of-00001.parquet")

In [None]:
# Drop the saved index column. Non-inplace with errors="ignore" so that
# re-running this cell does not raise KeyError.
second_dataset = second_dataset.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Rename the remaining columns positionally.
second_dataset.columns = ["text", "label"]

In [None]:
# Remove the metadata columns. Non-inplace with errors="ignore" keeps the cell
# re-runnable; if the columns were not actually dropped, the rename below fails
# loudly on the length mismatch.
third_dataset = third_dataset.drop(columns=["UserSenderId", "SubmitDate"], errors="ignore")
third_dataset.columns = ["label", "text"]

In [None]:
# Remove the metadata columns. Non-inplace with errors="ignore" keeps the cell
# re-runnable; if the columns were not actually dropped, the rename below fails
# loudly on the length mismatch.
val_dataset = val_dataset.drop(columns=["UserSenderId", "SubmitDate"], errors="ignore")
val_dataset.columns = ["label", "text"]

In [None]:
# The column names were misspelled ("UserSe0derId", "Su2mitDate"), which makes
# drop() raise KeyError; use the spelling the other cleanup cells use.
val_100_dataset = val_100_dataset.drop(columns=["UserSenderId", "SubmitDate"], errors="ignore")
# NOTE(review): the other frames are renamed to ["label", "text"]; confirm that
# this file really stores the text column first.
val_100_dataset.columns = ["text", "label"]

In [None]:
# Fixed seed so each train/validation split is the same on every run.
first_dataset_train, first_dataset_val = train_test_split(first_dataset, random_state=42)
second_dataset_train, second_dataset_val = train_test_split(second_dataset, random_state=42)
third_dataset_train, third_dataset_val = train_test_split(third_dataset, random_state=42)

In [None]:
# Stack the held-out parts of the first two datasets into one validation frame
# (`third_dataset_val` is not included here).
val_big_dataset = pd.concat([first_dataset_val, second_dataset_val])

In [None]:
# first_dataset_train["weight"] = 1
# second_dataset_train["weight"] = 1
# third_dataset_train["weight"] = 3

In [None]:
# Training pool: the train parts of all three datasets stacked row-wise.
# Note that this rebinds `df`, replacing any earlier frame of that name.
df = pd.concat([first_dataset_train, second_dataset_train, third_dataset_train])

In [None]:
# Check the (rows, columns) size of the training pool.
df.shape

(22535, 2)

In [None]:
# Train on a random 5000-row subset; the fixed seed makes the subset reproducible.
df = df.sample(5000, random_state=42)

In [None]:
# preserve_index=False stops the shuffled pandas index from being stored as an
# extra `__index_level_0__` column.
dataset = DatasetDict({
    "train": Dataset.from_pandas(df, preserve_index=False),
    "test_small": Dataset.from_pandas(val_dataset, preserve_index=False),
    "test_big": Dataset.from_pandas(val_big_dataset, preserve_index=False),
    # Evaluate on the held-out part of the third dataset. Its train part feeds
    # the training pool, so scoring on it would report performance on training data.
    "test_800": Dataset.from_pandas(third_dataset_val, preserve_index=False),
    "test_100_val": Dataset.from_pandas(val_100_dataset, preserve_index=False)
})

In [None]:
# Tokenize every split; batched=True hands tokenize_function whole batches of rows.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

Map:   0%|          | 0/7303 [00:00<?, ? examples/s]

Map:   0%|          | 0/626 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest",  # Pad to the longest sequence in the batch
)

In [None]:
# Start again from the pretrained checkpoint with a fresh 3-way classification head.
model = BertForSequenceClassification.from_pretrained("ai-forever/rubert-large", num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/rubert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# LoRA adapter settings for this run (r=8 here).
lora_config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1, task_type=TaskType.SEQ_CLS)

In [None]:
# Wrap the base model with the adapters described by `lora_config`.
# NOTE: `model` is rebound here, so re-running this cell alone wraps an already
# wrapped model; re-create `model` first.
model = get_peft_model(model, lora_config)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: pair `(predictions, labels)`; the predicted class is the
            argmax of `predictions` over the last axis.

    Returns:
        Dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    predictions, labels = eval_pred
    # Take the argmax once and reuse it for every metric.
    predicted_ids = np.argmax(predictions, axis=-1)
    accuracy = accuracy_score(labels, predicted_ids)
    # zero_division=0 yields the same scores as the default but without the
    # undefined-metric warning when a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted_ids, average="macro", zero_division=0
    )
    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }

In [None]:
class WeightedTrainer(Trainer):
    """Trainer whose training batches come from the weighted-sampling loader."""

    def get_train_dataloader(self):
        """Return the loader that `get_weighted_train_dataloader` builds for `self.train_dataset`."""
        weighted_loader = get_weighted_train_dataloader(self.train_dataset)
        return weighted_loader

In [None]:
def get_weighted_train_dataloader(train_dataset):
    """Build a training DataLoader that samples rows in proportion to a `weight` column.

    Args:
        train_dataset: tokenized dataset that supports `train_dataset["weight"]`
            (one weight per row) and integer row indexing.

    Returns:
        A `DataLoader` driven by a `WeightedRandomSampler` that draws
        `len(weights)` rows per pass with replacement. Batches hold
        `training_args.per_device_train_batch_size` rows and are assembled by
        the module-level `data_collator`.

    Raises:
        KeyError: if the dataset lists its columns and `weight` is not one of them.
    """
    column_names = getattr(train_dataset, "column_names", None)
    if column_names is not None and "weight" not in column_names:
        raise KeyError(
            "train_dataset has no 'weight' column; add per-row sampling weights "
            "before using the weighted dataloader"
        )

    weights = np.asarray(train_dataset["weight"], dtype=np.float64)
    sampler = WeightedRandomSampler(
        weights=weights,
        num_samples=len(weights),
        replacement=True
    )

    # This loader replaces the one Trainer would build, so Trainer's own removal
    # of unused columns never runs. Strip the bookkeeping columns here; otherwise
    # raw strings and the weights themselves reach the collator and the model.
    loader_dataset = train_dataset
    if column_names is not None and hasattr(train_dataset, "remove_columns"):
        extra_columns = [
            name for name in ("weight", "text", "__index_level_0__")
            if name in column_names
        ]
        if extra_columns:
            loader_dataset = train_dataset.remove_columns(extra_columns)

    return DataLoader(
        loader_dataset,
        sampler=sampler,
        batch_size=training_args.per_device_train_batch_size,
        collate_fn=data_collator,
    )

In [None]:
# Hyper-parameters for the run on the combined datasets.
training_args = TrainingArguments(
    output_dir="test_trainer",
    eval_strategy="epoch",  # evaluate once per epoch (`evaluation_strategy` is the deprecated spelling)
    num_train_epochs=50,
    weight_decay=0.01,
    warmup_ratio=0.1,  # warm up over the first 10% of steps
    fp16=torch.cuda.is_available(),  # mixed precision saves memory but needs a GPU
    logging_steps=50,
    per_device_train_batch_size=16,
    learning_rate=2e-4
)



In [None]:
# Trainer with several named evaluation sets; each key becomes the prefix of
# that set's metric columns in the training log.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset={
        "val_small": tokenized_datasets["test_small"],
        "val_big": tokenized_datasets["test_big"],
        "val_800": tokenized_datasets["test_800"],
        "val_100": tokenized_datasets["test_100_val"],
    },
)

In [None]:
# Start fine-tuning; every evaluation set is scored at the end of each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Val Small Loss,Val Small Accuracy,Val Small F1,Val Small Precision,Val Small Recall,Val Big Loss,Val Big Accuracy,Val Big F1,Val Big Precision,Val Big Recall,Val 800 Loss,Val 800 Accuracy,Val 800 F1,Val 800 Precision,Val 800 Recall,Val 100 Loss,Val 100 Accuracy,Val 100 F1,Val 100 Precision,Val 100 Recall
1,1.0497,No log,0.98933,0.5,0.490476,0.605556,0.608586,1.022571,0.491031,0.474538,0.516849,0.487817,1.020585,0.448882,0.424793,0.555784,0.505617,1.035942,0.43,0.403501,0.494048,0.442678


Epoch,Training Loss,Validation Loss,Val Small Loss,Val Small Accuracy,Val Small F1,Val Small Precision,Val Small Recall,Val Big Loss,Val Big Accuracy,Val Big F1,Val Big Precision,Val Big Recall,Val 800 Loss,Val 800 Accuracy,Val 800 F1,Val 800 Precision,Val 800 Recall,Val 100 Loss,Val 100 Accuracy,Val 100 F1,Val 100 Precision,Val 100 Recall
1,1.0497,No log,0.98933,0.5,0.490476,0.605556,0.608586,1.022571,0.491031,0.474538,0.516849,0.487817,1.020585,0.448882,0.424793,0.555784,0.505617,1.035942,0.430000,0.403501,0.494048,0.442678
2,0.7294,No log,0.547624,0.705882,0.566912,0.597222,0.576263,0.657859,0.703410,0.702704,0.716296,0.703997,0.531115,0.803514,0.807661,0.814413,0.804560,0.562107,0.740000,0.625724,0.636450,0.641473
3,0.5999,No log,0.447119,0.735294,0.595195,0.614286,0.606566,0.611297,0.726003,0.720569,0.722139,0.722779,0.525126,0.785942,0.787480,0.797405,0.782512,0.470868,0.810000,0.726505,0.756137,0.726205
4,0.5473,No log,0.587324,0.705882,0.596888,0.659259,0.657071,0.592153,0.742161,0.740530,0.740640,0.740500,0.524697,0.798722,0.794597,0.830620,0.781216,0.537268,0.760000,0.675397,0.740391,0.693172
5,0.4798,No log,0.4592,0.764706,0.663031,0.699074,0.717677,0.631299,0.732713,0.726808,0.729850,0.729371,0.484475,0.814696,0.815746,0.825341,0.809370,0.557399,0.750000,0.692689,0.712454,0.704201
6,0.4427,No log,0.54908,0.823529,0.703734,0.720238,0.75101,0.728326,0.722854,0.719238,0.734472,0.720448,0.563378,0.806709,0.803864,0.832524,0.792815,0.641735,0.790000,0.698858,0.814889,0.700619
7,0.3756,No log,0.720715,0.735294,0.631579,0.674603,0.687374,0.679191,0.740791,0.736803,0.737507,0.738145,0.462425,0.840256,0.843226,0.852532,0.837130,0.732509,0.750000,0.692121,0.730000,0.711310
8,0.2906,No log,0.842543,0.764706,0.615789,0.640212,0.623232,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log,No Log


KeyboardInterrupt: 