In [1]:
%pip -q install transformers datasets huggingface_hub evaluate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hNote: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer

In [3]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
token = user_secrets.get_secret("huggingface")

In [4]:
from huggingface_hub import login

login(token=token)

In [93]:
load_accuracy = evaluate.load("accuracy")
load_f1 = evaluate.load("f1")
load_precision = evaluate.load("precision")
def compute_metrics(eval_pred):
    print(eval_pred)
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    # f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    precision = load_precision.compute(predictions=predictions, references=labels, average='weighted')["precision"]
    return {"accuracy": accuracy, "precision": precision}

In [94]:
dataset = load_dataset("MonoHime/ru_sentiment_dataset").rename_column("sentiment", "label")
train = dataset["train"]
test = dataset["validation"]

In [95]:
train

Dataset({
    features: ['Unnamed: 0', 'text', 'label'],
    num_rows: 189891
})

In [96]:
model_name = "cointegrated/rubert-tiny"
out_name = "rubert-tiny-sentiment-analisys"

In [97]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [98]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3, device_map="cuda")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [99]:
tokenized_train = train.map(lambda x: tokenizer(x["text"], truncation=True), batched=True)
tokenized_test = test.map(lambda x: tokenizer(x["text"], truncation=True), batched=True)

In [100]:
tokenized_train

Dataset({
    features: ['Unnamed: 0', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 189891
})

In [101]:
# tokenized_train = tokenized_train.select([i for i in list(range(300))])
# tokenized_test = tokenized_test.select([i for i in list(range(300))])

In [102]:
tokenized_train

Dataset({
    features: ['Unnamed: 0', 'text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 300
})

In [103]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [104]:
train_args = TrainingArguments(
    output_dir=out_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    push_to_hub=True,
    disable_tqdm=False,
    logging_steps=1000,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [105]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=38, training_loss=1.032115333958676, metrics={'train_runtime': 5.7038, 'train_samples_per_second': 105.192, 'train_steps_per_second': 6.662, 'total_flos': 4039690933944.0, 'train_loss': 1.032115333958676, 'epoch': 2.0})

In [106]:
trainer.evaluate()

<transformers.trainer_utils.EvalPrediction object at 0x7e6b8e0546a0>


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.059209942817688,
 'eval_accuracy': 0.45666666666666667,
 'eval_precision': 0.46719239373601795,
 'eval_runtime': 1.7217,
 'eval_samples_per_second': 174.247,
 'eval_steps_per_second': 11.036,
 'epoch': 2.0}