In [1]:
!pip install torch --upgrade
!pip uninstall transformers accelerate -y
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets
!pip install WordCloud

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import zipfile
import os

# Load the feedback data, keeping only the columns this notebook works with
df = pd.read_csv('feedback_final.csv')[['text', 'rating', 'mood']]
# A mood value of 22 is folded into class 2 (presumably a data-entry typo — verify
# against the data source), then the numeric codes are replaced by readable names.
df['mood'] = (
    df['mood']
    .replace(22, 2)
    .replace({0: 'negative', 1: 'neutral', 2: 'positive'})
)
df.head()


from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import torch
from datasets import Dataset
from scipy.special import softmax

# Name of the pretrained checkpoint; the matching tokenizer is loaded from it
MODEL_NAME = "avichr/heBERT_sentiment_analysis"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Split into train (70%), validation (20%) and test (10%):
# hold out 30% first, then send about a third of the hold-out to test (0.333 * 0.3 ≈ 0.1)
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.3)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.333)

# Wrap each split as a Hugging Face Dataset
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_df, val_df, test_df)
)

# Tokenization function: truncates to a maximum length and leaves padding to the collator
def tokenize_function(examples, max_length=128):
    """Tokenize the ``'text'`` entry of a batch of examples.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw strings
            (a whole batch when called through ``Dataset.map(..., batched=True)``).
        max_length: Maximum number of tokens kept per example; longer inputs
            are truncated. Defaults to 128.

    Returns:
        The tokenizer's output for the batch.
    """
    # No padding here: the Trainer in this notebook is given a DataCollatorWithPadding,
    # which pads each batch only to its own longest sequence. Padding every example
    # to max_length up front would make all batches full-width for no benefit.
    return tokenizer(examples['text'], truncation=True, max_length=max_length)

# Tokenize every split in batches (train, then validation, then test)
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Convert the string sentiment labels into integer class ids
label_mapping = {"positive": 2, "neutral": 1, "negative": 0}

def encode_labels(examples):
    """Add an integer ``'label'`` entry derived from the ``'mood'`` entry.

    Args:
        examples: Mapping whose ``'mood'`` entry is an iterable of label names
            that are keys of ``label_mapping``.

    Returns:
        The same ``examples`` object, with ``'label'`` set to the list of class ids.

    Raises:
        KeyError: If a mood value is not a key of ``label_mapping``.
    """
    class_ids = []
    for mood_name in examples['mood']:
        class_ids.append(label_mapping[mood_name])
    examples['label'] = class_ids
    return examples

# Add the integer label column to each split, then drop the raw 'text' and 'mood'
# columns, which are not needed for training
train_dataset, val_dataset, test_dataset = (
    split.map(encode_labels, batched=True).remove_columns(["text", "mood"])
    for split in (train_dataset, val_dataset, test_dataset)
)


import accelerate
import transformers

# Report the installed library versions so the run can be reproduced
for _caption, _library in (
    ("Accelerate version:", accelerate),
    ("Transformers version:", transformers),
):
    print(_caption, _library.__version__)


from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint with a 3-class classification head.
# id2label / label2id are taken from label_mapping so that the model config (and any
# model saved from it) names the classes the way this notebook encodes them, instead
# of keeping whatever label names the checkpoint's own config carries.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    id2label={class_id: mood for mood, class_id in label_mapping.items()},
    label2id=dict(label_mapping),
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # use eval_strategy rather than evaluation_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Data collator that pads each batch using the tokenizer
data_collator = DataCollatorWithPadding(tokenizer)

# Metric function passed to the Trainer
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision / recall / F1 for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. The predicted
            class is the argmax over the last axis of ``logits``.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 yields the same value (0) as the default "warn" setting when a
    # class has no predicted or true samples, but without emitting a warning each time.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Build the Trainer from the model, training arguments, data splits, tokenizer,
# collator and metric function
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Evaluate the fine-tuned model on the held-out test split
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(test_results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/6073 [00:00<?, ? examples/s]

Map:   0%|          | 0/1736 [00:00<?, ? examples/s]

Map:   0%|          | 0/867 [00:00<?, ? examples/s]

Map:   0%|          | 0/6073 [00:00<?, ? examples/s]

Map:   0%|          | 0/1736 [00:00<?, ? examples/s]

Map:   0%|          | 0/867 [00:00<?, ? examples/s]

Accelerate version: 0.32.1
Transformers version: 4.42.4


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.47326,0.831797,0.804877,0.838612,0.831797
2,0.512800,0.423407,0.853111,0.843163,0.838761,0.853111
3,0.334300,0.463017,0.853111,0.844717,0.839817,0.853111


{'eval_loss': 0.424294650554657, 'eval_accuracy': 0.8650519031141869, 'eval_f1': 0.8548043658241353, 'eval_precision': 0.8486523982170475, 'eval_recall': 0.8650519031141869, 'eval_runtime': 6.1125, 'eval_samples_per_second': 141.84, 'eval_steps_per_second': 8.998, 'epoch': 3.0}


In [2]:
# Show the test-set metrics dictionary as this cell's output
test_results

{'eval_loss': 0.424294650554657,
 'eval_accuracy': 0.8650519031141869,
 'eval_f1': 0.8548043658241353,
 'eval_precision': 0.8486523982170475,
 'eval_recall': 0.8650519031141869,
 'eval_runtime': 6.1125,
 'eval_samples_per_second': 141.84,
 'eval_steps_per_second': 8.998,
 'epoch': 3.0}