In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, f1_score, hamming_loss
import pandas as pd
import numpy as np
import json
import time

# Data loading
# The first physical line of the file is skipped and the two columns are given
# fixed names instead of whatever that line contained.
df = pd.read_csv(
    'emotions_normalized_dataset.csv',
    header=None,
    names=['text', 'emotions'],
    skiprows=1,
)

def normalize_emotions(emotion_str):
    """Parse one raw ``emotions`` cell into a Python list of emotion names.

    The cell is expected to hold a list literal, written either as JSON
    (``["a", "b"]``) or as a Python repr (``['a', 'b']``), optionally wrapped
    in an extra pair of double quotes and/or surrounding whitespace.

    Args:
        emotion_str: The raw cell value (a string, ``None`` or NaN).

    Returns:
        The parsed value (normally a list of strings). Missing, blank or
        quote-only cells yield an empty list.

    Raises:
        json.JSONDecodeError: If the cell cannot be parsed by any strategy.
    """
    import ast  # local import: only needed for the Python-literal fallback

    if pd.isna(emotion_str):
        return []

    # Trim whitespace *before* removing the wrapping quotes, otherwise padding
    # around the cell keeps the quotes from being stripped.
    cleaned = emotion_str.strip().strip('"').strip()
    if not cleaned:
        return []

    # 1) Valid JSON as-is. Trying this first keeps apostrophes inside
    #    double-quoted items intact (e.g. ["it's"]).
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass

    # 2) Python-style literal with single-quoted items.
    try:
        parsed = ast.literal_eval(cleaned)
    except (ValueError, SyntaxError):
        # 3) Legacy behaviour: blindly swap quote characters and parse as JSON
        #    (covers mixes such as ['a', null]); raises if this fails too.
        return json.loads(cleaned.replace("'", '"'))

    return list(parsed) if isinstance(parsed, tuple) else parsed

df['emotions'] = df['emotions'].apply(normalize_emotions)

# The FIRST emotion listed for a sample is used as its primary emotion; samples
# without any emotion fall back to 'нейтрал'. Everything after the first entry
# is a secondary emotion. No frequency-based selection is performed here.

# Reference list of emotion names. It is not consulted by the code below: the
# actual class sets are learned from the data by the two binarizers.
primary_emotions = ['қайғы', 'қуаныш', 'уайым', 'мақтаныш', 'нейтрал', 'таңқалу']
df['primary_emotion'] = df['emotions'].apply(lambda x: [x[0]] if x else ['нейтрал'])
df['secondary_emotions'] = df['emotions'].apply(lambda x: x[1:] if len(x) > 1 else [])

# One indicator column per class: exactly one 1 per row for the primary
# labels, any number of 1s (including none) for the secondary labels.
mlb_primary = MultiLabelBinarizer()
mlb_secondary = MultiLabelBinarizer()
df_primary = pd.DataFrame(mlb_primary.fit_transform(df['primary_emotion']), columns=mlb_primary.classes_)
df_secondary = pd.DataFrame(mlb_secondary.fit_transform(df['secondary_emotions']), columns=mlb_secondary.classes_)
# NOTE(review): an emotion that occurs both as primary and as secondary yields
# two identically named columns in the combined frame below.
df = pd.concat([df['text'], df_primary, df_secondary], axis=1)

# Split the texts and BOTH label matrices in a single call so that all three
# are guaranteed to be partitioned with the same row permutation. (Two separate
# calls only stay aligned as long as their seed and input length are kept
# identical by hand.)
(
    train_texts, val_texts,
    train_labels_primary, val_labels_primary,
    train_labels_secondary, val_labels_secondary,
) = train_test_split(
    df['text'], df_primary.values, df_secondary.values,
    test_size=0.2, random_state=42
)
assert len(train_texts) == len(train_labels_primary) == len(train_labels_secondary)
assert len(val_texts) == len(val_labels_primary) == len(val_labels_secondary)

# Dataset class
class EmotionDataset(Dataset):
    """Dataset yielding a tokenized text together with both of its label vectors.

    ``texts`` is read positionally through ``.iloc`` (so a pandas Series is
    expected), while the two label containers are indexed with plain ``[idx]``.
    Every item is a dict with the keys ``input_ids``, ``attention_mask``,
    ``labels_primary`` and ``labels_secondary``.
    """

    def __init__(self, texts, labels_primary, labels_secondary, tokenizer, max_len):
        self.texts = texts
        self.labels_primary, self.labels_secondary = labels_primary, labels_secondary
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from the text container."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Build the model inputs and float32 label tensors for sample ``idx``."""
        sample_text = self.texts.iloc[idx]
        primary_vec = torch.tensor(self.labels_primary[idx], dtype=torch.float32)
        secondary_vec = torch.tensor(self.labels_secondary[idx], dtype=torch.float32)

        tokenized = self.tokenizer(
            sample_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer output carries a leading batch axis of size 1; drop it
        # so that the DataLoader can stack samples itself.
        item = {key: tokenized[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        item['labels_primary'] = primary_vec
        item['labels_secondary'] = secondary_vec
        return item

# tokenizer

# Settings shared by the tokenizer, both datasets/loaders and both classifiers.
PRETRAINED_NAME = 'bert-base-multilingual-cased'
MAX_LEN = 128
BATCH_SIZE = 16
LEARNING_RATE = 2e-5

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)

# Train and validation datasets are built the same way from their own split.
train_dataset, val_dataset = (
    EmotionDataset(split_texts, split_primary, split_secondary, tokenizer, max_len=MAX_LEN)
    for split_texts, split_primary, split_secondary in (
        (train_texts, train_labels_primary, train_labels_secondary),
        (val_texts, val_labels_primary, val_labels_secondary),
    )
)

# Only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


# One classifier per label set, loaded in the order primary -> secondary; the
# output width of each follows the classes found by the matching binarizer.
model_primary, model_secondary = (
    BertForSequenceClassification.from_pretrained(PRETRAINED_NAME, num_labels=len(binarizer.classes_))
    for binarizer in (mlb_primary, mlb_secondary)
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_primary.to(device)
model_secondary.to(device)

# NOTE(review): newer transformers releases reportedly deprecate their bundled
# AdamW in favour of torch.optim.AdamW - confirm against the installed version.
optimizer_primary = AdamW(model_primary.parameters(), lr=LEARNING_RATE)
optimizer_secondary = AdamW(model_secondary.parameters(), lr=LEARNING_RATE)

# Model training
def train_model(model, train_loader, val_loader, optimizer, device, epochs=10, is_primary=True):
    """Fine-tune ``model`` and print a per-epoch train/validation report.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        train_loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask``, ``labels_primary``, ``labels_secondary``).
        val_loader: Validation batches, forwarded to ``evaluate_model``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.
        is_primary: Train on ``labels_primary`` when true, otherwise on
            ``labels_secondary``.

    Returns:
        None. Progress is reported through ``print``.

    Note:
        The reported accuracy is element-wise: every (sample, label) cell
        counts separately after thresholding ``sigmoid(logits)`` at 0.5. On
        one-hot targets with C classes, predicting no label at all therefore
        already scores (C - 1) / C.
    """
    label_key = 'labels_primary' if is_primary else 'labels_secondary'

    for epoch in range(epochs):
        start_time = time.time()
        model.train()  # evaluate_model() switches the model to eval mode each epoch
        train_loss, train_correct, total_train = 0, 0, 0

        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
            labels = batch[label_key].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            # Metric bookkeeping only - no gradient tracking needed.
            with torch.no_grad():
                predictions = (torch.sigmoid(outputs.logits) > 0.5).float()
                train_correct += torch.sum(predictions == labels).item()
            total_train += labels.numel()

        train_acc = train_correct / total_train
        val_loss, val_acc = evaluate_model(model, val_loader, device, is_primary)
        epoch_time = time.time() - start_time
        # Adjacent f-string literals are joined by the parser; a single
        # "..."-quoted f-string must not span several physical lines.
        print(
            f"Эпоха {epoch + 1}/{epochs}:\n"
            f"  Train Loss: {train_loss / len(train_loader):.4f}, Val Loss: {val_loss:.4f}\n"
            f"  Train Accuracy: {train_acc:.4f}, Val Accuracy: {val_acc:.4f}\n"
            f"  Эпоха уақыты: {epoch_time:.2f} сек"
        )

# Evaluation function
def evaluate_model(model, val_loader, device, is_primary=True):
    """Score ``model`` on ``val_loader`` without tracking gradients.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        val_loader: Sized iterable of batch dicts.
        device: Device the batch tensors are moved to.
        is_primary: Use ``labels_primary`` when true, else ``labels_secondary``.

    Returns:
        ``(mean_loss, accuracy)``: the loss averaged over batches, and the
        share of individual label cells where ``sigmoid(logit) > 0.5`` agrees
        with the target (element-wise, not per-sample, accuracy).
    """
    label_key = 'labels_primary' if is_primary else 'labels_secondary'
    loss_sum, n_hits, n_cells = 0, 0, 0

    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch[label_key].to(device)

            result = model(ids, attention_mask=mask, labels=targets)
            loss_sum += result.loss.item()

            hard_preds = (torch.sigmoid(result.logits) > 0.5).float()
            n_hits += torch.sum(hard_preds == targets).item()
            n_cells += targets.numel()

    mean_loss = loss_sum / len(val_loader)
    accuracy = n_hits / n_cells
    return mean_loss, accuracy

# Start training (10 epochs for each classifier)
train_model(model_primary, train_loader, val_loader, optimizer_primary, device, epochs=10, is_primary=True)
train_model(model_secondary, train_loader, val_loader, optimizer_secondary, device, epochs=10, is_primary=False)

# Output predicted sentences
# Run both trained models on the first validation texts so that the values
# printed as "Predicted ..." are real model outputs; the gold labels are shown
# next to them for comparison.
n_preview = min(10, len(val_texts))  # the validation split may hold fewer than 10 rows
print("Predictions:")
if n_preview:
    preview_texts = val_texts.iloc[:n_preview].tolist()
    preview_enc = tokenizer(preview_texts, max_length=128, padding='max_length',
                            truncation=True, return_tensors='pt')
    preview_ids = preview_enc['input_ids'].to(device)
    preview_mask = preview_enc['attention_mask'].to(device)

    model_primary.eval()
    model_secondary.eval()
    with torch.no_grad():
        logits_primary = model_primary(preview_ids, attention_mask=preview_mask).logits
        logits_secondary = model_secondary(preview_ids, attention_mask=preview_mask).logits

    # Primary: every sample has exactly one primary label, so pick the
    # highest-scoring class and re-encode it as a one-hot indicator row.
    pred_primary = np.zeros(tuple(logits_primary.shape), dtype=int)
    pred_primary[np.arange(n_preview), logits_primary.argmax(dim=1).cpu().numpy()] = 1
    # Secondary: independent labels, thresholded the same way as in
    # train_model / evaluate_model.
    pred_secondary = (torch.sigmoid(logits_secondary) > 0.5).int().cpu().numpy()

    for i in range(n_preview):
        print(f"Text: {val_texts.iloc[i]}")
        print(f"Predicted primary emotion: {mlb_primary.inverse_transform(pred_primary[i:i + 1])}")
        print(f"Actual primary emotion: {mlb_primary.inverse_transform(np.array([val_labels_primary[i]]))}")
        print(f"Predicted secondary emotion: {mlb_secondary.inverse_transform(pred_secondary[i:i + 1])}")
        print(f"Actual secondary emotion: {mlb_secondary.inverse_transform(np.array([val_labels_secondary[i]]))}")
        print("-" * 50)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Эпоха 1/10:
  Train Loss: 0.5225, Val Loss: 0.4444
  Train Accuracy: 0.7894, Val Accuracy: 0.8333
  Время эпохи: 353.44 сек
Эпоха 2/10:
  Train Loss: 0.4374, Val Loss: 0.4170
  Train Accuracy: 0.8333, Val Accuracy: 0.8333
  Время эпохи: 302.57 сек
Эпоха 3/10:
  Train Loss: 0.4069, Val Loss: 0.3805
  Train Accuracy: 0.8376, Val Accuracy: 0.8529
  Время эпохи: 301.23 сек
Эпоха 4/10:
  Train Loss: 0.3813, Val Loss: 0.3788
  Train Accuracy: 0.8486, Val Accuracy: 0.8490
  Время эпохи: 313.45 сек
Эпоха 5/10:
  Train Loss: 0.3560, Val Loss: 0.3548
  Train Accuracy: 0.8594, Val Accuracy: 0.8490
  Время эпохи: 304.31 сек
Эпоха 6/10:
  Train Loss: 0.3123, Val Loss: 0.3944
  Train Accuracy: 0.8786, Val Accuracy: 0.8503
  Время эпохи: 301.03 сек
Эпоха 7/10:
  Train Loss: 0.2927, Val Loss: 0.3516
  Train Accuracy: 0.8870, Val Accuracy: 0.8490
  Время эпохи: 305.17 сек
Эпоха 8/10:
  Train Loss: 0.2575, Val Loss: 0.3681
  Train Accuracy: 0.9098, Val Accuracy: 0.8320
  Время эпохи: 300.86 сек
Эпоха 9/

In [3]:
# Saving the models
# Each artefact is written to a directory (relative to the current working
# directory) named after the variable that holds it.
for artefact, target_dir in (
    (model_primary, "model_primary"),
    (model_secondary, "model_secondary"),
    (tokenizer, "tokenizer"),
):
    artefact.save_pretrained(target_dir)

print("Модели успешно сохранены!")

Модели успешно сохранены!


In [7]:
# Imports are repeated here so this cell also works right after a kernel
# restart, without re-running the training cell.
import os
from pathlib import Path

from transformers import BertForSequenceClassification, BertTokenizer

# Base directory holding the "model_primary", "model_secondary" and "tokenizer"
# folders. It defaults to the current working directory, which is where the
# save cell writes them; set the EMOTION_MODEL_DIR environment variable to load
# from another location instead of hard-coding a machine-specific path.
ARTIFACT_DIR = Path(os.environ.get("EMOTION_MODEL_DIR", ".")).resolve()

# Fail early with a readable message instead of letting a missing folder be
# interpreted as a remote model identifier.
for _subdir in ("model_primary", "model_secondary", "tokenizer"):
    if not (ARTIFACT_DIR / _subdir).is_dir():
        raise FileNotFoundError(f"Expected directory not found: {ARTIFACT_DIR / _subdir}")

# Load both fine-tuned classifiers from their local folders.
model_primary = BertForSequenceClassification.from_pretrained(str(ARTIFACT_DIR / "model_primary"))
model_secondary = BertForSequenceClassification.from_pretrained(str(ARTIFACT_DIR / "model_secondary"))

# The tokenizer saved alongside the models (needed to encode new texts).
tokenizer = BertTokenizer.from_pretrained(str(ARTIFACT_DIR / "tokenizer"))

print("Модели успешно загружены!")

Модели успешно загружены!
