In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import nltk
import re

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows', 1000)

# Скачиваем стоп-слова
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('russian'))

def preprocess(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    return tokens

# Загрузка БОЛЬШОГО датасета!
df = pd.read_csv('data/ruemotext_sample1.csv', encoding='utf-8')
df = df.dropna(subset=['text', 'emotion'])
df = df[df['emotion'].isin(['радость', 'грусть', 'злость', 'удивление'])]
print(df['emotion'].value_counts())

df['tokens'] = df['text'].apply(preprocess)

label2id = {e: i for i, e in enumerate(df['emotion'].unique())}
id2label = {i: e for e, i in label2id.items()}
df['label'] = df['emotion'].map(label2id)

# Загрузка FastText-эмбеддингов (cc.ru.300.vec)
def load_fasttext(path):
    word2vec = {}
    with open(path, 'r', encoding='utf-8') as f:
        next(f)
        for line in f:
            parts = line.rstrip().split(' ')
            word = parts[0]
            vec = np.array(list(map(float, parts[1:])))
            word2vec[word] = vec
    return word2vec

print("Загружаем FastText эмбеддинги...")
word2vec = load_fasttext('embeddings/cc.ru.300.vec')
EMB_DIM = 300

vocab = set([tok for toks in df['tokens'] for tok in toks])
word2idx = {w: i+2 for i, w in enumerate(vocab)}
word2idx['<PAD>'] = 0
word2idx['<UNK>'] = 1

def tokens2ids(tokens, max_len=20):
    ids = [word2idx.get(tok, word2idx['<UNK>']) for tok in tokens]
    if len(ids) < max_len:
        ids += [word2idx['<PAD>']] * (max_len - len(ids))
    else:
        ids = ids[:max_len]
    return ids

df['input_ids'] = df['tokens'].apply(tokens2ids)

# Embedding-матрица
embedding_matrix = np.zeros((len(word2idx), EMB_DIM))
for w, idx in word2idx.items():
    vec = word2vec.get(w)
    if vec is not None:
        embedding_matrix[idx] = vec
    else:
        embedding_matrix[idx] = np.random.normal(0, 1, EMB_DIM)

X = np.stack(df['input_ids'].values)
y = df['label'].values

# Разделяем на train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)

# Превращаем в тензоры
X_train = torch.tensor(X_train, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

# Создаём DataLoader'ы
BATCH_SIZE = 256
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# LSTM-классификатор
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes, embedding_matrix):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.embedding.weight.data.copy_(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, n_classes)
    def forward(self, x):
        x = self.embedding(x)
        _, (h_n, _) = self.lstm(x)
        h_n = h_n.squeeze(0)
        out = self.fc(h_n)
        return out

model = LSTMClassifier(len(word2idx), EMB_DIM, 128, len(label2id), embedding_matrix)
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

# Обучение с DataLoader
EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        out = model(X_batch)
        loss = loss_fn(out, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: Loss {total_loss / len(train_loader):.4f}")

# Оценка на тесте
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        out = model(X_batch)
        preds = out.argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(y_batch.cpu().numpy())

print('Точность:', accuracy_score(all_labels, all_preds))
print(classification_report(all_labels, all_preds, target_names=[id2label[i] for i in range(len(id2label))]))

# Пример: предсказание для нового текста
def predict(text, max_len=20):
    tokens = preprocess(text)
    ids = tokens2ids(tokens, max_len)
    tensor = torch.tensor([ids], dtype=torch.long)
    with torch.no_grad():
        pred = model(tensor)
        label_id = pred.argmax(dim=1).item()
        return id2label[label_id]

print("Пример: ", predict("Это лучшая новость за сегодня!"))


[nltk_data] Error loading stopwords: <urlopen error [WinError 10060]
[nltk_data]     Попытка установить соединение была безуспешной, т.к.
[nltk_data]     от другого компьютера за требуемое время не получен
[nltk_data]     нужный отклик, или было разорвано уже установленное
[nltk_data]     соединение из-за неверного отклика уже подключенного
[nltk_data]     компьютера>


emotion
грусть       18430
злость       17305
радость      17099
удивление    15260
Name: count, dtype: int64
Загружаем FastText эмбеддинги...
Epoch 1: Loss 1.3855
Epoch 2: Loss 0.5404
Epoch 3: Loss 0.0152
Epoch 4: Loss 0.0008
Epoch 5: Loss 0.0004
Epoch 6: Loss 0.0003
Epoch 7: Loss 0.0002
Epoch 8: Loss 0.0001
Epoch 9: Loss 0.0001
Epoch 10: Loss 0.0001
Точность: 1.0
              precision    recall  f1-score   support

     радость       1.00      1.00      1.00      4275
      грусть       1.00      1.00      1.00      4608
      злость       1.00      1.00      1.00      4326
   удивление       1.00      1.00      1.00      3815

    accuracy                           1.00     17024
   macro avg       1.00      1.00      1.00     17024
weighted avg       1.00      1.00      1.00     17024

Пример:  грусть
