In [5]:
import pandas as pd
import os


df = pd.read_csv("lenta-ru-news-extended.csv")

text_column = df.iloc[:, 2]  
topic_column = df.iloc[:, 3]  

print("Пример текста с темой:")
print(f"Тема: {topic_column[3]}")
print(f"Текст: {text_column[2][:300]}...") 
print("\n" + "-" * 50 + "\n")

total_samples = len(df)
print(f"Общее количество текстов: {total_samples}")

miss_topic = topic_column.isnull().sum()
if miss_topic == 0:
    print("Все тексты имеют тему.")
else:
    print(f"Количество текстов без темы: {miss_topic}")

unique_topics = topic_column.unique()
num_topic = len(unique_topics)
print(f"\nКоличество уникальных тем: {num_topic}")
print("Список уникальных тем:")
print(unique_topics)

class_distribution = topic_column.value_counts()
print("\nРаспределение по классам:")
print(class_distribution)

class_distribution.to_csv("class_distribution.csv", index=True, header=["count"], encoding="utf-8")

Пример текста с темой:
Тема: Библиотека
Текст: Штабс-капитан П. Н. Нестеров на днях, увидев в районе Желтиева, в Галиции, летящий над нашим расположением австрийский аэроплан, собиравшийся бросить бомбы, взлетел на воздух, атаковал неприятеля и протаранил неприятельский аппарат, предотвратив жертвы в наших войсках. Сам Нестеров при этом погиб см...

--------------------------------------------------

Общее количество текстов: 803938
Количество текстов без темы: 62002

Количество уникальных тем: 35
Список уникальных тем:
['Библиотека' 'Россия' 'Мир' 'Экономика' 'Интернет и СМИ' 'Спорт'
 'Культура' 'Из жизни' 'Силовые структуры' 'Наука и техника' 'Бывший СССР'
 nan 'Дом' 'Сочи' 'ЧМ-2014' 'Путешествия' 'Ценности' 'Легпром' 'Бизнес'
 'МедНовости' 'Оружие' '69-я параллель' 'Культпросвет ' 'Крым' 'topic'
 'tech' 'politics' 'business' 'entertainment' 'sport' 'cricket' 'tennis'
 'football' 'rugby' 'athletics']

Распределение по классам:
topic
Россия               160445
Мир                  13

In [6]:
import pandas as pd

df = pd.read_csv("lenta-ru-news-extended.csv")
min_samples_per_class = 5000

df = df.dropna(subset=[df.columns[2], df.columns[3]])
class_counts = df.iloc[:, 3].value_counts()
valid_classes = class_counts[class_counts >= min_samples_per_class].index
df = df[df.iloc[:, 3].isin(valid_classes)]
df = df[df.iloc[:, 3] != 'nan']
df = df[df.iloc[:, 3].astype(str) != 'nan']

text_column = df.iloc[:, 2]
topic_column = df.iloc[:, 3]

df.iloc[:, 3] = df.iloc[:, 3].astype(str).str.strip()
df = df.dropna(subset=[df.columns[3]])

class_counts = df.iloc[:, 3].value_counts()
valid_classes = class_counts[class_counts >= min_samples_per_class].index
df = df[df.iloc[:, 3].isin(valid_classes)]

text_column = df.iloc[:, 2]
topic_column = df.iloc[:, 3]

print("Пример текста с темой:")
print(f"Тема: {topic_column.iloc[0]}")
print(f"Текст: {text_column.iloc[0][:300]}...") 
print("\n" + "-" * 50 + "\n")

total_samples = len(df)
print(f"Общее количество текстов: {total_samples}")

miss_topics = topic_column.isnull().sum()
print(f"Количество текстов без темы: {miss_topics}")

unique_topic = topic_column.unique()
num_unique_topic = len(unique_topic)
print(f"\nКоличество уникальных тем: {num_unique_topic}")
print("Список уникальных тем:")
print(unique_topic)

class_distribution = topic_column.value_counts()
print("\nРаспределение по классам:")
print(class_distribution)

class_distribution.to_csv("class_distribution_cleaned.csv", index=True, header=["count"], encoding="utf-8")

Пример текста с темой:
Тема: Россия
Текст: Как стало известно агентству Ассошиэйтед Пресс, экипаж последней экспедиции на станцию "Мир" считает ее способной выйти из-под контроля.Командир Виктор Афанасьев сказал: "Мы чувствуем себя хорошо, но грустим, что оставили станцию летать в беспилотном режиме." Gazeta Ru подробно писала о том, что фин...

--------------------------------------------------

Общее количество текстов: 736508
Количество текстов без темы: 0

Количество уникальных тем: 14
Список уникальных тем:
['Россия' 'Мир' 'Экономика' 'Интернет и СМИ' 'Спорт' 'Культура' 'Из жизни'
 'Силовые структуры' 'Наука и техника' 'Бывший СССР' 'Дом' 'Путешествия'
 'Ценности' 'Бизнес']

Распределение по классам:
topic
Россия               160442
Мир                  136620
Экономика             79528
Спорт                 64413
Культура              53796
Бывший СССР           53402
Наука и техника       53136
Интернет и СМИ        44663
Из жизни              27605
Дом                   21734

In [7]:
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm



texts = df.iloc[:, 2].astype(str).tolist()
topic = df.iloc[:, 3].astype(str).tolist()

tokenized_texts = [text.lower().split() for text in texts]

vocab_counter = Counter()
for tokens in tokenized_texts:
    vocab_counter.update(tokens)

vocab = ['<PAD>', '<UNK>'] + [word for word, count in vocab_counter.most_common(20000)]

word2idx = {word: idx for idx, word in enumerate(vocab)}
UNK_idx = word2idx['<UNK>']
PAD_idx = word2idx['<PAD>']

max_len = 128

def encode_text(tokens):
    return [word2idx.get(word, UNK_idx) for word in tokens[:max_len]]

sequences = [encode_text(tokens) for tokens in tqdm(tokenized_texts)]

def pad_seq(seq, maxlen):
    if len(seq) > maxlen:
        return seq[:maxlen]
    else:
        return seq + [PAD_idx] * (maxlen - len(seq))

padded_seq = np.array([pad_seq(seq, max_len) for seq in sequences])

label_encoder = LabelEncoder()
encoded_topics = label_encoder.fit_transform(topic)
num_classes = len(label_encoder.classes_)

100%|██████████| 736508/736508 [00:17<00:00, 42090.84it/s]


In [8]:
X_train, X_temp, y_train, y_temp = train_test_split(padded_seq, encoded_topics, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.texts[idx], dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long)
        )

batch_size = 64

train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)
test_dataset = TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [8]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=128, num_classes=14, num_layers=1, bidirectional=False):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=PAD_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, batch_first=True)
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), num_classes)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, (h_n, _) = self.lstm(x)
        out = self.fc(h_n[-1])
        return out

model = LSTMClassifier(len(vocab), num_classes=num_classes)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def evaluate(loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for texts, labels in loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total

num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for texts, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        texts, labels = texts.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    val_acc = evaluate(val_loader)
    print(f"Epoch {epoch+1}, Loss: {running_loss:.4f}, Val Acc: {val_acc:.4f}")

Epoch 1/5: 100%|██████████| 8056/8056 [00:41<00:00, 193.29it/s]


Epoch 1, Loss: 7684.9247, Val Acc: 0.8019


Epoch 2/5: 100%|██████████| 8056/8056 [00:41<00:00, 196.34it/s]


Epoch 2, Loss: 4239.6994, Val Acc: 0.8168


Epoch 3/5: 100%|██████████| 8056/8056 [00:41<00:00, 195.80it/s]


Epoch 3, Loss: 3579.5629, Val Acc: 0.8209


Epoch 4/5: 100%|██████████| 8056/8056 [00:40<00:00, 197.54it/s]


Epoch 4, Loss: 3074.1761, Val Acc: 0.8204


Epoch 5/5: 100%|██████████| 8056/8056 [00:42<00:00, 191.50it/s]


Epoch 5, Loss: 2605.0533, Val Acc: 0.8170


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import accuracy_score

class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim=300, hidden_dim=256, num_classes=14, num_layers=2, dropout=0.5):
        super(BiLSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            bidirectional=True, batch_first=True, dropout=dropout)
        self.batch_norm = nn.BatchNorm1d(hidden_dim * 2)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        x = self.embedding(x)  
        lstm_out, (h_n, _) = self.lstm(x) 
        out = lstm_out[:, -1, :] 
        out = self.batch_norm(out)
        out = self.dropout(out)
        logits = self.fc(out)
        return logits
    


In [None]:
import pickle
import numpy as np

np.save('classes.npy', label_encoder.classes_)
with open('word2idx.pkl', 'wb') as f:
    pickle.dump(word2idx, f)

In [None]:
import torch
from torch.utils.data import DataLoader, SubsetRandomSampler
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import copy


def train_model(model, dataset, val_loader, batch_size=64, epochs=5, patience=3, verbose=True):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=1)

    best_val_acc = 0
    best_model_weights = None
    no_improvement = 0

    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    dataset_size = len(dataset)

    for epoch in range(epochs):
        indices = torch.randperm(dataset_size)
        sampler = SubsetRandomSampler(indices)

        train_loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)

        model.train()
        total_loss = 0
        all_preds, all_labels = [], []

        for texts, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            texts, labels = texts.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        train_acc = accuracy_score(all_labels, all_preds)
        avg_loss = total_loss / len(train_loader)

        model.eval()
        val_loss, val_preds, val_true = 0, [], []
        with torch.no_grad():
            for texts, labels in val_loader:
                texts, labels = texts.to(device), labels.to(device)
                outputs = model(texts)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                preds = torch.argmax(outputs, dim=1)
                val_preds.extend(preds.cpu().numpy())
                val_true.extend(labels.cpu().numpy())

        val_acc = accuracy_score(val_true, val_preds)
        val_loss = val_loss / len(val_loader)

        scheduler.step(val_loss)

        history['train_loss'].append(avg_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        if verbose:
            print(f"Epoch {epoch+1}: "
                  f"Train Loss: {avg_loss:.4f}, Train Acc: {train_acc:.4f} | "
                  f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model_weights = copy.deepcopy(model.state_dict())
            torch.save(best_model_weights, f"best_bilstm_epoch{epoch+1}_acc{val_acc:.4f}.pth")
            no_improvement = 0
        else:
            no_improvement += 1

        if no_improvement >= patience:
            print("stopping triggered.")
            break

    print(f"Best val accuracy: {best_val_acc:.4f}")
    return history


model = BiLSTMClassifier(vocab_size=len(vocab), num_classes=num_classes)

In [59]:
history = train_model(
    model=model,
    dataset=train_dataset,
    val_loader=val_loader,
    batch_size=64,
    epochs=5,
    patience=5
)

Epoch 1/5:   0%|          | 0/8056 [00:00<?, ?it/s]

Epoch 1/5: 100%|██████████| 8056/8056 [03:59<00:00, 33.65it/s]


Epoch 1: Train Loss: 0.4659, Train Acc: 0.8405 | Val Loss: 0.4840, Val Acc: 0.8300


Epoch 2/5: 100%|██████████| 8056/8056 [03:59<00:00, 33.61it/s]


Epoch 2: Train Loss: 0.4308, Train Acc: 0.8509 | Val Loss: 0.4727, Val Acc: 0.8368


Epoch 3/5: 100%|██████████| 8056/8056 [03:50<00:00, 34.97it/s]


Epoch 3: Train Loss: 0.4012, Train Acc: 0.8600 | Val Loss: 0.4822, Val Acc: 0.8356


Epoch 4/5: 100%|██████████| 8056/8056 [03:53<00:00, 34.48it/s]


Epoch 4: Train Loss: 0.3727, Train Acc: 0.8700 | Val Loss: 0.4831, Val Acc: 0.8349


Epoch 5/5: 100%|██████████| 8056/8056 [04:02<00:00, 33.20it/s]


Epoch 5: Train Loss: 0.2354, Train Acc: 0.9179 | Val Loss: 0.5421, Val Acc: 0.8391
Best validation accuracy: 0.8391


In [9]:
import torch
import pickle
import numpy as np
import re

class BiLSTMClassifier(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim=300, hidden_dim=256, num_classes=14, num_layers=2, dropout=0.5):
        super(BiLSTMClassifier, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                                 bidirectional=True, batch_first=True, dropout=dropout)
        self.batch_norm = torch.nn.BatchNorm1d(hidden_dim * 2)
        self.dropout = torch.nn.Dropout(dropout)
        self.fc = torch.nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = lstm_out[:, -1, :]
        out = self.batch_norm(out)
        out = self.dropout(out)
        logits = self.fc(out)
        return logits


def preprocess(text):
    text = text.lower()
    tokens = re.sub(r"[^а-яА-ЯёЁa-zA-Z0-9 ]", " ", text).split()
    return tokens


with open("word2idx.pkl", "rb") as f:
    word2idx = pickle.load(f)

label_encoder = np.load("classes.npy", allow_pickle=True)


def predict_topic(model, text, word2idx, label_encoder, max_len=128):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()

    tokens = preprocess(text)
    sequence = [word2idx.get(word, word2idx['<UNK>']) for word in tokens]
    sequence = sequence[:max_len] + [word2idx['<PAD>']] * max(0, max_len - len(tokens))

    input_tensor = torch.tensor([sequence], dtype=torch.long).to(device)

    with torch.no_grad():
        output = model(input_tensor)
        predicted_idx = torch.argmax(output, dim=1).item()

    return label_encoder[predicted_idx]


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BiLSTMClassifier(vocab_size=len(word2idx), num_classes=len(label_encoder))
model.load_state_dict(torch.load("best_bilstm_epoch5_acc0.8391.pth", map_location=device))
model.to(device)
model.eval()


if __name__ == "__main__":
    print("Существующие темы:")
    print(unique_topic)
    print("\nМодель загружена. Введенный текст:")
    user_input = "дома кипел чайник. он стоял рядом с креслом, которое досталось ему от бабушки - тоже прошло войну. через пару лет ему вручали нобелевскую премию"
    predicted_topic = predict_topic(model, user_input, word2idx, label_encoder)
    print(user_input)
    print(f"Предсказанная тема: {predicted_topic}\n")

Существующие темы:
['Россия' 'Мир' 'Экономика' 'Интернет и СМИ' 'Спорт' 'Культура' 'Из жизни'
 'Силовые структуры' 'Наука и техника' 'Бывший СССР' 'Дом' 'Путешествия'
 'Ценности' 'Бизнес']

Модель загружена. Введенный текст:
дома кипел чайник. он стоял рядом с креслом, которое досталось ему от бабушки - тоже прошло войну. через пару лет ему вручали нобелевскую премию
Предсказанная тема: Из жизни

