<a href="https://colab.research.google.com/github/Kaban17/nlp/blob/main/Poem_Classification_(NLP).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Нейросеть, которая по стихотворному тексту определяет его жанр


In [None]:
# Open the Colab file picker and upload the dataset archive from the local
# machine (presumably archive.zip, which the next cell unzips).
from google.colab import files
data = files.upload()
# https://www.kaggle.com/datasets/ramjasmaurya/poem-classification-nlp
# dataset used for this network

In [None]:
!unzip archive.zip

Archive:  archive.zip
replace Poem_classification - test_data.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: Poem_classification - test_data.csv  
  inflating: Poem_classification - train_data.csv  


In [None]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [None]:

# Load the train and test splits with identical parsing options.
# NOTE(review): skiprows=2 discards the first two physical lines of each file;
# if the CSV has a single header line this also drops the first record — confirm
# against the raw files.
csv_options = dict(skiprows=2, header=None, names=['Genre', 'Poem'])
data_train = pd.read_csv('Poem_classification - train_data.csv', **csv_options)
data_test = pd.read_csv('Poem_classification - test_data.csv', **csv_options)


In [None]:
# Pull the genre and poem columns out of each frame as separate arrays
genres_train, poems_train = (data_train[column].values for column in ('Genre', 'Poem'))
genres_test, poems_test = (data_test[column].values for column in ('Genre', 'Poem'))

# Quick look at a slice of the labels and the first two raw poems
print("Genres:", genres_train[10:21])
print("Poems:", poems_train[0:2])

Genres: ['Music' 'Music' 'Music' 'Music' 'Music' 'Music' 'Music' 'Music' 'Music'
 'Music' 'Music']
Poems: ['\xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 In the thick brushthey spend the hottest part of the day,\xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 soaking their hoovesin the trickle of mountain water\xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 the ravine hoardson behalf of the oleander.\xa0 \xa0 \xa0 \xa0 \xa0 \xa0'
 '\xa0 \xa0Storms are generous.\xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 Something so easy to surrender to, sitting by the window,\xa0and then you step out into the garden you were so bored of,\xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0']


In [None]:
# Text preprocessing
def preprocess_text(text):
    """Normalize a raw poem into lower-case words separated by single spaces.

    Steps, in order: lower-case the text, delete digit runs, delete every
    character that is neither a word character nor whitespace, then collapse
    whitespace runs (including non-breaking spaces, which `\\s` matches for
    str patterns) into one space and trim both ends.

    Whitespace is normalized last on purpose: deleting digits and punctuation
    can itself leave doubled or leading/trailing spaces behind.

    Args:
        text: the poem as a str.

    Returns:
        The cleaned str (possibly empty).
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace only after the removals above so their gaps are cleaned up too.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Some 'Poem' cells are not strings (presumably NaN for empty cells), and
# preprocess_text would raise on them, so those rows are dropped.
# The genre arrays are rebuilt from the SAME surviving rows: filtering only the
# poems would shift every label after a dropped row onto the wrong poem.
def _text_rows(frame):
    """Return the rows of `frame` whose 'Poem' cell is a str."""
    is_text = frame['Poem'].map(lambda poem: isinstance(poem, str)).astype(bool)
    return frame[is_text]

rows_train = _text_rows(data_train)
rows_test = _text_rows(data_test)

poems_train = [preprocess_text(poem) for poem in rows_train['Poem']]
poems_test = [preprocess_text(poem) for poem in rows_test['Poem']]
genres_train = rows_train['Genre'].values
genres_test = rows_test['Genre'].values

In [None]:
# Number of training poems left after the string filter
n_train_poems = len(poems_train)
print(n_train_poems)

837


In [None]:
# Tokenise the poems: the vocabulary is fitted on the training poems only and
# then applied to both splits.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(poems_train)
sequences_train, sequences_test = (
    tokenizer.texts_to_sequences(poems) for poems in (poems_train, poems_test)
)

# Peek at the first few encoded poems of each split
print("First 5 tokenized train sequences:", sequences_train[:5])
print("First 5 tokenized test sequences:", sequences_test[:5])

First 5 tokenized train sequences: [[7, 1, 655, 2961, 1650, 1, 1651, 497, 4, 1, 70, 2962, 49, 2963, 1, 900, 4, 306, 106, 1, 1652, 2964, 2965, 4, 1, 2966], [758, 24, 1653, 175, 41, 498, 5, 2967, 5, 1654, 31, 1, 256, 3, 82, 10, 499, 46, 42, 1, 257, 10, 51, 41, 1141, 4], [69, 2968, 2969, 104, 10, 500, 143, 1, 1655, 237, 104, 10, 288, 2970, 7, 40, 191, 74, 10, 353, 5, 2971, 2972, 901, 2973, 54, 1, 128, 74, 10, 759, 1, 1656, 4, 59, 760, 761, 166, 31, 60, 74], [14, 2974, 2975, 23, 1, 2976, 198, 307, 39, 383, 2977, 2978, 238, 5, 500, 2979, 4, 2980, 15, 22, 2981, 137, 23, 73, 18, 2982, 4, 239, 1, 2983, 44, 1657, 2984, 30, 289, 8], [14, 1658, 2985, 2986, 1659, 107, 8, 1, 1660, 128, 15, 2, 1661, 1142, 30, 8, 1, 308, 2, 258, 93, 67, 7, 2987, 1, 121, 902, 7, 99, 3, 2988, 575, 4, 1662, 2989]]
First 5 tokenized test sequences: [[141, 6, 56, 2, 363, 1, 4, 9, 8514, 386, 1, 701, 4, 7, 9, 1518, 1, 538, 4, 2879, 1, 387, 379, 2038, 2, 39, 27, 2360, 918, 7, 25, 6, 374, 1, 670, 360, 7, 1, 24], [141, 373, 19

In [None]:
# Pad every sequence (at the end, 'post') to the length of the longest poem
# found in either split.
maxlen = max(max(map(len, sequences_train)), max(map(len, sequences_test)))
pad_options = dict(maxlen=maxlen, padding='post')
padded_sequences_train = pad_sequences(sequences_train, **pad_options)
padded_sequences_test = pad_sequences(sequences_test, **pad_options)

In [None]:
# Map genre names to integer class ids; the mapping is learned from the
# training genres and reused unchanged for the test genres.
label_encoder = LabelEncoder()
label_encoder.fit(genres_train)
labels_train = label_encoder.transform(genres_train)
labels_test = label_encoder.transform(genres_test)

In [None]:
# Dataset wrapper used by the DataLoaders
class TextDataset(Dataset):
    """Pairs encoded texts with their labels, both stored as int64 tensors."""

    @staticmethod
    def _as_long(values):
        """Copy `values` into a new torch.long tensor."""
        return torch.tensor(values, dtype=torch.long)

    def __init__(self, texts, labels):
        """Store `texts` and `labels` as long tensors."""
        self.texts = self._as_long(texts)
        self.labels = self._as_long(labels)

    def __len__(self):
        """Number of samples, taken from the first dimension of `texts`."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the (text, label) pair stored at `idx`."""
        sequence = self.texts[idx]
        target = self.labels[idx]
        return sequence, target

# Wrap both splits and batch them; only the training batches are shuffled.
batch_size = 32
train_dataset = TextDataset(texts=padded_sequences_train, labels=labels_train)
test_dataset = TextDataset(texts=padded_sequences_test, labels=labels_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Define the network
class RNNModel(nn.Module):
    """LSTM text classifier: embedding -> LSTM -> linear layer.

    The class logits are computed from the LSTM output at the last
    *non-padding* token of each sequence. Reading the final time step
    unconditionally would, for end-padded batches, hand the classifier a state
    produced after a long run of padding tokens instead of the text itself.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding size.
        hidden_dim: LSTM hidden size.
        output_dim: number of classes (size of the logits).
        padding_idx: token id used for padding (default 0, the value Keras
            `pad_sequences` inserts by default). Its embedding stays at zero
            and it is ignored when locating the last real token.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, padding_idx=0):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.rnn = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits of shape (batch, output_dim) for token ids `x` of shape (batch, seq_len)."""
        # Index of the last non-padding position in every row. Works for both
        # end-padded and start-padded rows; a row made only of padding falls back to step 0.
        steps = torch.arange(x.size(1), device=x.device)
        last_step = ((x != self.padding_idx) * steps).max(dim=1).values

        embedded = self.embedding(x)
        outputs, _ = self.rnn(embedded)

        # Pick, per row, the LSTM output at its own last real token.
        rows = torch.arange(x.size(0), device=x.device)
        last_output = outputs[rows, last_step]
        return self.fc(last_output)

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [None]:
# Size the network from the data: one embedding row per tokenizer word plus one
# extra index (presumably for the 0 used as padding — TODO confirm), and one
# output per distinct training label.
vocab_size = len(tokenizer.word_index) + 1
num_classes = len(set(labels_train))
model = RNNModel(
    vocab_size=vocab_size,
    embed_dim=100,
    hidden_dim=128,
    output_dim=num_classes,
)
# Author's note: training is very fast on a GPU
model.to(device)

RNNModel(
  (embedding): Embedding(9379, 100)
  (rnn): LSTM(100, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=4, bias=True)
)

In [None]:
# Loss: cross-entropy over the class logits. Optimiser: Adam over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Train the network.
# Author's note: the dataset is small and overfitting already shows up at 15 epochs.
epochs = 10
for ep_id in range(epochs):
    model.train()
    running_loss = 0.0
    # The stray ':.2f' that used to follow the braces was printed literally in the bar label.
    bar = tqdm(train_loader, desc=f'ep_id {ep_id+1}/{epochs}')
    for batch_idx, (texts, labels) in enumerate(bar, start=1):
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Mean over the batches processed so far; dividing by the total number
        # of batches would under-report the loss until the epoch finishes.
        bar.set_postfix(loss=running_loss / batch_idx)

    # Mean loss over the whole epoch
    print(f'Ep {ep_id+1}/{epochs}, Loss: {running_loss/len(train_loader):.2f}')

ep_id 1/10:.2f: 100%|██████████| 27/27 [00:00<00:00, 161.47it/s, loss=1.38]


Ep 1/10, Loss: 1.38


ep_id 2/10:.2f: 100%|██████████| 27/27 [00:00<00:00, 166.50it/s, loss=1.38]


Ep 2/10, Loss: 1.38


ep_id 3/10:.2f: 100%|██████████| 27/27 [00:00<00:00, 152.70it/s, loss=1.37]


Ep 3/10, Loss: 1.37


ep_id 4/10:.2f: 100%|██████████| 27/27 [00:00<00:00, 141.86it/s, loss=1.37]


Ep 4/10, Loss: 1.37


ep_id 5/10:.2f: 100%|██████████| 27/27 [00:00<00:00, 147.16it/s, loss=1.37]


Ep 5/10, Loss: 1.37


ep_id 6/10:.2f: 100%|██████████| 27/27 [00:00<00:00, 122.42it/s, loss=1.37]


Ep 6/10, Loss: 1.37


ep_id 7/10:.2f: 100%|██████████| 27/27 [00:00<00:00, 136.53it/s, loss=1.37]


Ep 7/10, Loss: 1.37


ep_id 8/10:.2f: 100%|██████████| 27/27 [00:00<00:00, 121.48it/s, loss=1.37]


Ep 8/10, Loss: 1.37


ep_id 9/10:.2f: 100%|██████████| 27/27 [00:00<00:00, 115.99it/s, loss=1.36]


Ep 9/10, Loss: 1.36


ep_id 10/10:.2f: 100%|██████████| 27/27 [00:00<00:00, 111.57it/s, loss=1.37]

Ep 10/10, Loss: 1.37





In [None]:
# Measure classification accuracy on the test set (no gradients needed)
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for texts, labels in test_loader:
        texts = texts.to(device)
        labels = labels.to(device)
        outputs = model(texts)
        # Index of the largest logit along dim 1 is the predicted class
        predicted = torch.max(outputs.data, 1)[1]
        total += labels.shape[0]
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 7.38%
