# Treinamento de Rede LSTM para classificação

## Libs

In [None]:
import re
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

import kagglehub

import gensim.downloader as gd

import torch
from torch import nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

tqdm.pandas()

## Funções

In [49]:
def preprocess_text_to_embedding(text: str, embedding_model, max_seq_length: int = 100) -> torch.Tensor:
    """Turn raw text into a sequence of word-embedding vectors.

    The text is stripped of HTML tags, every non-alphanumeric character is
    replaced by a space, and the result is lowercased and split on whitespace.
    Only the first ``max_seq_length`` tokens are kept; among those, tokens
    missing from ``embedding_model`` are dropped, so the returned sequence can
    be shorter than ``max_seq_length``.

    Args:
        text: Raw input text.
        embedding_model: Word-to-vector lookup supporting ``word in model`` and
            ``model[word]``. Its ``vector_size`` attribute, when present, is
            used to shape the result when no token is found.
        max_seq_length: Maximum number of tokens considered.

    Returns:
        A 2-D tensor with one row per token found in ``embedding_model``.
        When no token is found, an empty float32 tensor of shape ``(0, dim)``
        is returned, where ``dim`` is ``embedding_model.vector_size`` (0 if the
        model does not expose that attribute).
    """
    # Remove HTML tags and non-alphanumeric characters
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)

    # Convert to lowercase, split into words and truncate to max sequence length
    words = text.lower().split()[:max_seq_length]

    # Convert words to embeddings, ignoring words not in the embedding model
    vectors = [embedding_model[w] for w in words if w in embedding_model]

    if not vectors:
        # np.array([]) would produce a 1-D float64 tensor of shape (0,), which
        # does not match the 2-D layout of every non-empty result.
        dim = getattr(embedding_model, "vector_size", 0)
        return torch.empty((0, dim), dtype=torch.float32)

    return torch.tensor(np.array(vectors))

## Preparação do Dataset

In [50]:
# Fetch the BBC news archive through kagglehub and keep its location as a Path.
path = Path(kagglehub.dataset_download("hgultekin/bbcnewsarchive"))

print(f"Diretório principal: {path}")

# Show the direct children of the dataset directory.
print("- Arquivos e diretórios filhos:")
for child in path.iterdir():
    print(f"\t{child.name}")

Diretório principal: /home/miguel/.cache/kagglehub/datasets/hgultekin/bbcnewsarchive/versions/1
- Arquivos e diretórios filhos:
	bbc-news-data.csv


In [51]:
# The file is read with a tab separator even though it carries a .csv extension.
csv_file = path / 'bbc-news-data.csv'
data = pd.read_csv(csv_file, sep='\t')

# Peek at five random rows.
data.sample(5)

Unnamed: 0,category,filename,title,content
1121,politics,226.txt,Tories unveil quango blitz plans,Plans to abolish 162 quangos have been unveil...
767,entertainment,258.txt,No jail for singer Courtney Love,Singer Courtney Love has been spared jail for...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
1174,politics,279.txt,Amnesty chief laments war failure,The lack of public outrage about the war on t...
1467,sport,155.txt,Benitez joy as Reds take control,Liverpool boss Rafael Benitez was satisfied a...


In [52]:
# FOR TESTING ONLY: keep a 10% subsample of the rows.
# A fixed random_state selects the same rows on every fresh run of the notebook.
data = data.sample(frac=0.1, random_state=42)
data.head(5)

Unnamed: 0,category,filename,title,content
2036,tech,213.txt,Search sites get closer to users,Search sites want to get to know you better. ...
1096,politics,201.txt,Howard rejects BNP's claim,Tory leader Michael Howard has dismissed clai...
2196,tech,373.txt,Software watching while you work,Software that can not only monitor every keys...
240,business,241.txt,G7 backs Africa debt relief plan,G7 finance ministers have backed plans to wri...
364,business,365.txt,Nasdaq planning $100m-share sale,The owner of the technology-dominated Nasdaq ...


## Embeddings

In [53]:
# Load the pretrained "glove-twitter-50" vectors through gensim's downloader.
# The object is used below as a word -> vector lookup (membership test + indexing).
gensim_embedding_model = gd.load("glove-twitter-50")

In [54]:
# Pick one random title and run it through the preprocessing function as a sanity check.
random_row = data.sample(1)
text = random_row["title"].values[0]

embds = preprocess_text_to_embedding(text, gensim_embedding_model)

print(f"Texto: {text}")
print(f"\nEmbeddings shape: {embds.shape}")

Texto: McCririck out of Big Brother show

Embeddings shape: torch.Size([6, 50])


Pré-processando todas as linhas do dataset.

In [55]:
# Embed the body of every article; X ends up as a list with one tensor per row.
X = []
for article in data["content"]:
    article_embds = preprocess_text_to_embedding(
        text=article,
        embedding_model=gensim_embedding_model,
    )
    X.append(article_embds)

print(f"Total de amostras processadas: {len(X)}")

Total de amostras processadas: 222


## Labels

In [56]:
# Map each category name to an integer id and store the ids as an int64 tensor.
encoder = LabelEncoder()

label_ids = encoder.fit_transform(data["category"])
y = torch.tensor(label_ids.astype(np.int64))

y

tensor([4, 2, 4, 0, 0, 2, 0, 1, 2, 3, 0, 0, 2, 0, 0, 4, 3, 3, 1, 2, 4, 3, 0, 1,
        3, 4, 4, 1, 4, 1, 3, 1, 3, 4, 2, 2, 2, 3, 3, 4, 2, 4, 1, 3, 4, 3, 1, 1,
        4, 2, 0, 2, 2, 2, 3, 4, 4, 3, 4, 2, 3, 0, 4, 1, 2, 0, 0, 0, 4, 1, 2, 4,
        3, 1, 3, 1, 0, 3, 2, 2, 2, 4, 1, 2, 4, 3, 1, 3, 4, 4, 4, 3, 3, 1, 1, 3,
        4, 4, 2, 3, 4, 1, 3, 4, 4, 4, 2, 0, 4, 2, 3, 2, 1, 1, 3, 0, 3, 3, 3, 3,
        1, 1, 3, 3, 3, 0, 0, 4, 4, 1, 4, 2, 4, 1, 0, 3, 1, 1, 0, 2, 4, 3, 3, 4,
        3, 1, 1, 1, 3, 0, 4, 4, 1, 1, 4, 4, 3, 1, 1, 3, 2, 0, 3, 4, 0, 2, 2, 3,
        1, 1, 3, 1, 2, 1, 2, 4, 1, 3, 3, 2, 2, 1, 1, 2, 3, 4, 0, 1, 0, 2, 1, 4,
        0, 0, 1, 1, 3, 4, 3, 1, 2, 2, 2, 0, 2, 0, 0, 3, 4, 2, 3, 2, 4, 1, 1, 4,
        3, 1, 4, 3, 2, 1])

## Dataset e Dataloader

In [None]:
# TensorDataset only accepts tensors, but X is a Python list of 2-D tensors
# whose first dimension (number of tokens) varies per sample. Right-pad every
# sequence with zero vectors up to the longest one to get a single 3-D tensor.
X_padded = pad_sequence(X, batch_first=True)

# Split sample indices and slice both tensors with them so features and labels
# stay aligned; random_state makes the split reproducible.
sample_idx = np.arange(len(X))
train_idx, test_idx = train_test_split(sample_idx, test_size=0.2, random_state=42)
train_idx = torch.as_tensor(train_idx, dtype=torch.long)
test_idx = torch.as_tensor(test_idx, dtype=torch.long)

x_train, y_train = X_padded[train_idx], y[train_idx]
x_test, y_test = X_padded[test_idx], y[test_idx]

train_ds = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)

test_ds = TensorDataset(x_test, y_test)
test_loader = DataLoader(test_ds, batch_size=64)

print(f"Total de treino: {len(train_ds)}")
print(f"Total de teste: {len(test_ds)}")

## Model

In [None]:
class Model(nn.Module):
    """Single-layer unidirectional LSTM classifier over embedding sequences.

    The final hidden state of the LSTM goes through dropout and a linear layer
    that outputs one raw score (logit) per class.

    Args:
        num_classes: Number of output classes. Defaults to 5 because the label
            tensor built in this notebook holds class ids 0 through 4.
        embedding_dim: Size of each input embedding vector.
        hidden_size: Size of the LSTM hidden state.
        dropout: Dropout probability applied to the final hidden state.
    """

    def __init__(
        self,
        num_classes: int = 5,
        embedding_dim: int = 50,
        hidden_size: int = 32,
        dropout: float = 0.5,
    ):
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            bidirectional=False,
            batch_first=True,
        )

        self.dropout = nn.Dropout(p=dropout)

        # One logit per class: CrossEntropyLoss needs every target id to be a
        # valid index into this output.
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, sequence):
        """Return class logits for a batch of sequences.

        Args:
            sequence: Float tensor laid out as (batch, seq_len, embedding_dim).

        Returns:
            Tensor of shape (batch, num_classes) with unnormalized scores.
        """
        _, (hidden, _) = self.lstm(sequence)

        # hidden has a leading size-1 dimension (one layer, one direction); drop it.
        dropped = self.dropout(hidden.squeeze(0))

        prediction = self.fc(dropped)

        return prediction

# Seed PyTorch's global RNG so the initial weights are the same on every fresh run.
torch.manual_seed(42)
net = Model()

## Treinamento e avaliação

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=1e-3)

history = []
n_epochs = 20

# Repeat for a number of epochs.
for epoch in range(n_epochs):
    running_loss = 0.0

    # Training epoch: iterate over the batches of the training set.
    # The batch is unpacked directly so the `data` DataFrame is not overwritten.
    net.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the mean loss of the batch; weight it by the batch
        # size so that dividing by the dataset size below gives a true
        # per-sample average, even when the last batch is smaller.
        running_loss += loss.item() * labels.size(0)

    running_loss /= len(train_ds)

    # Evaluation on the test set.
    net.eval()
    test_loss = 0.0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            test_loss += loss.item() * labels.size(0)

    test_loss /= len(test_ds)

    history.append([running_loss, test_loss])

    if epoch % max(1, n_epochs // 20)  == 0:
        print(f'[{epoch + 1}] loss: {running_loss:.4f} test loss: {test_loss:.4f}')

print('Finished Training')

Evolução da **função de perda** nos conjuntos de treino e teste ao longo do treinamento. 

In [None]:
# Use a separate array so `history` keeps its list type and this cell can be
# re-run without changing what the training cell produced.
losses = np.asarray(history)

fig, ax = plt.subplots()
ax.plot(losses[:, 0], '-', color='orange', label='train')
ax.plot(losses[:, 1], '-', color='blue', label='test')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.set_title('Train vs. test loss')
ax.legend();

## Teste

In [None]:
all_labels = []
preds = []

net.eval()

with torch.no_grad():
    # Unpack each batch directly so the `data` DataFrame is not overwritten.
    for inputs, labels in test_loader:
        outputs = net(inputs)

        # Predicted class = index of the largest logit. Convert to plain Python
        # ints so scikit-learn receives ordinary integer lists, not 0-d tensors.
        cls = outputs.argmax(dim=-1)
        all_labels.extend(labels.tolist())
        preds.extend(cls.tolist())

# Report per-category metrics using the original category names; passing
# `labels` keeps the names aligned even if a class is absent from the test split.
class_ids = list(range(len(encoder.classes_)))
print(classification_report(all_labels, preds, labels=class_ids, target_names=encoder.classes_))