In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import gensim
import nltk
from nltk.tokenize import word_tokenize
from modules.preprocess import *
from modules.utils import build_dataset, text_to_word2vec, evaluate
from modules.rnn_model import TextRNN
import gensim.downloader as api
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from config import *

In [5]:
# Fetch NLTK's 'punkt' tokenizer data into the local nltk_data directory.
# Presumably required by the word_tokenize-based helpers used later — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
# Build the article collection from the "lapresse_crawler" source.
# Later cells iterate it with `.values()`, so it behaves like a mapping of articles.
# NOTE(review): the exact meaning of num_samples / rnd_state is defined in
# modules.utils.build_dataset and is not visible here — confirm before tuning them.
dataset = build_dataset(path="lapresse_crawler", num_samples=20000, rnd_state=10)

  .str.replace("\n\d\n", "")


In [7]:
# Text-cleaning switches handed to text_edit (flag semantics live in
# modules.preprocess; the names suggest number grouping, newline/punctuation
# removal, lower-casing, HTML handling and expansion are on, while stop-word
# removal, lemmatisation and entity conversion are off — confirm there).
text_edit_options = dict(
    grp_num=True,
    rm_newline=True,
    rm_punctuation=True,
    rm_stop_words=False,
    lowercase=True,
    lemmatize=False,
    html_=True,
    convert_entities=False,
    expand=True,
)

# This rebinds `dataset` to the cleaned result, so re-running this cell feeds
# already-cleaned data back into text_edit.
dataset = text_edit(dataset, **text_edit_options)

100%|██████████| 12377/12377 [00:03<00:00, 3585.65it/s]


In [8]:
# Keep only the articles whose `section_1` value is listed in INCLUDED_SECTIONS,
# then split them into parallel lists: raw text (X) and section label (Y).
articles_in_scope = [
    article
    for article in dataset.values()
    if article['section_1'] in INCLUDED_SECTIONS
]
X = [article['text'] for article in articles_in_scope]
Y = [article['section_label'] for article in articles_in_scope]

In [9]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# partition reproducible. No `stratify` argument is passed.
split_parts = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, Y_train, Y_test = split_parts

In [29]:
# Load pretrained word vectors through gensim's downloader API.
# NOTE(review): the model name suggests vectors trained on English wiki/news
# text, while the corpus path ("lapresse_crawler") and the sample sentence used
# in this notebook look French — confirm vocabulary coverage is acceptable.
model_name = 'fasttext-wiki-news-subwords-300'  
word2vec_model = api.load(model_name)

In [38]:
# Embed one sample sentence; the resulting vector is only used afterwards to
# read off the embedding width (`vector.shape[0]`) when sizing the network.
text = "Ceci est un texte exemple"
vector = text_to_word2vec(text, word2vec_model)

In [39]:
# Network dimensions passed to TextRNN.
# Input width: length of the sample embedding computed from the probe sentence.
input_size = vector.shape[0]  
# Hidden size handed to TextRNN (hand-picked constant).
hidden_size = 256
# One output per distinct label value found in the training split.
# Computed while Y_train is still the list returned by train_test_split; a
# later cell rebinds Y_train to a tensor.
# NOTE(review): this matches CrossEntropyLoss targets only if labels are the
# contiguous integers 0..K-1 and every class occurs in Y_train — confirm.
output_size = len(set(Y_train))  

In [40]:
# Loss for multi-class classification over the section labels.
criterion = nn.CrossEntropyLoss()

# Fresh, untrained classifier sized from the values computed above.
model = TextRNN(input_size, hidden_size, output_size)

# Adam over every model parameter with a 1e-3 learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [41]:
def _embed_texts(texts):
    """Embed each text with `word2vec_model` and stack the results.

    Every text is converted by `text_to_word2vec`, wrapped as a float32 tensor
    reshaped to (1, -1), and the per-text tensors are stacked along a new
    leading dimension.
    """
    embedded_rows = []
    for raw_text in texts:
        embedding = torch.tensor(
            text_to_word2vec(raw_text, word2vec_model), dtype=torch.float32
        )
        embedded_rows.append(embedding.view(1, -1))
    return torch.stack(embedded_rows, dim=0)


# These rebind X_train / X_test from raw texts to tensors, so this cell is
# meant to run once, right after the train/test split.
X_train = _embed_texts(X_train)
X_test = _embed_texts(X_test)

In [42]:
# Convert both label collections to int64 tensors (dtype=torch.long), which is
# the target dtype used with the CrossEntropyLoss criterion in this notebook.
Y_train, Y_test = (
    torch.tensor(labels, dtype=torch.long) for labels in (Y_train, Y_test)
)

In [43]:
# ---------------------------------------------------------------------------
# Train the classifier, checkpointing the weights with the lowest held-out loss
# to 'rnn_best.pt'.
#
# NOTE(review): the checkpoint is selected on the *test* split, and the same
# split is used for the final metrics at the end of the notebook, so those
# metrics are optimistic. A separate validation split would fix this.
# ---------------------------------------------------------------------------
batch_size = 8
epochs = 150

# The training TensorDataset gets its own name so the article mapping bound to
# `dataset` by the earlier cells is not overwritten (re-running those cells
# after this one keeps working).
train_dataset = TensorDataset(X_train, Y_train)
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(X_test, Y_test)
# Evaluation does not benefit from shuffling; a fixed order keeps the per-batch
# mean loss deterministic for a given model state.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

best_test_loss = float('inf')

for epoch in range(epochs):
    train_losses = []
    test_losses = []

    # --- optimisation pass -------------------------------------------------
    model.train()  # once per epoch is enough; the mode does not change per batch
    # Dedicated loop names: the original reused `X` / `Y`, which overwrote the
    # notebook-level text and label lists with the last mini-batch.
    for batch_inputs, batch_labels in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        # .item() stores a plain Python float (works on any device, keeps no
        # tensor alive).
        train_losses.append(loss.item())

    # --- evaluation pass ---------------------------------------------------
    model.eval()
    with torch.no_grad():  # no autograd graph is needed when only measuring loss
        for batch_inputs, batch_labels in test_dataloader:
            outputs = model(batch_inputs)
            loss = criterion(outputs, batch_labels)
            test_losses.append(loss.item())

    mean_train_loss = float(np.mean(train_losses))
    mean_test_loss = float(np.mean(test_losses))
    print(f'Results for epoch {epoch}:')
    print(f'Mean train loss for epoch: {mean_train_loss}')
    print(f'Mean test loss for epoch: {mean_test_loss}')

    # Keep only the best-scoring weights on disk.
    if mean_test_loss < best_test_loss:
        best_test_loss = mean_test_loss
        torch.save(model.state_dict(), 'rnn_best.pt') 
        print(f'Model saved at epoch {epoch} with test loss {mean_test_loss}')

Results for epoch 0:
Mean train loss for epoch: 1.4543715715408325
Mean test loss for epoch: 1.354439616203308
Model saved at epoch 0 with test loss 1.354439616203308
Results for epoch 1:
Mean train loss for epoch: 1.2950513362884521
Mean test loss for epoch: 1.2250109910964966
Model saved at epoch 1 with test loss 1.2250109910964966
Results for epoch 2:
Mean train loss for epoch: 1.2150484323501587
Mean test loss for epoch: 1.178482174873352
Model saved at epoch 2 with test loss 1.178482174873352
Results for epoch 3:
Mean train loss for epoch: 1.1744625568389893
Mean test loss for epoch: 1.1677756309509277
Model saved at epoch 3 with test loss 1.1677756309509277
Results for epoch 4:
Mean train loss for epoch: 1.1323598623275757
Mean test loss for epoch: 1.113921046257019
Model saved at epoch 4 with test loss 1.113921046257019
Results for epoch 5:
Mean train loss for epoch: 1.1090340614318848
Mean test loss for epoch: 1.1303430795669556
Results for epoch 6:
Mean train loss for epoch: 1

KeyboardInterrupt: 

In [44]:
# Prefer the GPU when one is visible, otherwise stay on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Rebuild the architecture with the same dimensions used for training and move
# it to the chosen device.
model = TextRNN(input_size, hidden_size, output_size)
model = model.to(device)

# Restore the checkpoint written by the training loop; map_location remaps the
# stored tensors onto `device` while loading.
state_dict = torch.load('rnn_best.pt', map_location=device)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [45]:
# Predict one class index per test sample with the reloaded checkpoint.
model.eval()
pred_outputs = []
with torch.no_grad():  # inference only: skip autograd bookkeeping
    for tensor_ in X_test:
        # The model was moved to `device` when the checkpoint was reloaded, so
        # each input has to be moved there too; feeding CPU tensors to a CUDA
        # model raises a device-mismatch error.
        output = model(tensor_.view(1, 1, -1).to(device))
        # argmax with no dim works on the flattened output, matching the
        # original np.argmax call, without going through NumPy.
        pred_class = output.argmax().item()
        pred_outputs.append(int(pred_class))

In [46]:
# Compare the true test labels with the predicted class indices; the helper
# from modules.utils prints precision, recall, F1 and accuracy.
# NOTE(review): the checkpoint being scored was selected using this same test
# split, so these figures are likely optimistic.
evaluate(Y_test.numpy(), np.array(pred_outputs))

Precision:  0.8783577228409979
Recall:  0.8780290791599353
F1_score:  0.8778465764731235
accuracy:  0.8780290791599353
