In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import gensim
import nltk
from modules.preprocess import *
from modules.utils import *
import gensim.downloader as api
from nltk.tokenize import word_tokenize
import numpy as np

[nltk_data] Downloading package stopwords to /home/xavier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from sklearn.model_selection import train_test_split
from matplotlib import pyplot
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
import torch.optim as optim
import pickle

In [3]:
import gensim.downloader as api
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
# Fetch the Punkt tokenizer data used by word_tokenize; the log below shows it is
# skipped when the package is already up to date.
# NOTE(review): recent NLTK releases look up the 'punkt_tab' resource instead —
# confirm which one the installed version needs.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/xavier/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Build the raw article collection with the project helper; arguments are passed
# through unchanged (rnd_state is presumably the sampling seed — confirm in build_dataset).
dataset = build_dataset(
    path="lapresse_crawler",
    num_samples=10000,
    rnd_state=10,
)

In [7]:
# Text-cleaning switches, forwarded verbatim as keyword arguments to the project's
# text_edit helper.
# NOTE(review): this cell replaces `dataset` with its own output, so running it a
# second time applies the edit to already-edited text — rebuild `dataset` first.
text_edit_options = dict(
    grp_num=True,
    rm_newline=True,
    rm_punctuation=True,
    rm_stop_words=False,
    lowercase=True,
    lemmatize=False,
    html_=True,
    convert_entities=False,
    expand=True,
)
dataset = text_edit(dataset, **text_edit_options)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:03<00:00, 2876.73it/s]


In [8]:
# Keep only the articles whose top-level section is one of the three classes being
# modelled, then split them into parallel text / label lists (same order in both).
kept_sections = ['actualites', 'sports', 'international']
selected_articles = [
    article for article in dataset.values()
    if article['section_1'] in kept_sections
]
X = [article['text'] for article in selected_articles]
Y = [article['section_label'] for article in selected_articles]

In [9]:
# Hold out 20% of the selected articles for testing; random_state fixes the shuffle
# so the split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Name of the pre-trained embedding set fetched through gensim's downloader API.
# NOTE(review): the name suggests English wiki-news fastText vectors, while the
# tokenizer below defaults to French — confirm that vocabulary coverage is acceptable.
model_name = 'fasttext-wiki-news-subwords-300'  
# Loads the vectors via gensim.downloader; can be slow on first use.
word2vec_model = api.load(model_name)

def preprocess_text(text, language='french'):
    """Lower-case ``text`` and tokenize it with NLTK's ``word_tokenize``.

    Args:
        text: Raw string to tokenize.
        language: Language name forwarded to ``word_tokenize``; defaults to ``'french'``.

    Returns:
        The token sequence produced by ``word_tokenize`` for the lower-cased text.
    """
    lowered = text.lower()
    tokens = word_tokenize(lowered, language=language)
    return tokens

def text_to_word2vec(text, model):
    """Embed ``text`` as the average of its in-vocabulary word vectors.

    Args:
        text: Raw string; tokenized with ``preprocess_text``.
        model: Word-vector lookup supporting ``word in model``, ``model[word]``
            and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of all tokens found in ``model``,
        or a zero vector of length ``model.vector_size`` when no token is found.
    """
    known_vectors = []
    for token in preprocess_text(text):
        if token in model:
            known_vectors.append(model[token])

    # No token is covered by the vocabulary: fall back to an all-zero embedding.
    if not known_vectors:
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)

In [None]:
class TextRNN(nn.Module):
    """``nn.RNN`` layer followed by a linear classification head.

    ``forward`` returns raw class scores (logits). ``nn.CrossEntropyLoss`` applies
    log-softmax internally, so feeding it already-normalised probabilities squashes
    the gradients; use ``predict_proba`` when probabilities are needed.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        output_size: Number of target classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        # Parameter-free, so keeping it does not affect saved state_dicts.
        # Only used by predict_proba, never before the loss.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class logits for a batch.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)`` (``batch_first=True``).

        Returns:
            Tensor of shape ``(batch, output_size)`` holding unnormalised scores.
        """
        out, _ = self.rnn(x)
        # Classify from the hidden state of the last time step.
        return self.fc(out[:, -1, :])

    def predict_proba(self, x):
        """Return class probabilities (softmax over the logits) for a batch."""
        return self.softmax(self.forward(x))

In [None]:
# Embedding width comes straight from the loaded word-vector model. The previous
# `vector.shape[0]` relied on a variable that no cell defines, so it failed on a
# fresh kernel.
input_size = word2vec_model.vector_size
hidden_size = 128
# One output unit per distinct label. int() keeps the count right even if Y_train
# has already been converted to a tensor (0-d tensors hash by identity, so a plain
# set() would count every sample as a separate class).
# NOTE(review): assumes labels are the integers 0..output_size-1, as
# CrossEntropyLoss requires — confirm against `section_label`.
output_size = len({int(label) for label in Y_train})

In [None]:
# Seed PyTorch's global RNG before anything stochastic happens, so weight
# initialisation and DataLoader shuffling repeat from run to run.
torch.manual_seed(42)

model = TextRNN(input_size, hidden_size, output_size)
# CrossEntropyLoss applies log-softmax itself and expects raw logits from the model.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def _embed_texts(texts):
    """Turn an iterable of raw strings into a float32 tensor of shape (N, 1, D).

    Each text becomes one averaged word vector (via ``text_to_word2vec``), stored
    as a length-1 sequence so it can be fed to a ``batch_first`` RNN.
    """
    rows = [
        torch.tensor(text_to_word2vec(text, word2vec_model), dtype=torch.float32).view(1, -1)
        for text in texts
    ]
    return torch.stack(rows, dim=0)


# The cell overwrites its own inputs. The guards make a second run a no-op instead
# of crashing when the tokenizer is handed rows of an already-embedded tensor.
if not isinstance(X_train, torch.Tensor):
    X_train = _embed_texts(X_train)
if not isinstance(X_test, torch.Tensor):
    X_test = _embed_texts(X_test)

In [None]:
# Convert labels to int64 tensors, as required for CrossEntropyLoss targets.
# Going through int() lets the cell be re-run safely: it accepts the original label
# lists as well as tensors produced by an earlier run, without the copy-construct
# warning that torch.tensor(tensor) emits.
Y_train = torch.tensor([int(label) for label in Y_train], dtype=torch.long)
Y_test = torch.tensor([int(label) for label in Y_test], dtype=torch.long)

In [None]:
batch_size = 8

# Named train_dataset so the article dictionary bound to `dataset` by the
# data-loading cells is not clobbered.
train_dataset = TensorDataset(X_train, Y_train)
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(X_test, Y_test)
# No shuffling for evaluation: a fixed batch composition keeps the mean of
# per-batch losses comparable from one epoch to the next.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

best_test_loss = float('inf')
epochs = 150

# NOTE(review): the checkpoint is selected on the same split that is later used for
# the final evaluation, which makes the reported score optimistic. Consider carving
# out a separate validation split.
for epoch in range(epochs):
    train_losses = []
    test_losses = []

    # --- training pass ---
    model.train()
    # batch_x / batch_y avoid overwriting the global X / Y lists.
    for batch_x, batch_y in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        # .item() stores a plain float instead of a tensor.
        train_losses.append(loss.item())

    # --- evaluation pass (no gradients needed) ---
    model.eval()
    with torch.no_grad():
        for batch_x, batch_y in test_dataloader:
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            test_losses.append(loss.item())

    mean_train_loss = float(np.mean(train_losses))
    mean_test_loss = float(np.mean(test_losses))
    print(f'Results for epoch {epoch}:')
    print(f'Mean train loss for epoch: {mean_train_loss}')
    print(f'Mean test loss for epoch: {mean_test_loss}')

    # Keep the weights of the epoch with the lowest mean test loss so far.
    if mean_test_loss < best_test_loss:
        best_test_loss = mean_test_loss
        torch.save(model.state_dict(), 'rnn_best.pt')
        print(f'Model saved at epoch {epoch} with test loss {mean_test_loss}')

In [None]:
# Pick the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Rebuild the architecture, then restore the weights saved as 'rnn_best.pt' by the
# training loop.
model = TextRNN(input_size, hidden_size, output_size)
model = model.to(device)
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
state_dict = torch.load('rnn_best.pt', map_location=device)
model.load_state_dict(state_dict)

In [None]:
model.eval()
# Run inference on whatever device the model's weights live on. Feeding CPU tensors
# to a CUDA model raises a device-mismatch error.
inference_device = next(model.parameters()).device
# Same per-row layout as before (one length-1 sequence per sample), but evaluated
# as a single batch and without building an autograd graph.
test_inputs = X_test.reshape(X_test.shape[0], 1, -1).to(inference_device)
with torch.no_grad():
    scores = model(test_inputs)
# argmax is taken in torch (works for CUDA tensors too); tolist() yields plain ints.
pred_outputs = scores.argmax(dim=1).cpu().tolist()

In [None]:
# Hand ground-truth and predicted labels, both as NumPy arrays, to the project's
# evaluate helper.
true_labels = Y_test.numpy()
predicted_labels = np.array(pred_outputs)
evaluate(true_labels, predicted_labels)