# IMPORTS

In [1]:
import nltk
import spacy
from modules.utils import build_dataset, tune_logistic_regression, tune_svm, tune_mlp, evaluate
from modules.autoencoder import AUTOENCODER
from modules.classifier import CLASSIFIER
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import VotingClassifier

In [2]:
from sklearn.model_selection import train_test_split
from matplotlib import pyplot
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
import torch.optim as optim
import pickle

# PROJECT SPECIFIC IMPORTS

In [None]:
from modules.preprocess import *
from modules.utils import *

[nltk_data] Downloading package stopwords to /home/xavier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# LOAD DATASET

In [11]:
# Build the article collection from the crawler's JSON dump, requesting
# num_samples=5000 with a fixed rnd_state so the selection is repeatable.
# Later cells iterate `dataset.values()`, so the result is used as a mapping.
# NOTE(review): `build_dataset` lives in modules.utils; its exact sampling
# behaviour is not visible here — confirm before relying on the sample count.
dataset = build_dataset(path="lapresse_crawler/output.json", num_samples=5000, rnd_state=10)

# PREPROCESS DATA

In [10]:
# Run the project's `text_edit` preprocessing with every switch turned on.
# The result is bound back to `dataset`, so this cell is meant to run exactly
# once after the dataset has been (re)built.
text_edit_options = dict(
    grp_num=True,
    rm_newline=True,
    rm_punctuation=True,
    rm_stop_words=True,
    lowercase=True,
    lemmatize=True,
    html_=True,
    convert_entities=True,
    expand=True,
)
dataset = text_edit(dataset, **text_edit_options)

  1%|▉                                                                                                                                                                           | 22/4159 [00:03<09:39,  7.14it/s]


KeyboardInterrupt: 

In [21]:
# Keep only the articles whose top-level section is one of the six listed
# below, then split them into parallel lists of texts (X) and labels (Y).
kept_sections = ['actualites', 'international', 'sports', 'arts', 'affaires', 'debats']
kept_articles = [article for article in dataset.values() if article['section_1'] in kept_sections]

X = [article['text'] for article in kept_articles]
Y = [article['section_label'] for article in kept_articles]

# TRAIN/TEST SPLIT

In [7]:
# Hold out 20% of the samples for testing; the fixed seed makes the split
# repeatable across runs.
# A second split of the training part into train/validation is currently disabled:
#   X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# VECTORIZE

In [8]:
# TF-IDF features built with the project's spaCy tokenizer. Float values for
# min_df / max_df are document-frequency proportions, so terms found in fewer
# than 1% or more than 99% of the training documents are dropped.
vectorizer = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
    min_df=0.01,
    max_df=0.99,
)

# Fit the vocabulary and IDF weights on the training texts only, then reuse
# them unchanged for the held-out texts.
tfidf_train = vectorizer.fit_transform(X_train)
tfidf_test = vectorizer.transform(X_test)
# (validation features are disabled together with the validation split)
# tfidf_valid = vectorizer.transform(X_valid)



In [9]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using {device} device")

Using cpu device


# DEFINE AUTOENCODER MODEL, OPTIMIZER, LOSS FUNCTION

In [10]:
# Autoencoder placed on the selected device, trained with Adam against a
# mean-squared-error reconstruction loss.
auto = AUTOENCODER()
auto = auto.to(device)
optimizer = optim.Adam(params=auto.parameters(), lr=0.01)
loss_function = nn.MSELoss()

# TRAIN AUTOENCODER

In [11]:
# Train the autoencoder to reconstruct the TF-IDF vectors and keep, in
# 'model_best.pt', the weights of the epoch with the lowest mean held-out loss.
# NOTE(review): the checkpoint is selected on the test split, so the reported
# test loss is optimistic; a separate validation split would avoid that.

# Densify the sparse TF-IDF matrices and insert a singleton axis at dim 1.
tfidf_train_dense_tensor = torch.unsqueeze(torch.tensor(tfidf_train.toarray(), dtype=torch.float32), dim=1).to(device)
tfidf_test_dense_tensor = torch.unsqueeze(torch.tensor(tfidf_test.toarray(), dtype=torch.float32), dim=1).to(device)

batch_size = 8

# Dedicated names on purpose: rebinding `dataset` (the article mapping) or `X`
# (the raw texts) here would make the earlier cells impossible to re-run.
train_dataset = TensorDataset(tfidf_train_dense_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(tfidf_test_dense_tensor)
# No shuffling for evaluation: the batch-mean loss then depends only on the model.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

epochs = 10
best_test_loss = float('inf')

for epoch in range(epochs):
    train_losses = []
    test_losses = []

    # --- optimisation pass over the training split ---
    auto.train()
    for (batch_inputs,) in train_dataloader:
        batch_inputs = batch_inputs.to(device)
        optimizer.zero_grad()
        reconstruction = auto(batch_inputs)
        loss = loss_function(reconstruction, batch_inputs)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    # --- evaluation pass: no gradients are needed, so do not build a graph ---
    auto.eval()
    with torch.no_grad():
        for (batch_inputs,) in test_dataloader:
            batch_inputs = batch_inputs.to(device)
            reconstruction = auto(batch_inputs)
            loss = loss_function(reconstruction, batch_inputs)
            test_losses.append(loss.item())

    mean_test_loss = np.mean(test_losses)
    print(f'Results for epoch {epoch}:')
    print(f'Mean train loss for epoch: {np.mean(train_losses)}')
    print(f'Mean test loss for epoch: {mean_test_loss}')

    # Checkpoint only when the held-out loss improves.
    if mean_test_loss < best_test_loss:
        best_test_loss = mean_test_loss
        torch.save(auto.state_dict(), 'model_best.pt')  # Save the model
        print(f'Model saved at epoch {epoch} with test loss {mean_test_loss}')

Results for epoch 0:
Mean train loss for epoch: 0.8053089850827267
Mean test loss for epoch: 0.012556259706616401
Model saved at epoch 0 with test loss 0.012556259706616401
Results for epoch 1:
Mean train loss for epoch: 0.5094923675060272
Mean test loss for epoch: 0.013460292108356952
Results for epoch 2:
Mean train loss for epoch: 0.3008909209778434
Mean test loss for epoch: 0.007403052691370249
Model saved at epoch 2 with test loss 0.007403052691370249
Results for epoch 3:
Mean train loss for epoch: 0.16521850463591123
Mean test loss for epoch: 0.011559158749878406
Results for epoch 4:
Mean train loss for epoch: 0.0839757911468807
Mean test loss for epoch: 0.02278509959578514
Results for epoch 5:
Mean train loss for epoch: 0.0392814210959171
Mean test loss for epoch: 0.020283612981438638
Results for epoch 6:
Mean train loss for epoch: 0.01681358631896345
Mean test loss for epoch: 0.009883162565529346
Results for epoch 7:
Mean train loss for epoch: 0.006549991348660306
Mean test loss

In [12]:
# Rebuild an untrained autoencoder and restore the best checkpoint written by
# the training cell, so the cells below also work right after a kernel restart.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

auto = AUTOENCODER()
auto = auto.to(device)

# map_location remaps the stored tensors onto whichever device is in use now.
state_dict = torch.load('model_best.pt', map_location=device)
auto.load_state_dict(state_dict)

<All keys matched successfully>

# PREDICTOR

In [13]:
# Rebuild the dense float32 TF-IDF tensors (with a singleton axis at dim 1) on
# the current device, so this section does not depend on the training cell.
tfidf_train_dense_tensor = torch.tensor(tfidf_train.toarray(), dtype=torch.float32).unsqueeze(1).to(device)
tfidf_test_dense_tensor = torch.tensor(tfidf_test.toarray(), dtype=torch.float32).unsqueeze(1).to(device)

In [None]:
# Encode every training sample with the trained autoencoder.
auto.eval()
encoded_train_samples = []
# Pure inference: without no_grad() every encoding would keep the autoencoder's
# autograd graph alive, wasting memory and letting a later backward() on the
# classifier loss propagate into the autoencoder.
with torch.no_grad():
    for tensor_ in tfidf_train_dense_tensor:
        encode_output = auto.encode(torch.unsqueeze(tensor_, dim=1))
        encoded_train_samples.append(encode_output)

# Stack the per-sample encodings along dim 1, then drop the leading axis.
# The view below is only valid when the stacked tensor is 4-D with a leading
# dimension of 1, i.e. when `auto.encode` returns a 3-D tensor of batch size 1.
autoencoder_train_tensor = torch.stack(encoded_train_samples, dim=1)
shape_ = autoencoder_train_tensor.shape[1:]
autoencoder_train_tensor = autoencoder_train_tensor.view(shape_[0],shape_[1],shape_[2])

In [None]:
# Encode every held-out sample with the trained autoencoder.
# eval() is set here as well so the cell does not rely on another cell having run.
auto.eval()
encoded_test_samples = []
# Pure inference: no_grad() avoids building (and retaining) an autograd graph
# for every encoded sample.
with torch.no_grad():
    for tensor_ in tfidf_test_dense_tensor:
        encode_output = auto.encode(torch.unsqueeze(tensor_, dim=1))
        encoded_test_samples.append(encode_output)

# Stack the per-sample encodings along dim 1, then drop the leading axis.
# The view below is only valid when the stacked tensor is 4-D with a leading
# dimension of 1, i.e. when `auto.encode` returns a 3-D tensor of batch size 1.
autoencoder_test_tensor = torch.stack(encoded_test_samples, dim=1)
shape_ = autoencoder_test_tensor.shape[1:]
autoencoder_test_tensor = autoencoder_test_tensor.view(shape_[0],shape_[1],shape_[2])

In [None]:
# Cache the encoded features on disk so the (slow) encoding cells can be skipped.
# Context managers guarantee the files are flushed and closed even on error.
with open('autoencoder_train_tensor.pkl', 'wb') as train_file:
    pickle.dump(autoencoder_train_tensor, train_file)
with open('autoencoder_test_tensor.pkl', 'wb') as test_file:
    pickle.dump(autoencoder_test_tensor, test_file)

In [None]:
# Reload the cached encoded features written by the previous cell.
# SECURITY: pickle.load executes arbitrary code from the file — only load
# pickles produced by this notebook, never files from an untrusted source.
with open('autoencoder_train_tensor.pkl', 'rb') as train_file:
    autoencoder_train_tensor = pickle.load(train_file)
with open('autoencoder_test_tensor.pkl', 'rb') as test_file:
    autoencoder_test_tensor = pickle.load(test_file)

In [26]:
# Number of distinct section labels in Y (displayed as the cell output).
# NOTE(review): this relies on `Y` still being the label list built during
# preprocessing; any cell that rebinds `Y` must not have run before this one.
len(set(Y))

6

# DEFINE CLASSIFIER MODEL, OPTIMIZER, LOSS FUNCTION

In [None]:
# Size the classifier from the labels instead of a hard-coded 4: the notebook
# reports 6 distinct labels, and CrossEntropyLoss requires every target to lie
# in [0, C), so C must exceed the largest label id.
# NOTE(review): assumes `num_class` sets the number of output logits of
# CLASSIFIER and that labels are non-negative integer ids — confirm against
# modules.classifier.
num_classes = int(max(max(Y_train), max(Y_test))) + 1

classifier = CLASSIFIER(k=5, num_class=num_classes).to(device)
optimizer = optim.Adam(classifier.parameters(), lr = 0.01)
loss_function = nn.CrossEntropyLoss()

# TRAIN CLASSIFIER

In [None]:
# Train the classifier on the autoencoder features and keep, in
# 'classifier_best.pt', the weights of the epoch with the lowest mean held-out loss.
# NOTE(review): the checkpoint is selected on the test split, so the reported
# test loss is optimistic; a separate validation split would avoid that.
batch_size = 8

# The features are fixed inputs here: detach() guarantees that backward() on
# the classifier loss never reaches into the autoencoder's graph.
train_features = autoencoder_train_tensor.detach()
test_features = autoencoder_test_tensor.detach()

# Dedicated names on purpose: rebinding `dataset`, `X` or `Y` would break
# re-running the earlier cells that still need the original objects.
train_dataset = TensorDataset(train_features, torch.tensor(Y_train, dtype=torch.long))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(test_features, torch.tensor(Y_test, dtype=torch.long))
# No shuffling for evaluation: the batch-mean loss then depends only on the model.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

best_test_loss = float('inf')
epochs = 50

for epoch in range(epochs):
    train_losses = []
    test_losses = []

    # --- optimisation pass over the training split ---
    classifier.train()
    for batch_features, batch_labels in train_dataloader:
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
        optimizer.zero_grad()
        pred_out = classifier(batch_features)
        # Flatten the output to (batch, -1) as expected by CrossEntropyLoss.
        loss = loss_function(pred_out.view(len(batch_features), -1), batch_labels)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    # --- evaluation pass: no gradients are needed, so do not build a graph ---
    classifier.eval()
    with torch.no_grad():
        for batch_features, batch_labels in test_dataloader:
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
            pred_out = classifier(batch_features)
            loss = loss_function(pred_out.view(len(batch_features), -1), batch_labels)
            test_losses.append(loss.item())

    mean_test_loss = np.mean(test_losses)
    print(f'Results for epoch {epoch}:')
    print(f'Mean train loss for epoch: {np.mean(train_losses)}')
    print(f'Mean test loss for epoch: {mean_test_loss}')

    if mean_test_loss < best_test_loss:
        best_test_loss = mean_test_loss
        # Save the classifier (not the autoencoder) under its own file name so
        # the autoencoder checkpoint 'model_best.pt' is not overwritten.
        torch.save(classifier.state_dict(), 'classifier_best.pt')
        print(f'Model saved at epoch {epoch} with test loss {mean_test_loss}')

In [None]:
# Predict a class for every held-out sample with the in-memory classifier
# (i.e. the weights left by the last executed training epoch).
classifier.eval()
pred_outputs = []
with torch.no_grad():  # inference only: skip autograd bookkeeping
    for tensor_ in autoencoder_test_tensor:
        # Re-add the batch axis and make sure the sample sits on the model's device.
        sample_output = classifier(torch.unsqueeze(tensor_, dim=0).to(device))
        # .cpu() is required before .numpy() when the model runs on the GPU.
        pred_class = np.argmax(sample_output.detach().cpu().numpy())
        pred_outputs.append(pred_class)

In [None]:
# Compare the true test labels with the predicted classes using the project's
# `evaluate` helper from modules.utils.
# NOTE(review): which metrics it reports is not visible in this notebook.
evaluate(Y_test, pred_outputs)