# IMPORTS

In [1]:
import nltk
import spacy
from modules.utils import build_dataset, tune_logistic_regression, tune_svm, tune_mlp, evaluate
from modules.autoencoder import AUTOENCODER
from modules.classifier import CLASSIFIER
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import VotingClassifier

In [2]:
from sklearn.model_selection import train_test_split
from matplotlib import pyplot
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
import torch.optim as optim
import pickle

# PROJECT SPECIFIC IMPORTS

In [3]:
from modules.preprocess import *
from modules.utils import *

[nltk_data] Downloading package stopwords to /home/xavier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# LOAD DATASET

In [4]:
# Build the article dictionary from the crawler dump.
# NOTE(review): num_samples / rnd_state presumably control a seeded random
# sample inside build_dataset — confirm in modules.utils.
dataset = build_dataset(
    path="lapresse_crawler/output.json",
    num_samples=500,
    rnd_state=10,
)

# PREPROCESS DATA

In [5]:
# Run the text clean-up pipeline with every optional step switched on.
# The result is bound back to `dataset`, so re-running this cell processes
# already-processed text again.
dataset = text_edit(
    dataset,
    grp_num=True,
    rm_newline=True,
    rm_punctuation=True,
    rm_stop_words=True,
    lowercase=True,
    lemmatize=True,
    html_=True,
    convert_entities=True,
    expand=True,
)

100%|█████████████████████████████████████████████████████████████████████████████████| 500/500 [01:00<00:00,  8.24it/s]


In [6]:
# Keep only the articles whose top-level section is one of the three below,
# then split each retained record into its text (X) and its label (Y).
kept_sections = ['actualites', 'international', 'sports']
selected_articles = [
    article for article in dataset.values()
    if article['section_1'] in kept_sections
]
X = [article['text'] for article in selected_articles]
Y = [article['section_label'] for article in selected_articles]

# TRAIN/TEST SPLIT

In [7]:
# Two successive 80/20 splits: first hold out the test set, then carve the
# validation set out of what remains (roughly 64% / 16% / 20% overall).
X_trainval, X_test, Y_trainval, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_trainval, Y_trainval, test_size=0.2, random_state=42
)

# VECTORIZE

In [8]:
# The TF-IDF vocabulary is fitted on the training texts only; the validation
# and test texts are projected onto that same vocabulary.
vectorizer = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
    min_df=0.01,
    max_df=0.99,
)
tfidf_train = vectorizer.fit_transform(X_train)
tfidf_valid = vectorizer.transform(X_valid)
tfidf_test = vectorizer.transform(X_test)



# DEFINE MODEL, OPTIMIZER, LOSS_FCT

In [9]:
# Autoencoder, its mean-squared-error reconstruction loss, and the Adam
# optimizer that updates the autoencoder's parameters.
auto = AUTOENCODER()
loss_function = nn.MSELoss()
optimizer = optim.Adam(auto.parameters(), lr=0.01)

# TRAIN AUTOENCODER

In [10]:
# Dense float32 copies of the TF-IDF matrices with an extra axis inserted at
# position 1, i.e. (n_docs, 1, vocab_size).
tfidf_train_dense_tensor = torch.unsqueeze(torch.tensor(tfidf_train.toarray(), dtype=torch.float32), dim=1)
tfidf_test_dense_tensor = torch.unsqueeze(torch.tensor(tfidf_test.toarray(), dtype=torch.float32), dim=1)

batch_size = 8
# Named `train_dataset` rather than `dataset` so the article dictionary built
# earlier in the notebook is not overwritten (earlier cells stay re-runnable).
train_dataset = TensorDataset(tfidf_train_dense_tensor)
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(tfidf_test_dense_tensor)
# Evaluation does not benefit from shuffling; a fixed order keeps it deterministic.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

epochs = 5

for epoch in range(epochs):
    train_losses = []
    test_losses = []

    # --- training pass ---
    auto.train()
    # Batch variable is `inputs`, not `X`, so the list of texts `X` is preserved.
    for (inputs,) in dataloader:
        optimizer.zero_grad()
        reconstruction = auto(inputs)
        loss = loss_function(reconstruction, inputs)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    # --- evaluation pass: no gradients are needed, so autograd is disabled ---
    auto.eval()
    with torch.no_grad():
        for (inputs,) in test_dataloader:
            reconstruction = auto(inputs)
            test_losses.append(loss_function(reconstruction, inputs).item())

    print(F'Results for epoch {epoch}:')
    print(f'Mean train loss for epoch: {np.mean(train_losses)}')
    print(f'Mean test loss for epoch: {np.mean(test_losses)}\n')

Results for epoch 0:
Mean train loss for epoch: 0.7639142101009687
Mean test loss for epoch: 0.019283050438389182

Results for epoch 1:
Mean train loss for epoch: 0.42023882021506626
Mean test loss for epoch: 0.006879863794893026

Results for epoch 2:
Mean train loss for epoch: 0.20652863072852293
Mean test loss for epoch: 0.007382577168755233

Results for epoch 3:
Mean train loss for epoch: 0.0900387978181243
Mean test loss for epoch: 0.019634848227724433

Results for epoch 4:
Mean train loss for epoch: 0.03444446910483142
Mean test loss for epoch: 0.01633050781674683



# PREDICTOR

In [11]:
# Dense float32 tensors for the train and validation splits, with an axis
# inserted at position 1: (n_docs, 1, vocab_size).
tfidf_train_dense_tensor = torch.tensor(tfidf_train.toarray(), dtype=torch.float32).unsqueeze(1)
tfidf_valid_dense_tensor = torch.tensor(tfidf_valid.toarray(), dtype=torch.float32).unsqueeze(1)

In [None]:
# Encode every training document with the trained autoencoder.
auto.eval()
autoencoder_train_tensor = []
# Inference only: without no_grad every encoded document would carry the
# autoencoder's autograd graph along with it.
with torch.no_grad():
    for tensor_ in tfidf_train_dense_tensor:
        # tensor_ is (1, vocab_size); inserting an axis at position 1 gives (1, 1, vocab_size).
        encode_output = auto.encode(torch.unsqueeze(tensor_, dim=1))
        autoencoder_train_tensor.append(encode_output)

# Stack along dim=1 and drop the leading axis so the first axis indexes documents.
# NOTE(review): the 3-argument view assumes auto.encode returns a 3-D tensor
# whose first dimension is 1 — confirm against modules.autoencoder.
autoencoder_train_tensor = torch.stack(autoencoder_train_tensor, dim=1)
shape_ = autoencoder_train_tensor.shape[1:]
autoencoder_train_tensor = autoencoder_train_tensor.view(shape_[0],shape_[1],shape_[2])

In [None]:
# Encode every validation document with the trained autoencoder.
# eval() is set here as well so this cell does not depend on another cell
# having already switched the model out of training mode.
auto.eval()
autoencoder_valid_tensor = []
# Inference only: without no_grad every encoded document would carry the
# autoencoder's autograd graph along with it.
with torch.no_grad():
    for tensor_ in tfidf_valid_dense_tensor:
        # tensor_ is (1, vocab_size); inserting an axis at position 1 gives (1, 1, vocab_size).
        encode_output = auto.encode(torch.unsqueeze(tensor_, dim=1))
        autoencoder_valid_tensor.append(encode_output)

# Stack along dim=1 and drop the leading axis so the first axis indexes documents.
# NOTE(review): the 3-argument view assumes auto.encode returns a 3-D tensor
# whose first dimension is 1 — confirm against modules.autoencoder.
autoencoder_valid_tensor = torch.stack(autoencoder_valid_tensor, dim=1)
shape_ = autoencoder_valid_tensor.shape[1:]
autoencoder_valid_tensor = autoencoder_valid_tensor.view(shape_[0],shape_[1],shape_[2])

In [None]:
# Persist the encoded splits to disk, then read them back.
# Context managers guarantee each file is flushed and closed before it is
# reopened for reading (the bare open() calls never closed their handles).
with open('autoencoder_train_tensor.pkl', 'wb') as train_file:
    pickle.dump(autoencoder_train_tensor, train_file)
with open('autoencoder_valid_tensor.pkl', 'wb') as valid_file:
    pickle.dump(autoencoder_valid_tensor, valid_file)

# NOTE(review): pickle.load can execute arbitrary code — only load files that
# this notebook itself produced.
with open('autoencoder_train_tensor.pkl', 'rb') as train_file:
    autoencoder_train_tensor = pickle.load(train_file)
with open('autoencoder_valid_tensor.pkl', 'rb') as valid_file:
    autoencoder_valid_tensor = pickle.load(valid_file)

# DEFINE MODEL, OPTIMIZER, LOSS_FCT

In [None]:
# Classifier trained on the encoded documents, with cross-entropy loss and Adam.
# `optimizer` and `loss_function` are rebound here, replacing the objects that
# were used for the autoencoder.
# NOTE(review): num_class=4 while only three sections are selected upstream, and
# input_shape=343 is a hard-coded size — confirm both against the label
# encoding and the encoder's output.
classifier = CLASSIFIER(k=5, num_class=4, input_shape=343)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=0.01)

# TRAIN CLASSIFIER

In [None]:
batch_size = 8

# Training pairs: encoded documents with their integer class labels.
# detach() keeps the classifier's backward pass from reaching into the inputs.
train_dataset = TensorDataset(
    autoencoder_train_tensor.detach(),
    torch.tensor(Y_train, dtype=torch.long),
)
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation pairs: the *encoded* validation documents with their own labels.
# (Previously this held the raw TF-IDF tensors and no labels at all, so the
# validation loss was computed on the wrong inputs against stale training labels.)
valid_dataset = TensorDataset(
    autoencoder_valid_tensor.detach(),
    torch.tensor(Y_valid, dtype=torch.long),
)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

epochs = 50

for epoch in range(epochs):
    train_losses = []
    valid_losses = []

    # --- training pass ---
    classifier.train()
    # Loop variables are `inputs`/`targets` so the lists `X` and `Y` are preserved.
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        pred_out = classifier(inputs)
        # Flatten everything after the batch axis into the class dimension.
        loss = loss_function(pred_out.view(len(inputs), -1), targets)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    # --- validation pass: gradients are not needed ---
    classifier.eval()
    with torch.no_grad():
        for inputs, targets in valid_dataloader:
            pred_out = classifier(inputs)
            loss = loss_function(pred_out.view(len(inputs), -1), targets)
            # Recorded in valid_losses (it used to go to a stale `test_losses` list,
            # leaving valid_losses empty and its mean NaN).
            valid_losses.append(loss.item())

    print(F'Results for epoch {epoch}:')
    print(f'Mean train loss for epoch: {np.mean(train_losses)}')
    print(f'Mean valid loss for epoch: {np.mean(valid_losses)}\n')

In [None]:
# Predict a class index for every encoded validation document, one at a time.
classifier.eval()
pred_outputs = []
for sample in autoencoder_valid_tensor:
    # Re-insert the batch axis that iterating over the tensor removed.
    scores = classifier(sample.unsqueeze(0))
    # argmax over the flattened output gives the predicted class index.
    pred_outputs.append(np.argmax(scores.detach().numpy()))

In [None]:
# pred_outputs were computed from the validation split, so they must be scored
# against the validation labels (Y_test belongs to a different set of documents
# and has a different length).
evaluate(Y_valid, pred_outputs)