# IMPORTS

In [1]:
import nltk
import spacy
from modules.utils import build_dataset, tune_logistic_regression, tune_svm, evaluate
import sklearn
from modules.preprocess import spacy_tokenizer, text_edit
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier

[nltk_data] Downloading package stopwords to /home/xavier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# PROJECT SPECIFIC IMPORTS

In [2]:
from modules.preprocess import *
from modules.utils import *

# LOAD DATASET

In [3]:
# Build the working dataset from the crawler's JSON output.
# NOTE(review): rnd_state presumably seeds the sampling done inside
# build_dataset -- confirm in modules.utils.
dataset = build_dataset(
    path="lapresse_crawler/output.json",
    num_class_samples=500,
    rnd_state=10,
)

# PREPROCESS DATA

In [4]:
# Clean the text with every option listed below switched on.
# NOTE: `dataset` is rebound to the result, so re-running this cell on its own
# feeds the already-processed records back through text_edit; re-run the load
# cell first to start again from the raw data.
dataset = text_edit(
    dataset,
    grp_num=True,
    rm_newline=True,
    rm_punctuation=True,
    rm_stop_words=True,
    lowercase=True,
    lemmatize=True,
    html_=True,
    convert_entities=True,
)

100%|█████████████████████████████████████████████████████████████████████████████████| 500/500 [01:02<00:00,  7.99it/s]


In [5]:
# Walk the records once, collecting the input text and the section label in
# step so that X[i] always pairs with Y[i].
X, Y = [], []
for record in dataset.values():
    X.append(record['text'])
    Y.append(record['section_label'])

# TRAIN/TEST SPLIT

In [6]:
# Hold out 30% of the samples for evaluation; the fixed random_state keeps the
# split repeatable across runs.
# NOTE(review): the split is not stratified -- consider stratify=Y if the
# section labels turn out to be imbalanced.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

# VECTORIZE

In [7]:
# Fit the TF-IDF vocabulary and IDF weights on the training texts only, then
# reuse the fitted vectorizer on the test texts so that no test-set statistics
# leak into the features.
# token_pattern=None: the default regex is ignored whenever a custom tokenizer
# is supplied, and leaving it set makes scikit-learn emit an "unused parameter"
# UserWarning on fit. The resulting features are unchanged.
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer, token_pattern=None)
tfidf_train = vectorizer.fit_transform(X_train)
tfidf_test = vectorizer.transform(X_test)



# HYPERPARAMETER TUNING

In [8]:
# Tune the SVM using the training features and labels only (the test split is
# not passed in). The returned object is the model that gets fit and evaluated
# in the TRAIN AND EVAL section.
svm_model = tune_svm(tfidf_train, Y_train)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits




Best Hyperparameters: {'C': 100, 'degree': 2, 'gamma': 'scale', 'kernel': 'sigmoid'}


In [9]:
# Tune logistic regression using the training features and labels only (the
# test split is not passed in). The returned object is the model that gets fit
# and evaluated in the TRAIN AND EVAL section.
lr_model = tune_logistic_regression(tfidf_train, Y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits




Best Hyperparameters: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}


In [10]:
# Multi-layer perceptron with four hidden layers (25, 100, 100, 25 units) and
# ReLU activations. max_iter caps the number of training iterations and
# random_state fixes the seed. Unlike the SVM and logistic-regression models,
# these settings are chosen by hand rather than via a tuning helper.
mlp_model = MLPClassifier(
    hidden_layer_sizes=(25, 100, 100, 25),
    activation='relu',
    max_iter=5000,
    random_state=42,
)

# TRAIN AND EVAL

In [21]:
# Fit logistic regression on the training features, predict the held-out test
# set, and report the metrics via evaluate().
# NOTE(review): if tune_logistic_regression already returns a fitted
# estimator, this fit is redundant -- confirm in modules.utils.
lr_model.fit(tfidf_train, Y_train)
lr_y_pred = lr_model.predict(tfidf_test)
evaluate(Y_test, lr_y_pred)

Precision:  0.7596418300653596
Recall:  0.78
F1_score:  0.7484909423756491
accuracy:  0.78


In [22]:
# Fit the SVM on the training features, predict the held-out test set, and
# report the metrics via evaluate().
# NOTE(review): if tune_svm already returns a fitted estimator, this fit is
# redundant -- confirm in modules.utils.
svm_model.fit(tfidf_train, Y_train)
svm_y_pred = svm_model.predict(tfidf_test)
evaluate(Y_test, svm_y_pred)

Precision:  0.7742273442710417
Recall:  0.7666666666666667
F1_score:  0.7458551385721196
accuracy:  0.7666666666666667


In [23]:
# Fit the MLP on the training features, predict the held-out test set, and
# report the metrics via evaluate(). This is the first time mlp_model is
# trained; it was only constructed earlier.
mlp_model.fit(tfidf_train, Y_train)
mlp_y_pred = mlp_model.predict(tfidf_test)
evaluate(Y_test, mlp_y_pred)

Precision:  0.8237411317803475
Recall:  0.72
F1_score:  0.7376868333535
accuracy:  0.72
