# IMPORTS

In [1]:
import nltk
import spacy
from modules.utils import build_dataset, tune_logistic_regression, tune_svm, tune_mlp, evaluate
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import VotingClassifier

# PROJECT SPECIFIC IMPORTS

In [2]:
from modules.preprocess import *
from modules.utils import *

[nltk_data] Downloading package stopwords to /home/xavier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# LOAD DATASET

In [3]:
dataset = build_dataset(path="lapresse_crawler/output.json", num_samples=500, rnd_state=10)

# PREPROCESS DATA

In [4]:
dataset = text_edit(dataset, grp_num=True, rm_newline=True, rm_punctuation=True,
              rm_stop_words=True, lowercase=True, lemmatize=True, html_=True, convert_entities=True, expand=True)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:28<00:00,  5.66it/s]


In [5]:
X = [x['text'] for x in dataset.values() if x['section_1'] in ['actualites', 'international', 'sports']]
Y = [x['section_label'] for x in dataset.values() if x['section_1'] in ['actualites', 'international', 'sports']]

# TRAIN/TEST SPLIT

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state = 42)

# VECTORIZE

In [7]:
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer, min_df=0.01, max_df=0.99)
tfidf_train = vectorizer.fit_transform(X_train)
tfidf_test =  vectorizer.transform(X_test)



In [8]:
vocabulary_sorted_by_value = sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1], reverse=True)

# HYPERPARAMETER TUNING

In [9]:
svm_model = tune_svm(tfidf_train, Y_train)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits
Best Hyperparameters: {'C': 1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}


In [10]:
lr_model = tune_logistic_regression(tfidf_train, Y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Hyperparameters: {'C': 100, 'penalty': 'l2', 'solver': 'saga'}


In [11]:
nb_model = tune_naive_bayes(tfidf_train, Y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'alpha': 0.1, 'fit_prior': True}


In [12]:
mlp_model = tune_mlp(tfidf_train, Y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits




KeyboardInterrupt: 

# TRAIN AND EVAL

In [13]:
#Logistic regression
lr_model.fit(tfidf_train, Y_train)
lr_y_pred = lr_model.predict(tfidf_test)
evaluate(Y_test, lr_y_pred)

Precision:  0.9518459069020866
Recall:  0.9438202247191011
F1_score:  0.9446524572876068
accuracy:  0.9438202247191011


In [14]:
#SVM
svm_model.fit(tfidf_train, Y_train)
svm_y_pred = svm_model.predict(tfidf_test)
evaluate(Y_test, svm_y_pred)

Precision:  0.9438202247191011
Recall:  0.9325842696629213
F1_score:  0.9338376452349353
accuracy:  0.9325842696629213


In [15]:
#NB
nb_model.fit(tfidf_train, Y_train)
nb_y_pred = nb_model.predict(tfidf_test)
evaluate(Y_test, svm_y_pred)

Precision:  0.9438202247191011
Recall:  0.9325842696629213
F1_score:  0.9338376452349353
accuracy:  0.9325842696629213


In [None]:
#MLP
mlp_model.fit(tfidf_train, Y_train)
mlp_y_pred = mlp_model.predict(tfidf_test)
evaluate(Y_test, mlp_y_pred)

# ENSEMBLE METHOD

In [47]:
svm_model = SVC(C=1, degree=2, kernel='linear', probability=True)
lr_model = LogisticRegression(C=100, max_iter=5000, multi_class='multinomial', solver='saga')
mlp_model = MLPClassifier(hidden_layer_sizes=(25,50), activation='relu', max_iter=5000, random_state=42)
nb_model = MultinomialNB(alpha=0.1)

In [51]:
svm_model.fit(tfidf_train, Y_train)
lr_model.fit(tfidf_train, Y_train)
mlp_model.fit(tfidf_train, Y_train)
nb_model.fit(tfidf_train, Y_train)

In [52]:
ensemble_model = VotingClassifier(estimators=[('svm', svm_model), ('nb', nb_model), ('logistic', lr_model), ('mlp', mlp_model)], voting='soft')

In [53]:
ensemble_model.fit(tfidf_train, Y_train)
ensemble_y_pred = ensemble_model.predict(tfidf_test)
evaluate(Y_test, ensemble_y_pred)

Precision:  0.947531494722506
Recall:  0.9438202247191011
F1_score:  0.9444653979559119
accuracy:  0.9438202247191011
