# Importer les librairies, et fichiers .py auxiliaires

In [None]:
import numpy as np
from preprocess_data import *
from bayes_classifier import BayesClassifier
from complement_naive_bayes import train_cnb_with_tfidf, train_cnb
from ensemble_learning import train_ensemble
from xgboost_classifier import train_xgboost, train_xgboost_with_tfidf

from save_output import save_output

In [None]:
# Load the dataset once; the baseline cells that follow read these three names.
data_preprocess = DataPreprocess()
X_train = data_preprocess.train
y_train = data_preprocess.label_train
X_test = data_preprocess.test

# Exploration de données

# Jalon 1) Classifieur naïf de Bayes vanille
Validation croisée K-Fold ($k = 7$):
- α: Lissage du postérieur de Bayes

In [None]:
# Search for the best smoothing value; the tuning helper returns that value
# together with the score it reached, and both are printed for the record.
bayes_classifier = BayesClassifier()
best_lps, best_f1 = bayes_classifier.hyperparameter_tuning(X_train, y_train)
print(best_lps, best_f1)

# Refit a fresh classifier on the full training set with the selected value.
tuned_bayes_classifier = BayesClassifier()
tuned_bayes_classifier.fit(X_train, y_train, best_lps)

# predict() is called once per test sample and the results are stacked.
predictions = np.array(list(map(tuned_bayes_classifier.predict, X_test)))
# NOTE(review): the output is tagged "stopwords" but no remove_stopwords() call
# is visible before this cell -- confirm the tag matches the preprocessing.
save_output(predictions, "bayes", best_lps, "stopwords")

# Jalon 2)
# Naive Bayes Complement
Validation croisée K-Fold stratifiée ($k = 5$):
- α: Lissage du postérieur de Bayes

Prétraitement: TFIDF

In [None]:
# Train the Complement Naive Bayes model on TF-IDF features. The helper returns
# the model and the transformer (presumably fitted on X_train -- confirm in
# complement_naive_bayes.py).
complement_naive_bayes, tfidf_transformer = train_cnb_with_tfidf(X_train, y_train)
# Project the test set with the transformer returned by training (transform
# only, no refit) so it goes through the same transformation as the training data.
tfidf_test = tfidf_transformer.transform(X_test)
predictions = complement_naive_bayes.predict(tfidf_test)
# Write the predictions, tagged with model / search / preprocessing labels.
save_output(predictions, "cnb", "random_search_50_iter", "tfidf")

Prétraitement: Sans prétraitement

In [None]:
# Same model as the TF-IDF variant, but trained directly on the raw X_train
# features, so the test set is passed to predict() unchanged.
complement_naive_bayes = train_cnb(X_train, y_train)
predictions = complement_naive_bayes.predict(X_test)
# Write the predictions, tagged with model / search / preprocessing labels.
save_output(predictions, "cnb", "random_search_50_iter", "no_preprocessing")

Prétraitement: TFIDF, réduction par arbre, stopwords retirés

In [None]:
# Rebuild the data from scratch and drop stopwords for this experiment.
data_preprocess = DataPreprocess()
data_preprocess.remove_stopwords()

# Tree-based feature selection: the helper returns the reduced training matrix
# and the indices of the retained features. Dedicated names are used so the
# notebook-wide X_train / X_test from the first cell are not overwritten, which
# would silently change the earlier cells if they were re-run.
X_train_reduced, sorted_indeces_features = tree_based_dimensionality_reduction(
    data_preprocess.train, data_preprocess.label_train
)
# Select the same feature columns on the test set.
X_test_reduced = data_preprocess.test[:, sorted_indeces_features]

# NOTE(review): the section heading and the output tag mention TF-IDF, but
# train_cnb (not train_cnb_with_tfidf) is called here -- confirm which is intended.
complement_naive_bayes = train_cnb(X_train_reduced, data_preprocess.label_train)
# Predict on the reduced test matrix. The original predicted on the unreduced
# data_preprocess.test, whose columns do not match what the model was trained on.
predictions = complement_naive_bayes.predict(X_test_reduced)
save_output(predictions, "cnb", "random_search_50_iter", "tree_reduction_stopwords_tfidf")

# SVC

# SGDClassifier

# XGBoost
Random Search, validation croisée K-Fold stratifiée ($k = 5$), avec hyperparamètres:
- Learning rate: Taille du pas lors de la descente de critère sur la perte (gain de subdivision) 
- Nombre d'estimateurs (n_estimators): Nombre d'arbres de décision
- Profondeur maximale (max_depth): Profondeur maximale de chaque arbre (nombre de branchements max)
- Sous-échantillon (subsample): Proportion de données utilisées pour produire chaque arbre.

Prétraitement: Sur-échantillonnage bootstrap

In [None]:
# Rebuild the data from scratch and drop stopwords for this experiment.
data_preprocess = DataPreprocess()
data_preprocess.remove_stopwords()
# Resample the training set only (presumably bootstrap oversampling of the
# minority classes -- see preprocess_data.py); the test set is left untouched.
x_train_oversampled, y_train_oversampled = boostrap_oversampling(data_preprocess.train, data_preprocess.label_train)
# NOTE(review): the result is unpacked into (model, tfidf_transformer), yet the
# transformer is never applied to the test set below, and a separate
# train_xgboost_with_tfidf helper is imported. Confirm what train_xgboost
# returns: if it is a single model this unpacking is wrong; if it really
# returns a transformer, data_preprocess.test must be transformed before predict().
xgboost_classifier, tfidf_transformer = train_xgboost(x_train_oversampled, y_train_oversampled)
predictions = xgboost_classifier.predict(data_preprocess.test)
# Write the predictions, tagged with model / search / preprocessing labels.
save_output(predictions, "xgboost", "random_search_15_iter", "stopwords_bootstrap")

Prétraitement: 

# Apprentissage par Ensemble

Ensemble: CNB, XGBoost, Logistic Regression sans stopwords, sous-échantillonnage

Prétraitement : sans stopwords, sous-échantillonnage, réduction de dimension par somme cumulative

In [None]:
# Fresh copy of the data with stopwords removed.
data_preprocess = DataPreprocess()
data_preprocess.remove_stopwords()
model_names = ['ComplementNB', 'XGBoost', 'LogisticRegression']

# The columns to drop are chosen from the training matrix only (threshold 0.95),
# then the same columns are removed from both splits so they stay aligned.
indeces_to_remove = remove_cum_sum(data_preprocess.train, 0.95)
data_preprocess.train = np.delete(data_preprocess.train, indeces_to_remove, axis=1)
data_preprocess.test = np.delete(data_preprocess.test, indeces_to_remove, axis=1)

# Resample the reduced training set only; the test set is left as is.
X_train_undersampled, y_train_undersampled = random_undersampling(
    data_preprocess.train, data_preprocess.label_train
)

# Train the ensemble over the listed models and predict on the reduced test set.
best_ensemble_model = train_ensemble(X_train_undersampled, y_train_undersampled, model_names)
predictions_voter = best_ensemble_model.predict(data_preprocess.test)
save_output(
    predictions_voter,
    "ensemble_cnb_xgboost_logreg",
    "random_search_15_iter",
    "stopwords_undersampling_cumulative_sum",
)

Prétraitement : sans stopwords, sous-échantillonnage

In [None]:
# Fresh copy of the data with stopwords removed; no dimensionality reduction
# is applied in this variant.
data_preprocess = DataPreprocess()
data_preprocess.remove_stopwords()

# Resample the training set only; the test set is left as is.
X_train_undersampled, y_train_undersampled = random_undersampling(
    data_preprocess.train, data_preprocess.label_train
)

# Train the ensemble over the listed models and predict on the test set.
model_names = ['ComplementNB', 'XGBoost', 'LogisticRegression']
best_ensemble_model = train_ensemble(X_train_undersampled, y_train_undersampled, model_names)
predictions_voter = best_ensemble_model.predict(data_preprocess.test)

# The preprocessing tag was copy-pasted from the cumulative-sum cell; this run
# applies no cumulative-sum reduction, and reusing the same tag mislabels the
# result (and presumably makes the two runs' outputs collide).
save_output(
    predictions_voter,
    "ensemble_cnb_xgboost_logreg",
    "random_search_15_iter",
    "stopwords_undersampling",
)