# Importer les librairies, et fichiers .py auxiliaires

In [None]:
import numpy as np
from preprocess_data import *
from complement_naive_bayes import train_cnb_with_tfidf, train_cnb
from ensemble_learning import train_ensemble
from xgboost_classifier import train_xgboost
from logistic_regression import *
from svc_classifier import *
from sgd_classifier import *
from sklearn.model_selection import StratifiedKFold

from save_output import save_output

In [None]:
# Load the dataset once. The first Complement Naive Bayes cell reads
# X_train, y_train and X_test directly, so these names must stay bound.
data_preprocess = DataPreprocess()
X_train = data_preprocess.train
y_train = data_preprocess.label_train
X_test = data_preprocess.test

# Jalon 2
# Complement Naive Bayes
K-Fold validation croisée stratifiée ($k = 5$), avec hyperparamètre:
- α: Lissage du postérieur de Bayes
 
Prétraitement: TFIDF

In [None]:
# train_cnb_with_tfidf returns the trained classifier together with a
# transformer; the same transformer is applied to the test set before predicting.
(complement_naive_bayes,
 tfidf_transformer) = train_cnb_with_tfidf(X_train, y_train)

tfidf_test = tfidf_transformer.transform(X_test)
predictions = complement_naive_bayes.predict(tfidf_test)

# Persist the predictions, labelled with model, search strategy and preprocessing.
save_output(
    predictions,
    "cnb",
    "random_search_10_iter",
    "tfidf",
)

Prétraitement: TFIDF, réduction par arbre, stopwords retirés

In [None]:
# Start from a fresh DataPreprocess instance and drop stopwords.
data_preprocess = DataPreprocess()
data_preprocess.remove_stopwords()

# The reduction helper returns the reduced training matrix and the column
# indices it kept; the same indices are used to slice the test matrix.
# NOTE: this rebinds the notebook-level X_train and X_test.
X_train, sorted_indices_features = tree_based_dimensionality_reduction(
    data_preprocess.train,
    data_preprocess.label_train,
)
X_test = data_preprocess.test[:, sorted_indices_features]

# Train on the reduced features and apply the returned transformer to the test set.
(complement_naive_bayes,
 tfidf_transformer) = train_cnb_with_tfidf(X_train, data_preprocess.label_train)
tfidf_test = tfidf_transformer.transform(X_test)
predictions = complement_naive_bayes.predict(tfidf_test)

save_output(
    predictions,
    "cnb",
    "random_search_10_iter",
    "tree_reduction_stopwords_tfidf",
)

Prétraitement: TFIDF, réduction par arbre, stopwords retirés, suréchantillonnage SMOTE

In [None]:
# Start from a fresh DataPreprocess instance and drop stopwords.
data_preprocess = DataPreprocess()
data_preprocess.remove_stopwords()

# Reduce the training features and slice the test matrix with the kept indices.
X_train, sorted_indices_features = tree_based_dimensionality_reduction(
    data_preprocess.train, data_preprocess.label_train
)
X_test = data_preprocess.test[:, sorted_indices_features]

# Oversample the reduced training set with SMOTE.
X_train, y_train = smote_oversampling(X_train, data_preprocess.label_train)

# Train with the TF-IDF variant so that the transformer applied to the test
# set is the one produced for THIS cell's training data. Previously the model
# was trained with train_cnb (apparently on untransformed features) while the
# test set was transformed with whatever `tfidf_transformer` an earlier cell
# had left in the kernel: a NameError on a fresh kernel, and a train/test
# feature mismatch otherwise.
complement_naive_bayes, tfidf_transformer = train_cnb_with_tfidf(X_train, y_train)
tfidf_test = tfidf_transformer.transform(X_test)
predictions = complement_naive_bayes.predict(tfidf_test)

save_output(predictions, "cnb", "random_search_10_iter", "tree_reduction_stopwords_tfidf_smote")

Prétraitement: Retirer stopwords, Réduction par somme cumulative, sous-échantillonnage aléatoire

In [None]:
# Fresh preprocessing pipeline: stopword removal, then cumulative-sum reduction.
data_preprocess = DataPreprocess()
data_preprocess.remove_stopwords()
data_preprocess.remove_cum_sum()

# Randomly undersample the training set only; the test set is left untouched.
X_train_undersampled, y_train_undersampled = random_undersampling(
    data_preprocess.train,
    data_preprocess.label_train,
)

# Train Complement Naive Bayes and predict on the preprocessed test set.
complement_naive_bayes = train_cnb(
    X_train_undersampled,
    y_train_undersampled,
)
predictions = complement_naive_bayes.predict(data_preprocess.test)

save_output(
    predictions,
    "cnb",
    "random_search_10_iter",
    "stopwords_cum-sum_undersampled",
)

# SVC
Grid Search, K-Fold validation croisée stratifiée ($k = 5$), avec hyperparamètres:
- $\gamma$ : Coefficient du noyau RBF
- $C$ : Terme de régularisation pour la pénalité euclidienne.

Prétraitement: Retirer stopwords, Réduction par somme cumulative, sous-échantillonnage aléatoire

In [None]:
# Fresh preprocessing pipeline: stopword removal, then cumulative-sum reduction.
data_preprocess = DataPreprocess()
data_preprocess.remove_stopwords()
data_preprocess.remove_cum_sum()

# Randomly undersample the training set only; the test set is left untouched.
X_train_undersampled, y_train_undersampled = random_undersampling(
    data_preprocess.train,
    data_preprocess.label_train,
)

# train_svc returns the best hyperparameters and their score, not a fitted model.
best_params_, best_score_ = train_svc(X_train_undersampled, y_train_undersampled)
print(best_params_, best_score_)

# Label for the saved output, recording the selected hyperparameters.
params = f"C={best_params_['C']}, gamma={best_params_['gamma']}"

# Refit an RBF-kernel SVC on the undersampled data with the selected values.
svc = SVC(
    kernel='rbf',
    C=best_params_['C'],
    gamma=best_params_['gamma'],
)
svc.fit(X_train_undersampled, y_train_undersampled)

y_pred = svc.predict(data_preprocess.test)
save_output(y_pred, "svm", params, "stopwords_cum-sum_undersampled")

# SGDClassifier
Random Search, K-Fold validation croisée stratifiée ($k = 5$), avec hyperparamètres:
- loss: ModifiedHuber
- penalty: ElasticNet
- l1_ratio: Proportion de la pénalité ElasticNet qui est l1.
- $\alpha$: Poids attribué au terme de régularisation. Une valeur plus grande favorise l'annulation de certains coefficients (par l1) ou leur forte atténuation (par l2) quand la pénalité est ElasticNet.

Prétraitement: Retirer stopwords, Réduction par somme cumulative, sous-échantillonnage aléatoire

In [None]:
# Fresh preprocessing pipeline: stopword removal, then cumulative-sum reduction.
data_preprocess = DataPreprocess()
data_preprocess.remove_stopwords()
data_preprocess.remove_cum_sum()

# Randomly undersample the training set only; the test set is left untouched.
X_train_undersampled, y_train_undersampled = random_undersampling(
    data_preprocess.train,
    data_preprocess.label_train,
)

# train_sgd returns a model that can predict directly.
best_sgd = train_sgd(
    X_train_undersampled,
    y_train_undersampled,
)
predictions_voter = best_sgd.predict(data_preprocess.test)

save_output(
    predictions_voter,
    "sgd",
    "random_search_10_iter",
    "stopwords_undersampling_cumulative_sum",
)

# Régression Logistique

Random Search, K-Fold validation croisée stratifiée ($k = 5$), avec hyperparamètres:
- penalty: Type de pénalité de régularisation (l1)
- solver: Algorithme d'optimisation utilisé (liblinear)
- $C$: Inverse du poids de la régularisation (une valeur plus petite régularise davantage)

Prétraitement: Retirer stopwords, Réduction par somme cumulative, sous-échantillonnage aléatoire

In [None]:
# Fresh preprocessing pipeline: stopword removal, then cumulative-sum reduction.
data_preprocess = DataPreprocess()
data_preprocess.remove_stopwords()
data_preprocess.remove_cum_sum()

# Randomly undersample the training set only; the test set is left untouched.
X_train_undersampled, y_train_undersampled = random_undersampling(
    data_preprocess.train, data_preprocess.label_train
)

# train_logreg returns the best hyperparameters and their score, not a fitted model.
best_params_, best_score_ = train_logreg(X_train_undersampled, y_train_undersampled)
print(best_params_, best_score_)

# Refit with the selected C. The model gets its own name instead of `svc`,
# so it no longer overwrites the SVC fitted in the SVC cell.
logreg_model = LogisticRegression(solver='liblinear', penalty='l1', C=best_params_['C'])
logreg_model.fit(X_train_undersampled, y_train_undersampled)
y_pred = logreg_model.predict(data_preprocess.test)

# Logistic regression has no `gamma` hyperparameter: the label copied from the
# SVC cell read best_params_['gamma'], which is expected to raise KeyError
# before anything is saved. Record the settings actually used instead.
params = f"C={best_params_['C']}, penalty=l1, solver=liblinear"
save_output(y_pred, "logreg", params, "stopwords_cum-sum_undersampled")

# XGBoost
Random Search, K-Fold validation croisée stratifiée ($k = 5$), avec hyperparamètres:
- Learning rate: Taille du pas lors de la descente de critère sur la perte (gain de subdivision) 
- Nombre d'estimateurs (n_estimators): Nombre d'arbres de décision
- Profondeur maximale (max_depth): Profondeur maximale de chaque arbre (nombre de branchements max)
- sous-échantillon (subsample):  Proportion de données utilisées pour produire chaque arbre.

Prétraitement: Retirer stopwords, Réduction par somme cumulative, sous-échantillonnage aléatoire

In [None]:
# Fresh preprocessing pipeline: stopword removal, then cumulative-sum reduction.
data_preprocess = DataPreprocess()
data_preprocess.remove_stopwords()
data_preprocess.remove_cum_sum()

# Randomly undersample the training set only; the test set is left untouched.
X_train_undersampled, y_train_undersampled = random_undersampling(
    data_preprocess.train,
    data_preprocess.label_train,
)

# train_xgboost returns a model that can predict directly.
xgboost_classifier = train_xgboost(
    X_train_undersampled,
    y_train_undersampled,
)
predictions = xgboost_classifier.predict(data_preprocess.test)

save_output(
    predictions,
    "xgboost",
    "random_search_10_iter",
    "stopwords_undersampling_cumulative_sum",
)

# Apprentissage par ensembles: CNB, XGBoost, Logistic Regression

Prétraitement: Retirer stopwords, Réduction par somme cumulative, sous-échantillonnage aléatoire

In [None]:
# Fresh preprocessing pipeline: stopword removal, then cumulative-sum reduction.
data_preprocess = DataPreprocess()
data_preprocess.remove_stopwords()
data_preprocess.remove_cum_sum()

# Randomly undersample the training set only; the test set is left untouched.
X_train_undersampled, y_train_undersampled = random_undersampling(
    data_preprocess.train, data_preprocess.label_train
)

model_names = ['ComplementNB', 'XGBoost', 'LogisticRegression']
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

# Cross-validate the voter. Fold-local names are used on purpose: the loop used
# to rebind the notebook-level X_train / y_train / X_test to fold subsets, so
# re-running an earlier cell that reads them would silently train on one fold
# and "predict" on a validation fold instead of the real test set.
# NOTE(review): f1_score is not imported explicitly in this notebook;
# presumably it arrives through one of the star imports. Confirm.
for train_index, test_index in kf.split(X_train_undersampled, y_train_undersampled):
    X_fold_train = X_train_undersampled[train_index]
    y_fold_train = y_train_undersampled[train_index]
    X_fold_val = X_train_undersampled[test_index]
    y_fold_val = y_train_undersampled[test_index]

    fold_model = train_ensemble(X_fold_train, y_fold_train, model_names)
    scores.append(f1_score(y_fold_val, fold_model.predict(X_fold_val)))

mean_score = np.mean(scores)
print("Score F1 de validation du voteur: ", mean_score)

# Retrain on the full undersampled set before predicting on the test set.
best_ensemble_model = train_ensemble(X_train_undersampled, y_train_undersampled, model_names)
predictions_voter = best_ensemble_model.predict(data_preprocess.test)
save_output(predictions_voter, "ensemble_cnb_xgboost_logreg", "random_search_10_iter", "stopwords_undersampling_cumulative_sum")

Prétraitement: Retirer stopwords, sous-échantillonnage aléatoire

In [None]:
# Fresh preprocessing pipeline: stopword removal only (no cumulative-sum reduction).
data_preprocess = DataPreprocess()
data_preprocess.remove_stopwords()

# Randomly undersample the training set only; the test set is left untouched.
X_train_undersampled, y_train_undersampled = random_undersampling(
    data_preprocess.train, data_preprocess.label_train
)

model_names = ['ComplementNB', 'XGBoost', 'LogisticRegression']
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

# Cross-validate the voter. Fold-local names are used on purpose: the loop used
# to rebind the notebook-level X_train / y_train / X_test to fold subsets, so
# re-running an earlier cell that reads them would silently train on one fold
# and "predict" on a validation fold instead of the real test set.
# NOTE(review): f1_score is not imported explicitly in this notebook;
# presumably it arrives through one of the star imports. Confirm.
for train_index, test_index in kf.split(X_train_undersampled, y_train_undersampled):
    X_fold_train = X_train_undersampled[train_index]
    y_fold_train = y_train_undersampled[train_index]
    X_fold_val = X_train_undersampled[test_index]
    y_fold_val = y_train_undersampled[test_index]

    fold_model = train_ensemble(X_fold_train, y_fold_train, model_names)
    scores.append(f1_score(y_fold_val, fold_model.predict(X_fold_val)))

mean_score = np.mean(scores)
print("Score F1 de validation du voteur: ", mean_score)

# Retrain on the full undersampled set before predicting on the test set.
best_ensemble_model = train_ensemble(X_train_undersampled, y_train_undersampled, model_names)
predictions_voter = best_ensemble_model.predict(data_preprocess.test)
save_output(predictions_voter, "ensemble_cnb_xgboost_logreg", "random_search_10_iter", "stopwords_undersampling")

# Apprentissage par ensembles: Complement Naive Bayes, XGBoost, SVC, SGD

Prétraitement: Retirer stopwords, Réduction par somme cumulative, sous-échantillonnage aléatoire

In [None]:
# Fresh preprocessing pipeline: stopword removal, then cumulative-sum reduction.
data_preprocess = DataPreprocess()
data_preprocess.remove_stopwords()
data_preprocess.remove_cum_sum()

# Randomly undersample the training set only; the test set is left untouched.
X_train_undersampled, y_train_undersampled = random_undersampling(
    data_preprocess.train, data_preprocess.label_train
)

model_names = ['ComplementNB', 'XGBoost', 'SVC', 'SGD']
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

# Cross-validate the voter. Fold-local names are used on purpose: the loop used
# to rebind the notebook-level X_train / y_train / X_test to fold subsets, so
# re-running an earlier cell that reads them would silently train on one fold
# and "predict" on a validation fold instead of the real test set.
# NOTE(review): f1_score is not imported explicitly in this notebook;
# presumably it arrives through one of the star imports. Confirm.
for train_index, test_index in kf.split(X_train_undersampled, y_train_undersampled):
    X_fold_train = X_train_undersampled[train_index]
    y_fold_train = y_train_undersampled[train_index]
    X_fold_val = X_train_undersampled[test_index]
    y_fold_val = y_train_undersampled[test_index]

    fold_model = train_ensemble(X_fold_train, y_fold_train, model_names)
    scores.append(f1_score(y_fold_val, fold_model.predict(X_fold_val)))

mean_score = np.mean(scores)
print("Score F1 de validation du voteur: ", mean_score)

# Retrain on the full undersampled set before predicting on the test set.
best_ensemble_model = train_ensemble(X_train_undersampled, y_train_undersampled, model_names)
predictions_voter = best_ensemble_model.predict(data_preprocess.test)
save_output(predictions_voter, "ensemble_cnb_xgboost_svc_sgd", "random_search_10_iter", "stopwords_undersampling_cumulative_sum")