In [None]:
from sklearn.datasets import fetch_20newsgroups # dataset pour tester

# modèles
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

# mise en forme des données
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# pour l'évaluation
from sklearn.model_selection import cross_val_score

# fine-tuning
from sklearn.model_selection import GridSearchCV

import pandas as pd
import numpy as np

# Pré-traitement

## Importation des données

In [None]:
# Restrict the corpus to the four "sci.*" newsgroups.
categories = ["sci.crypt", "sci.electronics", "sci.med", "sci.space"]

# Load the train and test subsets with identical settings (same categories,
# shuffled), in that order.
data_train, data_test = (
    fetch_20newsgroups(subset=split, categories=categories, shuffle=True)
    for split in ("train", "test")
)

## Vectorisation

In [None]:
# TF-IDF vectorizer: sublinear term-frequency scaling, ignore terms present in
# more than half of the documents, and drop English stop words.
vectTfidf = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')

# Class labels come directly from the loaded datasets.
y_train, y_test = data_train.target, data_test.target

# Fit the vectorizer on the training documents only, then reuse the fitted
# vectorizer to transform the test documents.
X_train = vectTfidf.fit_transform(data_train.data)
X_test = vectTfidf.transform(data_test.data)

In [None]:
# Convert the TF-IDF matrices to dense arrays (presumably because GaussianNB
# requires dense input -- confirm against the scikit-learn docs).
# The guards keep this cell re-runnable: after a first run X_train / X_test are
# already dense arrays and no longer expose `toarray`, so calling it
# unconditionally would raise AttributeError.
if hasattr(X_train, "toarray"):
    X_train = X_train.toarray()
if hasattr(X_test, "toarray"):
    X_test = X_test.toarray()

# Classification

In [None]:
# Hyperparameter search spaces, keyed by model name.
param_grids = dict(
    # Note kept from the original cell (scikit-learn validation message):
    # "The 'var_smoothing' parameter of GaussianNB must be a float in the range [0, inf)"
    gaussianNB=dict(
        var_smoothing=[0.001, 0.01, 0.1, 0.9, 10, 100, 1000],
    ),
    multinomialNB=dict(
        alpha=[0, 0.5, 1.0, 10.0, 100.0, 1000.0],
        force_alpha=[True, False],
        fit_prior=[True, False],
    ),
)

In [None]:
def get_best_params(model, param_grid, X_train, X_test=None, y=None):
    """Run a 5-fold, accuracy-scored grid search and return the best parameters.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``GridSearchCV``.
    param_grid : dict
        Search space handed to ``GridSearchCV``.
    X_train : array-like
        Training features the search is fitted on.
    X_test : optional
        Unused. Kept (now optional) only so that existing positional calls
        ``get_best_params(model, grid, X_train, X_test)`` keep working.
    y : array-like, optional
        Training labels. When omitted, the notebook-level ``y_train`` is used,
        which is what this helper always relied on implicitly.

    Returns
    -------
    dict
        The ``best_params_`` of the fitted search.
    """
    # Make the label dependency explicit while staying backward-compatible
    # with callers that rely on the global `y_train`.
    labels = y_train if y is None else y
    search = GridSearchCV(model, param_grid=param_grid, cv=5, scoring='accuracy')
    search.fit(X_train, labels)
    return search.best_params_

## Gaussian Naive Bayes

In [None]:
# Quick look at the model's cross-validated performance on the text data.
# A fresh GaussianNB() is passed here: `gNB` is not defined yet at this point
# of the notebook, and once defined it is an estimator instance, not a callable.
print(cross_val_score(GaussianNB(), X_train, y_train))

In [None]:
# Baseline Gaussian Naive Bayes with default hyperparameters
# (`fit` returns the estimator itself, so it can be chained).
gNB = GaussianNB().fit(X_train, y_train)

# Score on the held-out test set (displayed as the cell output).
gNB.score(X_test, y_test)

In [None]:
# Hyperparameter optimisation: grid-search var_smoothing, then refit a
# GaussianNB with the selected value and print its test-set score.
best_params = get_best_params(
    GaussianNB(),
    param_grids["gaussianNB"],
    X_train,
    X_test,
)

gNB_FT = GaussianNB(var_smoothing=best_params["var_smoothing"]).fit(X_train, y_train)
print(gNB_FT.score(X_test, y_test))

## Multinomial Naive Bayes

In [None]:
# Baseline Multinomial Naive Bayes with default hyperparameters
# (`fit` returns the estimator itself, so it can be chained).
mNB = MultinomialNB().fit(X_train, y_train)

# Score on the held-out test set (displayed as the cell output).
mNB.score(X_test, y_test)

In [124]:
# Grid search over the MultinomialNB hyperparameters (5-fold CV, accuracy).
# The search space is the shared `param_grids["multinomialNB"]` entry rather
# than a second hand-copied dict, so the two cannot drift apart.
# MultinomialNB is already imported in the notebook's import cell.
# NOTE(review): the grid contains alpha=0; together with force_alpha=True this
# is presumably what triggers the warnings raised at `np.log(smoothed_fc)`
# in this cell's output -- confirm before dropping or changing that value.
param_grid = param_grids["multinomialNB"]
grid = GridSearchCV(MultinomialNB(), param_grid=param_grid, cv=5, scoring='accuracy')
estimator = grid.fit(X_train, y_train)

  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  ret = a @ b
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  ret = a @ b
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  ret = a @ b
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  ret = a @ b
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  ret = a @ b
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  ret = a @ b
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  ret = a @ b
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  ret = a @ b
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  ret = a @ b
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  ret = a @ b


In [125]:
# Tabulate the cross-validation results of the grid search and keep the best
# parameter combination for the final model.
df = pd.DataFrame(estimator.cv_results_)
best_params = estimator.best_params_

# Show the candidates ordered from best to worst rank (cell output).
# The earlier duplicate sort, whose result was discarded, has been removed.
df.sort_values("rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_fit_prior,param_force_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,0.441701,0.005679,0.029991,0.001664,0.5,True,True,"{'alpha': 0.5, 'fit_prior': True, 'force_alpha...",0.976842,0.974737,0.966316,0.966245,0.974684,0.971765,0.004545,1
5,0.439629,0.003393,0.031071,0.002639,0.5,True,False,"{'alpha': 0.5, 'fit_prior': True, 'force_alpha...",0.976842,0.974737,0.966316,0.966245,0.974684,0.971765,0.004545,1
6,0.441042,0.007751,0.030056,0.001305,0.5,False,True,"{'alpha': 0.5, 'fit_prior': False, 'force_alph...",0.976842,0.974737,0.966316,0.966245,0.974684,0.971765,0.004545,1
7,0.449275,0.006605,0.029409,0.002492,0.5,False,False,"{'alpha': 0.5, 'fit_prior': False, 'force_alph...",0.976842,0.974737,0.966316,0.966245,0.974684,0.971765,0.004545,1
1,0.515688,0.021681,0.031536,0.000422,0.0,True,False,"{'alpha': 0, 'fit_prior': True, 'force_alpha':...",0.966316,0.964211,0.964211,0.972574,0.959916,0.965445,0.004128,5
3,0.468135,0.031622,0.031107,0.002867,0.0,False,False,"{'alpha': 0, 'fit_prior': False, 'force_alpha'...",0.966316,0.964211,0.964211,0.972574,0.959916,0.965445,0.004128,5
11,0.441574,0.009108,0.029216,0.001162,1.0,False,False,"{'alpha': 1.0, 'fit_prior': False, 'force_alph...",0.970526,0.972632,0.957895,0.957806,0.966245,0.965021,0.006206,7
10,0.438185,0.008065,0.028933,0.001578,1.0,False,True,"{'alpha': 1.0, 'fit_prior': False, 'force_alph...",0.970526,0.972632,0.957895,0.957806,0.966245,0.965021,0.006206,7
8,0.442017,0.005085,0.029655,0.001869,1.0,True,True,"{'alpha': 1.0, 'fit_prior': True, 'force_alpha...",0.970526,0.972632,0.957895,0.957806,0.964135,0.964599,0.00618,9
9,0.441106,0.010774,0.030375,0.000785,1.0,True,False,"{'alpha': 1.0, 'fit_prior': True, 'force_alpha...",0.970526,0.972632,0.957895,0.957806,0.964135,0.964599,0.00618,9


In [None]:
# Refit MultinomialNB with the hyperparameters selected by the grid search.
mNB_FT = MultinomialNB(
    fit_prior=best_params["fit_prior"],
    alpha=best_params["alpha"],
    force_alpha=best_params["force_alpha"],
)
mNB_FT.fit(X_train, y_train)

# Report the tuned MultinomialNB's test accuracy. The original printed
# gNB_FT's score here, i.e. the tuned GaussianNB model, by mistake.
print(mNB_FT.score(X_test, y_test))