In [175]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from mixed_naive_bayes import MixedNB

In [13]:
df = pd.read_csv("../J1/HeartDiseaseUCI.csv", index_col=0)

In [14]:
num_variables = ["age", "trestbps", "chol", "thalach", "ca", "oldpeak"]
cat_variables = ["cp", "restecg", "thal"]
ord_variables = ["slope"]
bin_variables = ["sex", "fbs", "exang"]

In [15]:
df["target"] = np.where(df["num"] >= 1, 1, 0)

In [16]:
X = df.drop(["target", "num"], axis=1)
y = df["target"]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    stratify = df["target"],
    random_state = 314
)

In [19]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(242, 13)
(61, 13)
(242,)
(61,)


### 1. Gaussian Naive Bayes
Les prédictions ne sont faites que sur les variables numériques.

In [21]:
num_pipeline = Pipeline(steps = [
    ("imputer", KNNImputer())
])

In [56]:
preprocessor = ColumnTransformer(transformers=
     [
        ("numeric", num_pipeline, num_variables),
     ],
     remainder="drop"        
 )

In [47]:
pipeline = Pipeline(steps =[
    ("preprocessor", preprocessor),
    ("blabla", GaussianNB())
])

In [90]:
delta = 5*1e-10
var_smoothing_start = 1e-9 - delta
var_smoothing_end = 1e-9 + delta
var_smoothing_step = 1e-11

hyperparameters = {
    "blabla__var_smoothing" : np.arange(var_smoothing_start, var_smoothing_end, var_smoothing_step)
}

In [91]:
gscv_gnb = GridSearchCV(
    estimator=pipeline,
    param_grid=hyperparameters,
    cv = 5,
    verbose = 4,
    scoring = "f1"
)

In [92]:
gscv_gnb.fit(X_train, y_train)

Fitting 5 folds for each of 101 candidates, totalling 505 fits
[CV 1/5] END .......blabla__var_smoothing=5e-10;, score=0.792 total time=   0.0s
[CV 2/5] END .......blabla__var_smoothing=5e-10;, score=0.714 total time=   0.0s
[CV 3/5] END .......blabla__var_smoothing=5e-10;, score=0.667 total time=   0.0s
[CV 4/5] END .......blabla__var_smoothing=5e-10;, score=0.714 total time=   0.0s
[CV 5/5] END .......blabla__var_smoothing=5e-10;, score=0.800 total time=   0.0s
[CV 1/5] END .....blabla__var_smoothing=5.1e-10;, score=0.792 total time=   0.0s
[CV 2/5] END .....blabla__var_smoothing=5.1e-10;, score=0.714 total time=   0.0s
[CV 3/5] END .....blabla__var_smoothing=5.1e-10;, score=0.667 total time=   0.0s
[CV 4/5] END .....blabla__var_smoothing=5.1e-10;, score=0.714 total time=   0.0s
[CV 5/5] END .....blabla__var_smoothing=5.1e-10;, score=0.800 total time=   0.0s
[CV 1/5] END .....blabla__var_smoothing=5.2e-10;, score=0.792 total time=   0.0s
[CV 2/5] END .....blabla__var_smoothing=5.2e-1

[CV 5/5] END blabla__var_smoothing=6.899999999999994e-10;, score=0.800 total time=   0.0s
[CV 1/5] END blabla__var_smoothing=6.999999999999993e-10;, score=0.792 total time=   0.0s
[CV 2/5] END blabla__var_smoothing=6.999999999999993e-10;, score=0.714 total time=   0.0s
[CV 3/5] END blabla__var_smoothing=6.999999999999993e-10;, score=0.667 total time=   0.0s
[CV 4/5] END blabla__var_smoothing=6.999999999999993e-10;, score=0.714 total time=   0.0s
[CV 5/5] END blabla__var_smoothing=6.999999999999993e-10;, score=0.800 total time=   0.0s
[CV 1/5] END blabla__var_smoothing=7.099999999999993e-10;, score=0.792 total time=   0.0s
[CV 2/5] END blabla__var_smoothing=7.099999999999993e-10;, score=0.714 total time=   0.0s
[CV 3/5] END blabla__var_smoothing=7.099999999999993e-10;, score=0.667 total time=   0.0s
[CV 4/5] END blabla__var_smoothing=7.099999999999993e-10;, score=0.714 total time=   0.0s
[CV 5/5] END blabla__var_smoothing=7.099999999999993e-10;, score=0.800 total time=   0.0s
[CV 1/5] E

[CV 1/5] END blabla__var_smoothing=8.899999999999987e-10;, score=0.792 total time=   0.0s
[CV 2/5] END blabla__var_smoothing=8.899999999999987e-10;, score=0.714 total time=   0.0s
[CV 3/5] END blabla__var_smoothing=8.899999999999987e-10;, score=0.667 total time=   0.0s
[CV 4/5] END blabla__var_smoothing=8.899999999999987e-10;, score=0.714 total time=   0.0s
[CV 5/5] END blabla__var_smoothing=8.899999999999987e-10;, score=0.800 total time=   0.0s
[CV 1/5] END blabla__var_smoothing=8.999999999999986e-10;, score=0.792 total time=   0.0s
[CV 2/5] END blabla__var_smoothing=8.999999999999986e-10;, score=0.714 total time=   0.0s
[CV 3/5] END blabla__var_smoothing=8.999999999999986e-10;, score=0.667 total time=   0.0s
[CV 4/5] END blabla__var_smoothing=8.999999999999986e-10;, score=0.714 total time=   0.0s
[CV 5/5] END blabla__var_smoothing=8.999999999999986e-10;, score=0.800 total time=   0.0s
[CV 1/5] END blabla__var_smoothing=9.099999999999986e-10;, score=0.792 total time=   0.0s
[CV 2/5] E

[CV 4/5] END blabla__var_smoothing=1.0699999999999982e-09;, score=0.714 total time=   0.0s
[CV 5/5] END blabla__var_smoothing=1.0699999999999982e-09;, score=0.800 total time=   0.0s
[CV 1/5] END blabla__var_smoothing=1.079999999999998e-09;, score=0.792 total time=   0.0s
[CV 2/5] END blabla__var_smoothing=1.079999999999998e-09;, score=0.714 total time=   0.0s
[CV 3/5] END blabla__var_smoothing=1.079999999999998e-09;, score=0.667 total time=   0.0s
[CV 4/5] END blabla__var_smoothing=1.079999999999998e-09;, score=0.714 total time=   0.0s
[CV 5/5] END blabla__var_smoothing=1.079999999999998e-09;, score=0.800 total time=   0.0s
[CV 1/5] END blabla__var_smoothing=1.089999999999998e-09;, score=0.792 total time=   0.0s
[CV 2/5] END blabla__var_smoothing=1.089999999999998e-09;, score=0.714 total time=   0.0s
[CV 3/5] END blabla__var_smoothing=1.089999999999998e-09;, score=0.667 total time=   0.0s
[CV 4/5] END blabla__var_smoothing=1.089999999999998e-09;, score=0.714 total time=   0.0s
[CV 5/5]

[CV 4/5] END blabla__var_smoothing=1.2699999999999975e-09;, score=0.714 total time=   0.0s
[CV 5/5] END blabla__var_smoothing=1.2699999999999975e-09;, score=0.800 total time=   0.0s
[CV 1/5] END blabla__var_smoothing=1.2799999999999972e-09;, score=0.792 total time=   0.0s
[CV 2/5] END blabla__var_smoothing=1.2799999999999972e-09;, score=0.714 total time=   0.0s
[CV 3/5] END blabla__var_smoothing=1.2799999999999972e-09;, score=0.667 total time=   0.0s
[CV 4/5] END blabla__var_smoothing=1.2799999999999972e-09;, score=0.714 total time=   0.0s
[CV 5/5] END blabla__var_smoothing=1.2799999999999972e-09;, score=0.800 total time=   0.0s
[CV 1/5] END blabla__var_smoothing=1.2899999999999974e-09;, score=0.792 total time=   0.0s
[CV 2/5] END blabla__var_smoothing=1.2899999999999974e-09;, score=0.714 total time=   0.0s
[CV 3/5] END blabla__var_smoothing=1.2899999999999974e-09;, score=0.667 total time=   0.0s
[CV 4/5] END blabla__var_smoothing=1.2899999999999974e-09;, score=0.714 total time=   0.0s

[CV 2/5] END blabla__var_smoothing=1.4599999999999966e-09;, score=0.714 total time=   0.0s
[CV 3/5] END blabla__var_smoothing=1.4599999999999966e-09;, score=0.667 total time=   0.0s
[CV 4/5] END blabla__var_smoothing=1.4599999999999966e-09;, score=0.714 total time=   0.0s
[CV 5/5] END blabla__var_smoothing=1.4599999999999966e-09;, score=0.800 total time=   0.0s
[CV 1/5] END blabla__var_smoothing=1.4699999999999968e-09;, score=0.792 total time=   0.0s
[CV 2/5] END blabla__var_smoothing=1.4699999999999968e-09;, score=0.714 total time=   0.0s
[CV 3/5] END blabla__var_smoothing=1.4699999999999968e-09;, score=0.667 total time=   0.0s
[CV 4/5] END blabla__var_smoothing=1.4699999999999968e-09;, score=0.714 total time=   0.0s
[CV 5/5] END blabla__var_smoothing=1.4699999999999968e-09;, score=0.800 total time=   0.0s
[CV 1/5] END blabla__var_smoothing=1.4799999999999965e-09;, score=0.792 total time=   0.0s
[CV 2/5] END blabla__var_smoothing=1.4799999999999965e-09;, score=0.714 total time=   0.0s

In [93]:
best_model = gscv_gnb.best_estimator_
best_hyperparameters = gscv_gnb.best_params_

In [94]:
best_hyperparameters

{'blabla__var_smoothing': 5e-10}

In [85]:
best_model

In [86]:
y_pred = best_model.predict(X_test)

In [75]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.82      0.75        33
           1       0.73      0.57      0.64        28

    accuracy                           0.70        61
   macro avg       0.71      0.69      0.70        61
weighted avg       0.71      0.70      0.70        61



In [87]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.82      0.75        33
           1       0.73      0.57      0.64        28

    accuracy                           0.70        61
   macro avg       0.71      0.69      0.70        61
weighted avg       0.71      0.70      0.70        61



### 2. Categorical Naive Bayes
Les prédictions ne sont faites que sur les variables catégorielles.

In [129]:
cat_pipeline = Pipeline(steps = [
    ("imputer", KNNImputer()),
    ("encoder", OneHotEncoder(drop = "first"))
])
ord_pipeline = Pipeline(steps = [
    ("imputer", KNNImputer()),
    ("encoder", OrdinalEncoder())
])
bin_pipeline = Pipeline(steps = [
    ("imputer", KNNImputer()),
])

In [130]:
preprocessor_2 = ColumnTransformer(transformers=
     [
        ("cat", cat_pipeline, cat_variables),
        ("ord", ord_pipeline, ord_variables),
        ("bin", bin_pipeline, bin_variables),
     ],
     remainder="drop"        
 )

In [135]:
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
96,52,1,4,128,255,0,0,161,1,0.0,1,1.0,7.0
114,43,0,4,132,341,1,2,136,1,3.0,2,0.0,7.0
68,54,1,3,150,232,0,2,165,0,1.6,1,0.0,7.0
229,54,1,4,110,206,0,2,108,1,0.0,2,1.0,3.0
80,58,1,4,150,270,0,2,111,1,0.8,1,0.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,43,1,3,130,315,0,0,162,0,1.9,1,1.0,3.0
254,51,0,3,120,295,0,2,157,0,0.6,1,0.0,3.0
113,52,1,1,118,186,0,2,190,0,0.0,2,0.0,6.0
182,56,0,4,134,409,0,2,150,1,1.9,2,2.0,7.0


In [136]:
preprocessor_2

In [131]:
pipeline_2 = Pipeline(steps =[
    ("preprocessor", preprocessor_2),
    ("CategoricalNB", CategoricalNB())
])

In [132]:
pipeline_2.fit(X_train, y_train)

In [133]:
y_pred = pipeline_2.predict(X_test)

In [134]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.91      0.86      0.88        35
           1       0.82      0.88      0.85        26

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61



### 3. Mixed Naive Bayes
Les données catégorielles doivent être encodées avec un LabelEncoder.
Mais il y a un problème de compatibilité avec les Pipelines lors du prétraitement.


On impute les valeurs, puis on LabelEncode chaque colonne catégorielle avec un LabelEncoder, mais sans Pipeline.


Attention au ColumnTransformer qui change l'ordre des colonnes.

In [293]:
X_train_clean = X_train.copy()
X_test_clean = X_test.copy()

In [294]:
# Imputation des variables numériques
imputer = KNNImputer()
X_train_clean[num_variables] = imputer.fit_transform(X_train_clean[num_variables])
X_test_clean[num_variables] = imputer.transform(X_test_clean[num_variables])

In [295]:
# Imputation des variables catégorielles
columns_cat = cat_variables + ord_variables + bin_variables

imputer = SimpleImputer(strategy="most_frequent")
X_train_clean[columns_cat] = imputer.fit_transform(X_train_clean[columns_cat])
X_test_clean[columns_cat] = imputer.transform(X_test_clean[columns_cat])

In [296]:
# Encodage des variables catégorielles (LabelEncoder)
le = LabelEncoder()
for col in columns_cat:
    X_train_clean[col] = le.fit_transform(X_train_clean[col])
    X_test_clean[col] = le.transform(X_test_clean[col])

In [297]:
# Indices des colonnes catégorielles
indices = [list(X_train_clean.columns).index(c) for c in columns_cat]

In [298]:
# Création du modèle MixedNB
model_mixed_nb = MixedNB(categorical_features = indices)

In [299]:
model_mixed_nb.fit(X_train_clean, y_train)

MixedNB(alpha=0.5, var_smoothing=1e-09)

In [300]:
y_pred = model_mixed_nb.predict(X_test_clean)

In [301]:
print(classification_report(y_pred, y_test)) 

              precision    recall  f1-score   support

           0       0.85      0.80      0.82        35
           1       0.75      0.81      0.78        26

    accuracy                           0.80        61
   macro avg       0.80      0.80      0.80        61
weighted avg       0.81      0.80      0.80        61

