In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the raw table from CKD.csv (the path is relative to the notebook's working directory).
dataset = pd.read_csv("CKD.csv")

In [3]:
# One-hot encode the categorical columns as 0/1 integers. drop_first removes
# the first level of each category, so k levels become k-1 dummy columns.
dataset = pd.get_dummies(
    dataset,
    dtype=int,
    drop_first=True,
)

In [4]:
# Feature matrix: every column except the encoded target column.
independent = dataset.drop(labels=["classification_yes"], axis="columns")

In [5]:
# Target vector: the "classification_yes" column.
# NOTE(review): presumably the 0/1 dummy created by get_dummies from a yes/no "classification" column — confirm against the CSV.
dependent = dataset["classification_yes"]

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
xtrain, xtest, ytrain, ytest = train_test_split(
    independent,
    dependent,
    test_size=0.3,
    random_state=0,
)

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline

# The pipeline has a single 'classifier' step. GaussianNB is only a stand-in:
# the grid below swaps the whole step for each Naive Bayes variant in turn.
pipe = Pipeline(steps=[('classifier', GaussianNB())])

# One sub-grid per model family. The '<step>__<param>' keys address a
# parameter of whichever estimator currently occupies that pipeline step.
paramgrid = [
    # Gaussian NB: five variance-smoothing values.
    {
        'classifier': [GaussianNB()],
        'classifier__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5],
    },
] + [
    # Multinomial NB, then Bernoulli NB: 3 alphas x 2 prior settings each.
    {
        'classifier': [nb_class()],
        'classifier__alpha': [0.1, 0.5, 1.0],
        'classifier__fit_prior': [True, False],
    }
    for nb_class in (MultinomialNB, BernoulliNB)
]

# Candidates are ranked by weighted F1. refit=True retrains the winning
# configuration on all of xtrain, so `grid` can be used directly to predict.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=paramgrid,
    scoring='f1_weighted',
    refit=True,
    n_jobs=-1,
    verbose=3,
)

grid.fit(xtrain, ytrain)

Fitting 5 folds for each of 17 candidates, totalling 85 fits


In [18]:
# Summarise the search: the best mean cross-validated score (f1_weighted),
# the parameter combination that achieved it, and the refitted pipeline.
print("Best Score:", grid.best_score_)
print("Best Parameters:", grid.best_params_)
# Printed like the two lines above; as a bare expression this would be echoed
# as a raw ('Best Estimator:', ...) tuple rather than a labelled line.
print("Best Estimator:", grid.best_estimator_)

Best Score: 0.9856801452274038
Best Parameters: {'classifier': GaussianNB(), 'classifier__var_smoothing': 1e-09}


('Best Estimator:', Pipeline(steps=[('classifier', GaussianNB())]))

In [19]:
# Keep the per-candidate cross-validation results of the search for inspection.
# NOTE(review): the name `re` would shadow the standard-library `re` module if that
# is ever imported in this notebook — consider a more descriptive name.
re = grid.cv_results_

In [20]:
# Predict test-set labels with the best pipeline found by the search
# (available on `grid` because the search was created with refit=True).
ypred = grid.predict(xtest)

In [24]:
# Show the winning parameters followed by the best mean cross-validation score.
# The disabled variant below would report the F1 score on the test set instead:
#   from sklearn.metrics import f1_score
#   f1score = f1_score(ytest, ypred)
#   print("The best parameters: {}".format(grid.best_params_), f1score)
print(
    "The best parameters: {}".format(grid.best_params_),
    grid.best_score_,
)

The best parameters: {'classifier': GaussianNB(), 'classifier__var_smoothing': 1e-09} 0.9856801452274038


In [25]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes and columns the predicted classes, for the test set.
cm = confusion_matrix(y_true=ytest, y_pred=ypred)
print("The best confusion matrix: \n", cm)

The best confusion matrix: 
 [[45  0]
 [ 2 73]]


In [26]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the test set, as a text table.
clfreport = classification_report(y_true=ytest, y_pred=ypred)
print("The best classification report: \n", clfreport)

The best classification report: 
               precision    recall  f1-score   support

           0       0.96      1.00      0.98        45
           1       1.00      0.97      0.99        75

    accuracy                           0.98       120
   macro avg       0.98      0.99      0.98       120
weighted avg       0.98      0.98      0.98       120



In [27]:
from sklearn.metrics import roc_auc_score

# ROC AUC is computed from scores rather than hard labels, so pass the
# predicted probability of class 1 (column 1 of predict_proba).
ras = roc_auc_score(
    y_true=ytest,
    y_score=grid.predict_proba(xtest)[:, 1],
)
print(ras)

1.0
