# Intro

Name: Kawshik Shankar Ghosh

ID: M230205001

Batch: MSc 14th

In [None]:
!pip install scipy



# Problem

1. Dataset Loading:
  * Load the Pima Indians Diabetes dataset from google classroom.
2. Data Preprocessing:
  * Split the data into training and test sets (80% training, 20% testing).
  * Standardize the features to have zero mean and unit variance.
3. Feature Selection (Optional):
  * Optionally, apply feature selection techniques such as chi-square/SelectKBest to retain the top features.
4. Hyperparameter Tuning:
  * For KNN, tune the number of neighbors (n_neighbors) and the distance metric (metric).
  * For Naive Bayes, tune the smoothing parameter (alpha).
5. K-Fold Cross-Validation:
  * Use Stratified K-Fold Cross-Validation with 5 folds to evaluate model performance.
6. Training and Testing:
  * Train the selected classifier (Naive Bayes or KNN) using the best hyperparameters identified.
  * Evaluate the model on the test set and report the accuracy.
7. Model Evaluation:
  * Print the cross-validation scores and the mean accuracy.
  * Print the test accuracy of the final model.


# Solution

1. Load csv

In [1]:
# load all library

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
db=pd.read_csv('/content/diabetes.csv')
db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [3]:
db.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


-------------------------------------------


2. Preprocess

In [4]:
# check null value

db.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [5]:
db.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
# Split data
from sklearn.model_selection import train_test_split

X = db.drop('Outcome', axis=1)
y = db['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Standardize data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# Normalization
from scipy.stats import zscore

scaler = zscore(db)
_normdata = pd.DataFrame(scaler)

_normdata[:5]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.639947,0.848324,0.149641,0.90727,-0.692891,0.204013,0.468492,1.425995,1.365896
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672,-0.73212
2,1.23388,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584,1.365896
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549,-0.73212
4,-1.141852,0.504055,-1.504687,0.90727,0.765836,1.409746,5.484909,-0.020496,1.365896


---------------------------------------------------


3. KNN Classifier & Accuracy

In [18]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(n_neighbors=95)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {100*_accuracy:.2f}%")
evaluation_analysis(y_test, y_pred)

Accuracy: 75.97%
accuracy:  0.7597402597402597
f1 score macro:  0.7036459145992614
f1 score micro:  0.7597402597402597
precision score:  0.7597402597402597
recall score:  0.7597402597402597


In [10]:
from sklearn.model_selection import cross_val_score
import numpy as np

knn = KNeighborsClassifier(n_neighbors=95)

# Using K-Fold Cross Validation
scores = cross_val_score(knn, X, y, cv=5)
print(f"Cross Validation Scores: {scores}")
print(f"Mean CV Score: {np.mean(scores)}")

Cross Validation Scores: [0.72727273 0.7012987  0.73376623 0.71895425 0.70588235]
Mean CV Score: 0.7174348527289705


------------------------------------

4. Hyperparameter Tuning

In [24]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB

X =db.iloc[:, :-1]
#X=X.values
y=db['Outcome']
#y=y.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

knn = KNeighborsClassifier()
nb=MultinomialNB()

param_grid = {
    'n_neighbors': [3,4,5,6,7,8]
} # for KNN

from sklearn.metrics.pairwise import pairwise_distances

distance_matrix = pairwise_distances(db, metric='minkowski', p=3)

print(distance_matrix)

param_nb={'alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]} # for Naive Bayes

grid_search = GridSearchCV(estimator=nb, param_grid=param_nb, cv=5, n_jobs=1)

grid_search.fit(X_train, y_train)
print("Grid Search Best Parameters:", grid_search.best_params_)
print("Grid Search Best Score:", grid_search.best_score_)

[[  0.          63.64509126  45.33814847 ... 112.78682444  38.13201922
   57.11220315]
 [ 63.64509126   0.          98.8525496  ... 113.23924213  46.05190659
   10.47954308]
 [ 45.33814847  98.8525496    0.         ... 118.30952604  57.41721831
   91.27557875]
 ...
 [112.78682444 113.23924213 118.30952604 ...   0.         112.50413997
  112.60664359]
 [ 38.13201922  46.05190659  57.41721831 ... 112.50413997   0.
   43.18775427]
 [ 57.11220315  10.47954308  91.27557875 ... 112.60664359  43.18775427
    0.        ]]
Grid Search Best Parameters: {'alpha': 1000}
Grid Search Best Score: 0.6288151406104225


------------------------------

 5. K-Fold Cross Validation

In [25]:
from sklearn.model_selection import cross_val_score
import numpy as np

knn = KNeighborsClassifier(n_neighbors=8)

# Using K-Fold Cross Validation
scores = cross_val_score(knn, X, y, cv=5)
print(f"Cross Validation Scores: {scores}")
print(f"Mean CV Score: {np.mean(scores)}")

Cross Validation Scores: [0.74025974 0.72077922 0.75974026 0.79084967 0.73856209]
Mean CV Score: 0.7500381970970207


-------------------------------------------------


6. Training & Testing

In [34]:
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score

X =db.iloc[:, :-1]
X=X.values
y=db['Outcome']
y=y.values

# create loocv procedure
cv = LeaveOneOut()

# enumerate splits
y_true, y_pred = list(), list()

for train_ix, test_ix in cv.split(X):
 # split data
 X_train, X_test = X[train_ix, :], X[test_ix, :]
 y_train, y_test = y[train_ix], y[test_ix]

 # fit model
 knn.fit(X_train, y_train)

 # evaluate model
 yhat = knn.predict(X_test)

 # store
 y_true.append(y_test[0])
 y_pred.append(yhat[0])

# accuracy
acc = accuracy_score(y_true, y_pred)
print(f"Accuracy: {100*acc:.2f}%")

Accuracy: 73.31%


In [35]:
# Avg accuracy
from sklearn.model_selection import StratifiedKFold

X =db.iloc[:, :-1]
X=X.values
y=db['Outcome']
y=y.values


knn = KNeighborsClassifier(n_neighbors=8)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = []

# Split and train model
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    #print(train_index)
    y_train, y_test = y[train_index], y[test_index]
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))

average_score = np.mean(scores)
print(f"Average Accuracy: {100*average_score:.4f}%")

Average Accuracy: 73.6932%


----------------------------------


7. Model Evaluation

In [36]:
from sklearn.model_selection import cross_val_score
import numpy as np

knn = KNeighborsClassifier(n_neighbors=8)

# Using K-Fold Cross Validation
scores = cross_val_score(knn, X, y, cv=5)
print(f"Cross Validation Scores: {scores}")
print(f"Mean CV Score: {np.mean(scores)}")

Cross Validation Scores: [0.74025974 0.72077922 0.75974026 0.79084967 0.73856209]
Mean CV Score: 0.7500381970970207
