In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [2]:
data = pd.read_csv(r'C:\Users\91636\OneDrive\Desktop\6th sem\ML\ML lab\datasets\diabetes2.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148,72.0,35.0,0.0,33.6,0.627,50.0,1.0
1,1.0,85,66.0,29.0,0.0,26.6,0.351,31.0,0.0
2,8.0,183,64.0,0.0,0.0,23.3,0.672,32.0,1.0
3,1.0,89,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,,137,40.0,35.0,168.0,43.1,2.288,,1.0


In [3]:
data.shape

(768, 9)

In [4]:
imputer = SimpleImputer(strategy = 'median')
data_imputed = pd.DataFrame(imputer.fit_transform(data),columns=data.columns)
data_imputed.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,3.0,137.0,40.0,35.0,168.0,43.1,2.288,29.0,1.0


In [5]:
for column in ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']:
    median_value = data_imputed[column].median()
    data_imputed[column] = data_imputed[column].replace(0,median_value)

In [6]:
data_imputed.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,34.0,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.0,34.0,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,23.0,34.0,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,3.0,137.0,40.0,35.0,168.0,43.1,2.288,29.0,1.0


In [7]:
x = data_imputed.drop('Outcome',axis=1)
y = data_imputed['Outcome']

In [8]:
scaler = StandardScaler()
x_scale = scaler.fit_transform(x)

In [15]:
X_train,X_test,y_train,y_test = train_test_split(x_scale,y,test_size=0.2,random_state=42)
y_train=y_train.values

In [16]:
# Implement the KNN algorithm from scratch

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
def euclidean_distance(a, b):
    return np.sqrt(np.sum((a - b) ** 2))

class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        # Compute distances between x and all examples in the training set
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        # Sort by distance and return indices of the first k neighbors
        k_indices = np.argsort(distances)[:self.k]
        # Extract the labels of the k nearest neighbor training samples
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        # Return the most common class label
        most_common = np.bincount(k_nearest_labels).argmax()
        return most_common

# Initialize the KNN model with k=3
knn = KNN(k=3)
knn.fit(X_train, y_train)

# Predict on the test set
y_pred = knn.predict(X_test)

# Print the predictions
print("Predictions:", y_pred)
print("Actual:", y_test.values)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Confusion matrix
confusion = confusion_matrix(y_test, y_pred)

# Classification report
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", class_report)

# Function to find the best k
def find_best_k(X_train, y_train, X_test, y_test, max_k=20):
    accuracies = []
    for k in range(1, max_k + 1):
        knn = KNN(k=k)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append((k, accuracy))
        print(f"k={k}, Accuracy={accuracy}")
    return accuracies

# Find the best k value
accuracies = find_best_k(X_train, y_train, X_test, y_test)
best_k = max(accuracies, key=lambda x: x[1])[0]
print(f"Best k value: {best_k}")

# Using Different Distance Metrics (Manhattan Distance)
def manhattan_distance(a, b):
    return np.sum(np.abs(a - b))

class KNN_M:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        distances = [manhattan_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        most_common = np.bincount(k_nearest_labels).argmax()
        return most_common

# Initialize the KNN model with Manhattan distance and best k
knn_m = KNN_M(k=best_k)
knn_m.fit(X_train, y_train)

# Predict on the test set
y_pred_m = knn_m.predict(X_test)

# Evaluate the model
accuracy_m = accuracy_score(y_test, y_pred_m)
confusion_m = confusion_matrix(y_test, y_pred_m)
class_report_m = classification_report(y_test, y_pred_m)

print("Accuracy with Manhattan Distance:", accuracy_m)
print("Confusion Matrix with Manhattan Distance:\n", confusion_m)
print("Classification Report with Manhattan Distance:\n", class_report_m)

Predictions: [0 0 0 1 1 0 0 1 0 0 0 1 0 1 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 1 1 1 1 1 1 1
 1 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 0 0 1 0 0 0
 0 1 0 0 1 0 1 0 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0
 0 1 1 0 0 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 1 1]
Actual: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0.
 0. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0.
 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 1. 0. 0. 1. 0.]
Accuracy: 0.6818181818181818
Confusion Matrix:
 [[74 25]
 [24 31]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.76      0.75      0.75      