In [2]:
# Import numpy for numerical computations.
import numpy as np
# Import pandas for data manipulation and analysis.
import pandas as pd

# Import matplotlib for data visualization.
import matplotlib.pyplot as plt
# Import seaborn for enhanced data visualization and aesthetics.
import seaborn as sns
# Ensure plots are displayed inline in Jupyter Notebook.
%matplotlib inline

# Import KNeighborsClassifier for implementing the K-Nearest Neighbors algorithm.
from sklearn.neighbors import KNeighborsClassifier
# Import train_test_split for splitting the dataset into training and test sets.
from sklearn.model_selection import train_test_split
# Import SelectKBest and f_classif for feature selection using the F-test.
from sklearn.feature_selection import SelectKBest, f_classif
# Import StandardScaler for feature scaling (standardization).
from sklearn.preprocessing import StandardScaler
# Import accuracy_score to evaluate the accuracy of the model.
from sklearn.metrics import accuracy_score


# Read diabetes.csv into a DataFrame.
dataset = pd.read_csv('diabetes.csv')

# Target: the 'Outcome' column. Features: every remaining column.
y = dataset['Outcome']
X = dataset.drop(columns='Outcome')

# Hold out 20% of the rows for testing (80% for training).
# random_state=0 pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


# Sweep the number of F-test-selected features (1 to 8) and, for each
# feature subset, the KNN neighbor count (1 to 9); print the accuracy on the
# test split for every combination.
# NOTE(review): every score below is measured on X_test/y_test, so picking
# the feature count or K from this output makes later test accuracy
# optimistic -- consider scoring on a validation split of the training data.
for i in range(1, 9):
    # Fit the selector on the training split only, then keep the same
    # i columns in both splits.
    selector = SelectKBest(score_func=f_classif, k=i)
    selector.fit(X_train, y_train)
    X_train_new = selector.transform(X_train)
    X_test_new = selector.transform(X_test)

    # Show the positional indices of the columns that were kept.
    print(selector.get_support(indices=True))

    # Standardize using statistics learned from the training split only.
    scaler = StandardScaler().fit(X_train_new)
    X_train_new = scaler.transform(X_train_new)
    X_test_new = scaler.transform(X_test_new)

    # Try each neighbor count on the selected, standardized features.
    for j in range(1, 10):
        knn = KNeighborsClassifier(n_neighbors=j).fit(X_train_new, y_train)

        # Score the fitted model on the held-out rows.
        y_pred = knn.predict(X_test_new)
        print("K = " + str(j) + " Accuracy:", accuracy_score(y_test, y_pred))


# Final model: restrict the inputs to four named columns and predict 'Outcome'.
# NOTE(review): presumably these are the columns at indices 0, 1, 5, 7 reported
# by the feature-selection sweep -- confirm against the CSV column order.
feature_columns = ['Pregnancies', 'Glucose', 'BMI', 'Age']
X = dataset[feature_columns]
y = dataset['Outcome']

# Same 80/20 split and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Learn the scaling statistics from the training rows only, then apply them
# to both splits so no test-set information reaches the scaler.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit a 6-nearest-neighbors classifier on the standardized training rows.
knn = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)

# Predict the held-out rows and report the accuracy as a percentage.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred) * 100
print('Accuracy of our model is equal ' + str(round(accuracy, 2)) + ' %.')

[1]
K = 1 Accuracy: 0.6948051948051948
K = 2 Accuracy: 0.7077922077922078
K = 3 Accuracy: 0.7207792207792207
K = 4 Accuracy: 0.7727272727272727
K = 5 Accuracy: 0.7467532467532467
K = 6 Accuracy: 0.7402597402597403
K = 7 Accuracy: 0.7272727272727273
K = 8 Accuracy: 0.7402597402597403
K = 9 Accuracy: 0.7402597402597403
[1 5]
K = 1 Accuracy: 0.7337662337662337
K = 2 Accuracy: 0.7467532467532467
K = 3 Accuracy: 0.7467532467532467
K = 4 Accuracy: 0.7597402597402597
K = 5 Accuracy: 0.7532467532467533
K = 6 Accuracy: 0.7857142857142857
K = 7 Accuracy: 0.7402597402597403
K = 8 Accuracy: 0.7792207792207793
K = 9 Accuracy: 0.7402597402597403
[1 5 7]
K = 1 Accuracy: 0.7337662337662337
K = 2 Accuracy: 0.7272727272727273
K = 3 Accuracy: 0.7467532467532467
K = 4 Accuracy: 0.7662337662337663
K = 5 Accuracy: 0.8051948051948052
K = 6 Accuracy: 0.7922077922077922
K = 7 Accuracy: 0.8051948051948052
K = 8 Accuracy: 0.7792207792207793
K = 9 Accuracy: 0.7857142857142857
[0 1 5 7]
K = 1 Accuracy: 0.772727272