# K-Nearest Neighbors

* K-nearest neighbors (K-NN) predicts the classification of new data sample(s) depending on the proximity of the new data sample(s) to the existing data

* K represents the number of neighbors considered to determine the label for the new data sample

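* The predicted label is based on a majority vote among the K neighbors. For binary classification, an odd K guarantees there are no ties; with more than two classes (as in the example below), ties can still occur even when K is odd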


## Example: Synthetic Dataset

<font color='Blue'><b>Example</b></font>: The synthetic dataset is generated using the `make_blobs` function from scikit-learn, which creates artificial clustered datasets for machine learning experiments. This particular dataset has the following characteristics:

- **Number of Samples:** 2000
- **Number of Features:** 2
- **Number of Classes:** 4
- **Random Seed (random_state):** 0
- **Cluster Standard Deviation (cluster_std):** 1.0

**Features:**
- The dataset contains 2000 data points, each described by a pair of feature values. These features are represented as 'Feature 1' and 'Feature 2'.

**Outcome (Target Variable):**
- The dataset also includes a target variable called 'Outcome'. This variable assigns each data point to one of four distinct classes, identified as 'Class 0', 'Class 1', 'Class 2', and 'Class 3'.

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Draw 2000 points around 4 blob centres (unit cluster spread, fixed seed)
X, y = make_blobs(n_samples=2000, centers=4, cluster_std=1.0, random_state=0)

# Tabular view of the same data: the two feature columns plus the class label
Data = pd.DataFrame(X, columns=['Feature 1', 'Feature 2']).assign(Outcome=y)
display(Data)

plt.style.use('https://raw.githubusercontent.com/HatefDastour/ENSF444/main/Files/mystyle.mplstyle')

# Per-class styling; each list is indexed by the integer class label
colors = ["#f5645a", "#b781ea", '#B2FF66', '#0096ff']
edge_colors = ['#8A0002', '#3C1F8B', '#6A993D', '#2e658c']
markers = ['o', 's', 'd', '^']
cmap_ = ListedColormap(colors)

# Plain matplotlib scatter, one call per class so that every class gets its
# own marker and legend entry
fig, ax = plt.subplots(1, 1, figsize=(9.5, 7))
for num in np.unique(y):
    members = y == num
    ax.scatter(X[members, 0], X[members, 1],
               c=colors[num], edgecolors=edge_colors[num],
               marker=markers[num], s=40, label=str(num))

ax.set(xlim=[-6, 6], ylim=[-2, 12])
ax.legend()
ax.set_title('Synthetic Dataset', weight='bold', fontsize=16)
plt.tight_layout()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both partitions; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

In [None]:
# Stratified 75/25 train/test partition with a fixed seed
from sklearn.model_selection import train_test_split
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.25, stratify=y, random_state=0)

import pandas as pd
import matplotlib.pyplot as plt

def _dist_plot(ax, y, CM=plt.cm.tab20c.colors, title=False):
    """
    Generate a pie chart illustrating the distribution of categories.

    Parameters:
    - ax: Axes object to plot on.
    - y: Category labels to count; passed to ``pd.Series``.
    - CM: Sequence of colors used for the wedges.
    - title: Title for the plot. Set to False to omit.

    Returns:
    None
    """
    # One row per distinct category. value_counts orders the rows from most to
    # least frequent, so the last row is the rarest category.
    df = pd.Series(y).value_counts().to_frame('Count')

    # Offset only the last wedge. The list is sized from the data because
    # ax.pie needs exactly one explode entry per wedge; a fixed 4-item list
    # would fail for any other number of categories.
    explode = [0] * len(df)
    if explode:
        explode[-1] = 0.1

    # Create the pie chart (autopct adds the percentage texts, which is why
    # three values are returned)
    wedges, texts, autotexts = ax.pie(df['Count'],
                                      autopct='%1.1f%%', startangle=140,
                                      colors=CM,
                                      explode=explode,
                                      shadow=True, wedgeprops={'edgecolor': 'whitesmoke'})
    # Set title and ensure equal aspect ratio for a circular pie chart
    if title:
        _ = ax.set_title(title, fontsize=16, weight='bold')
    _ = ax.axis('equal')

    # Make both the wedge labels and the percentage annotations bold
    for text, autotext in zip(texts, autotexts):
        text.set_fontsize(12)
        text.set_fontweight('bold')
        autotext.set_fontsize(12)
        autotext.set_fontweight('bold')


# Side-by-side pies comparing the class balance of the two partitions
fig, ax = plt.subplots(1, 2, figsize=(9, 4.5))
panels = (
    (ax[0], y_train, f'Train Set (Size = {len(X_train)})', 'Green'),
    (ax[1], y_test, f'Test Set (Size = {len(X_test)})', 'Blue'),
)
for panel_ax, panel_labels, heading, heading_color in panels:
    _dist_plot(panel_ax, panel_labels, CM=plt.cm.Set3.colors)
    _ = panel_ax.set_title(heading, fontsize=12, weight='bold', color=heading_color)
_ = fig.suptitle('Distribution of Categories', fontsize=16, weight='bold')

# Tighten the spacing between the panels and the figure title
plt.tight_layout()

## Example 1: n_neighbors = 5

In [None]:
# Import the necessary class from scikit-learn
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# Build a 5-neighbor KNN classifier and train it on the training partition.
# fit() returns the estimator itself, so KKN is the fitted model.
KKN = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

def _gen_cr(model, X, y):
    """
    Display the classification report of ``model`` on ``(X, y)`` as a table.

    Parameters:
    - model: Fitted estimator exposing ``predict``.
    - X: Feature matrix to predict on.
    - y: True labels corresponding to ``X``.

    Returns:
    None
    """
    report = classification_report(y, model.predict(X), output_dict=True)
    # Transpose so that each class / average is a row and each metric a column
    table = pd.DataFrame(report).T
    display(table.style.format(precision=3))

# Show the per-class metrics for each partition, training data first
for header, part_X, part_y in (('\nTrain Data:', X_train, y_train),
                               ('\nTest Data:', X_test, y_test)):
    print(header)
    _gen_cr(KKN, part_X, part_y)

from matplotlib.colors import ListedColormap
from sklearn.inspection import DecisionBoundaryDisplay

# One panel per partition: decision regions of the fitted classifier in the
# background, with that partition's own samples drawn on top
fig, ax = plt.subplots(1, 2, figsize=(9.5, 5.5), sharey=False)

for panel, (X_set, y_set, title) in zip(ax, [(X_train, y_train, 'Train Set'),
                                             (X_test, y_test, 'Test Set')]):
    # Colored background showing the class predicted at every grid point
    DecisionBoundaryDisplay.from_estimator(KKN, X_set, cmap=cmap_, ax=panel,
                                           alpha=0.3, eps=2,
                                           grid_resolution=300,
                                           response_method="predict",
                                           plot_method="pcolormesh",
                                           xlabel='Feature 1', ylabel='Feature 2',
                                           shading="auto")
    panel.set(xlim=[-6, 6], ylim=[-3, 12])
    panel.set_aspect('equal')

    # Plot only the samples of this partition (X_set / y_set), so the train
    # and test panels actually differ; one scatter call per class
    for num in np.unique(y_set):
        in_class = y_set == num
        panel.scatter(X_set[in_class, 0], X_set[in_class, 1], c=colors[num],
                      s=40, edgecolors=edge_colors[num], marker=markers[num],
                      label=str(num))

    panel.legend(title="Outcome")
    # Read k from the estimator so the title cannot drift from the model
    panel.set_title(f'{title} - KNN\n(neighbors = {KKN.n_neighbors})',
                    fontweight='bold', fontsize=14)
    panel.grid(False)

plt.tight_layout()


import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def plot_cm(model, X_train, X_test, y_train, y_test, class_names, figsize=(7, 4), normalize = False):
    """
    Plot the train and test confusion matrices of ``model`` side by side.

    Parameters:
    - model: Fitted classifier exposing ``predict``.
    - X_train, X_test: Feature matrices of the two partitions.
    - y_train, y_test: True labels of the two partitions.
    - class_names: Tick labels for the matrix rows/columns.
    - figsize: Size of the whole figure.
    - normalize: If True, divide every row by its total so each row shows the
      fraction of that true class assigned to each predicted class (rounded
      to 2 decimals). If False, show raw counts.

    Returns:
    None
    """
    # Create a figure and axes for displaying confusion matrices side by side
    fig, ax = plt.subplots(1, 2, figsize=figsize)

    # The figure title is the same for both panels, so set it once
    fig.suptitle('Confusion Matrices (Normalized)' if normalize else 'Confusion Matrices',
                 fontsize=16, weight='bold')

    datasets = [(X_train, y_train, 'Train', 'Greens'), (X_test, y_test, 'Test', 'Blues')]

    for axis, (X_part, y_part, dataset_name, cmap) in zip(ax, datasets):
        # Compute confusion matrix for the dataset predictions
        cm = confusion_matrix(y_part, model.predict(X_part))
        if normalize:
            # keepdims=True yields a column of row totals, so each ROW is
            # divided by its own total; a flat vector would broadcast across
            # columns and divide column j by the total of row j instead.
            row_totals = cm.sum(axis=1, keepdims=True)
            # A row can be all zeros when a class is predicted but never
            # present in y_part; divide by 1 there to avoid 0/0.
            cm = np.round(cm / np.where(row_totals == 0, 1, row_totals), 2)

        # Create a ConfusionMatrixDisplay and plot it on the respective axis
        ConfusionMatrixDisplay(cm, display_labels=class_names)\
            .plot(ax=axis,
                  im_kw=dict(cmap=cmap),
                  text_kw={"size": 16}, colorbar=False)
        axis.set_title(f'{dataset_name} Data')
        axis.grid(False)

    # Adjust the layout for better spacing
    plt.tight_layout()

# Confusion matrices for both partitions: raw counts first, then normalized
for use_normalized in (False, True):
    plot_cm(KKN, X_train, X_test, y_train, y_test,
            np.unique(y).astype('str'), figsize=(8, 4), normalize=use_normalized)


# Accuracy from the estimator's own score() method, train first and then test
for split_name, part_X, part_y in (('Train', X_train, y_train),
                                   ('Test', X_test, y_test)):
    KKN_accuracy = KKN.score(part_X, part_y)
    print(f"KKN Classifier Accuracy ({split_name}): {KKN_accuracy:.4f}")

## Example 2

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Relies on X_train, y_train, X_test, y_test from the earlier split

# Candidate neighborhood sizes to compare
n_neighbors_values = [1, 3, 5, 7, 10, 15, 20, 30, 50]

# Accuracy per candidate, in the same order as n_neighbors_values
train_accuracy_scores, test_accuracy_scores = [], []

for n_neighbors in n_neighbors_values:
    # fit() returns the estimator, so KKN is the model trained with this k
    KKN = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)

    # Score the same model on the data it was trained on and on held-out data
    train_accuracy = KKN.score(X_train, y_train)
    test_accuracy = KKN.score(X_test, y_test)
    train_accuracy_scores.append(train_accuracy)
    test_accuracy_scores.append(test_accuracy)

# One curve per partition on a shared axis
fig, ax = plt.subplots(figsize=(8, 5))
for scores, series_label, line_color in ((train_accuracy_scores, 'Train Accuracy', 'green'),
                                         (test_accuracy_scores, 'Test Accuracy', 'blue')):
    ax.plot(n_neighbors_values, scores, label=series_label, marker='o', color=line_color)

ax.set(title='Accuracy Scores for Different Values of n_neighbors',
       xlabel='n_neighbors',
       ylabel='Accuracy Score')
ax.legend()
ax.grid(True)
plt.show()

<font color='Blue'><b>Example</b></font>. This example uses the Auto MPG dataset retrieved from the [UCI Machine Learning Repository](http://archive.ics.uci.edu/dataset/9/auto+mpg).

In [None]:
try:
    from ucimlrepo import fetch_ucirepo
except ImportError:
    !pip3 install -U ucimlrepo
    from ucimlrepo import fetch_ucirepo
import numpy as np

# Retrieve the Auto MPG dataset
auto_mpg = fetch_ucirepo(name='Auto MPG')

# Features (pandas DataFrame) with every incomplete row removed
X = auto_mpg.data.features.dropna(axis=0, how='any')
# Targets (pandas DataFrame), still containing all rows at this point
y = auto_mpg.data.targets

# Keep only the rows whose index survives in both frames
X, y = X.align(y, join='inner', axis=0)

# Regression target: natural log of mpg, renamed to match
y = np.log(y['mpg']).rename('ln(mpg)')
# Single predictor: horsepower as an (n_samples, 1) column array
X = X['horsepower'].to_numpy().reshape(-1, 1)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 75/25 random split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.25)

# Single-column table of partition sizes, displayed transposed (one row, two columns)
set_size_df = pd.DataFrame([len(X_train), len(X_test)],
                           index=['Train', 'Test'], columns=['Size'])
display(set_size_df.T)

In [None]:
from sklearn import neighbors
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

# Evenly spaced grid over the observed feature range; predicting on it traces
# the fitted curve of each model
T = np.linspace(X.min(), X.max(), 200).reshape(-1, 1)

# 2x2 grid of panels, flattened so it can be paired with the k values below
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 6))
axes = axes.ravel()

# Define common style parameters
scatter_params = {'s': 40, 'ec': 'k', 'lw': 0.5, 'alpha': 0.5}
plot_params = {'color': 'navy', 'label': 'Predicted ln(MPG)', 'lw': 2}

# One panel per neighborhood size; the loop variable is the only source of
# n_neighbors in this cell
for ax, n_neighbors in zip(axes, [3, 5, 10, 15]):
    # Create and fit a KNeighborsRegressor with equal weight for every neighbor
    knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights='uniform')
    _ = knn.fit(X_train, y_train)

    # Predictions along the grid (the curve to draw)
    y_ = knn.predict(T)

    # Observed samples of both partitions
    ax.scatter(X_train, y_train, label='Train Data', c='#6aa84f', **scatter_params)
    ax.scatter(X_test, y_test, label='Test Data', c='#f44336', **scatter_params)

    # Line plot for predictions
    ax.plot(T, y_, **plot_params)

    # Calculate Mean Squared Error (MSE) on each partition
    mse_train = mean_squared_error(y_train, knn.predict(X_train))
    mse_test = mean_squared_error(y_test, knn.predict(X_test))

    # Set common style parameters
    ax.axis("tight")
    ax.legend(fontsize=14)
    ax.set_title(f"KNeighborsRegressor (n_neighbors = {n_neighbors})", weight='bold')
    ax.set(xlim=[25, 250], ylim=[2, 4])

    # Display MSE at the bottom left of each plot (axes-relative coordinates)
    ax.text(0.02, 0.08, f'MSE (Train): {mse_train:.4f}\nMSE (Test): {mse_test:.4f}',
            transform=ax.transAxes, fontsize=12, weight='bold',
            bbox=dict(facecolor='Whitesmoke', alpha=0.7))

# Adjust layout
plt.tight_layout()

In [None]:
from sklearn import neighbors
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

# Neighborhood size is fixed here; only the weighting scheme changes per panel
n_neighbors = 10

# Two stacked panels, one per weighting scheme
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(9.5, 6))

for i, weights in enumerate(["uniform", "distance"]):
    ax = axes[i]

    # Fit a regressor using the current weighting scheme
    knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights)
    _ = knn.fit(X_train, y_train)

    # Predictions on T, the evaluation grid that must already be defined
    y_ = knn.predict(T)

    # Observed samples of both partitions
    ax.scatter(X_train, y_train, label='Train Data', c='#6aa84f', **scatter_params)
    ax.scatter(X_test, y_test, label='Test Data', c='#f44336', **scatter_params)

    # Fitted curve
    ax.plot(T, y_, **plot_params)

    # Mean squared error on each partition
    mse_train = mean_squared_error(y_train, knn.predict(X_train))
    mse_test = mean_squared_error(y_test, knn.predict(X_test))

    # Shared panel styling
    ax.axis("tight")
    ax.legend(fontsize=14)
    ax.set_title(f"KNeighborsRegressor (n_neighbors = {n_neighbors} and weights = '{weights}')", weight='bold')
    ax.set(xlim=[25, 250], ylim=[2, 4])

    # MSE box anchored near the bottom-left corner (axes-relative coordinates)
    mse_box = {'facecolor': 'Whitesmoke', 'alpha': 0.7}
    ax.text(0.02, 0.08, f'MSE (Train): {mse_train:.4f}\nMSE (Test): {mse_test:.4f}',
            transform=ax.transAxes, fontsize=12, weight='bold',
            bbox=mse_box)

# Adjust layout
plt.tight_layout()