# K-nearest Neighbour (KNN)

KNN is a `supervised machine learning` algorithm that can be used to solve both `classification and regression` problems. 

It is a `non-parametric`, lazy learning algorithm. `Non-parametric means that it does not make any assumptions about the underlying data distribution`. Lazy learning means that it does not build a model from the training data ahead of time; instead, all of the training data is stored and used in the testing (prediction) phase. This makes training faster but makes the testing phase slower and costlier.

At the training phase, the KNN algorithm just stores the dataset; when it receives new data, it classifies that data into the category that is most similar to the new data.

## Classification using KNN

In [None]:
# Example of a KNN classifier on the iris dataset (loaded via seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# load the 'iris' dataset through seaborn's dataset loader
df = sns.load_dataset('iris')
# preview the first rows
df.head()

In [None]:
# list the distinct values of the target column
df['species'].unique()

In [None]:
# print a summary of the DataFrame (columns, dtypes, non-null counts)
df.info()

In [None]:
# separate the target column ('species') from the feature columns
y = df['species']
X = df.drop(columns='species')

In [None]:
# train test split the data, fit a KNN classifier and evaluate it
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# stratify=y keeps the class proportions of `y` the same in the train and
# test splits, so every species is represented in both.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

model = KNeighborsClassifier(n_neighbors=11)
# fit the model on the training data
model.fit(X_train, y_train)
# predict the species for the test data
y_pred = model.predict(X_test)
print("Accuracy Score: ", model.score(X_test, y_test))

# evaluate the model
# Take the label order from the fitted model and pass it explicitly, so the
# matrix rows/columns are guaranteed to line up with the names used below
# (instead of relying on a hard-coded list matching the implicit order).
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

print(cm)
print(classification_report(y_test, y_pred))

# create df from confusion matrix result (rows = actual, columns = predicted)
confusion_df = pd.DataFrame(cm, columns=class_labels, index=class_labels)

# plot the confusion matrix
sns.heatmap(data=confusion_df, annot=True, cmap='Blues')
plt.xlabel('------------ Predicted ------------')
plt.ylabel('------------ Actual ------------')
plt.show()
                                                    

## Tuning 'n_neighbors' of KNeighborsClassifier

In [None]:
# Candidate values of k: 1..11 (np.arange excludes the stop value).
neighbors = np.arange(1, 12)
train_accuracies = {}
test_accuracies = {}

# NOTE: choosing k from the test-set accuracy uses the test data for model
# selection; for an unbiased choice, tune k with cross-validation on the
# training data and keep the test set for the final evaluation only.
for neighbor in neighbors:
    k = int(neighbor)  # plain int keys print cleanly (no numpy scalar repr)
    knn = KNeighborsClassifier(n_neighbors=k)  # Set up a KNN Classifier
    knn.fit(X_train, y_train)  # Fit the model
    train_accuracies[k] = knn.score(X_train, y_train)  # Accuracy on the training dataset
    test_accuracies[k] = knn.score(X_test, y_test)  # Accuracy on the testing dataset
print(neighbors, '\n', train_accuracies, '\n', test_accuracies)

print('-' * 100)

# Plot a line plot of train and test accuracies.
# The dict views are materialised as lists so matplotlib receives plain
# sequences whose length matches `neighbors`.
plt.title("KNN: Varying Number of Neighbors")
plt.plot(neighbors, list(train_accuracies.values()), label="Training Accuracy")  # Plot training accuracies
plt.plot(neighbors, list(test_accuracies.values()), label="Testing Accuracy")  # Plot test accuracies
plt.legend()
plt.xlabel("Number of Neighbors")
plt.ylabel("Accuracy")
plt.show()  # Display the plot

## Regression using KNN

## Tune neighbors

In [None]:
# Regression problem on the tips dataset
# load the 'tips' dataset through seaborn's dataset loader
tips = sns.load_dataset('tips')
# preview the first rows
tips.head()

In [None]:
# separate the regression target ('tip') from the feature columns
y = tips['tip']
X = tips.drop(columns='tip')

In [None]:
# print a summary of the DataFrame (columns, dtypes, non-null counts)
tips.info()

In [None]:
# encode the categorical columns using a for loop and LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Keep one fitted encoder per column so each mapping can be inspected
# (`encoders[col].classes_`), inverted, or re-applied to new data. Re-fitting
# a single shared encoder would only remember the last column's mapping.
# NOTE(review): LabelEncoder assigns integer codes by sorted label order, and
# KNN distances will treat those codes as ordered magnitudes - consider
# one-hot encoding for nominal columns.
encoders = {}
for col in X.select_dtypes(include=['object', 'category']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoders[col] = le

In [None]:
# train test split the data and run the model
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# metric='minkowski' with p=2 is the Euclidean distance.
# NOTE(review): the features are not scaled here, so columns with larger
# numeric ranges dominate the distance - consider standardising them.
model = KNeighborsRegressor(n_neighbors=5, metric='minkowski', p=2)

# fit the model on the training data
model.fit(X_train, y_train)

# predict the tip for the test data
y_pred = model.predict(X_test)

# evaluate the model
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred)}")
print(f"R2 Score: {r2_score(y_test, y_pred)}")
# use the dedicated metric that is already imported instead of re-deriving it
print(f"RMSE: {root_mean_squared_error(y_test, y_pred)}")

In [None]:
# preview the first rows of the test features (shows the column order and value format the model expects)
X_test.head()

In [None]:
# predict a specific value
# The model was fitted on a DataFrame, so the sample is wrapped in a DataFrame
# with the same column names; passing a bare list makes scikit-learn warn that
# X has no valid feature names. Values are given in the column order of
# X_train and must already be label-encoded.
sample = pd.DataFrame([[45, 1, 0, 1, 1, 3]], columns=X_train.columns)
model.predict(sample)

---