# Build the KNN model 

Now that the data is prepared, we can build the model for predictions and evaluations. <br>
I will be using the K nearest neighbours model to find organisations that are similar to each other. 

## Model 2: Customer behaviour

This model will run KNN on synthetic customer behaviour data.

In [None]:
# Import libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib.colors import ListedColormap
from mlxtend.plotting import plot_decision_regions
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sqlalchemy import create_engine

%matplotlib inline

In [None]:
# Connect to the SQLite database produced by the data-preparation step and
# read the two tables this notebook works with.
engine = create_engine('sqlite:///../data/orgs_customer_behaviours.db')

# Feature table; used as the KNN input matrix further down.
df_normalised_features = pd.read_sql_table(
    table_name='synth_customer_behaviour_data',
    con=engine,
)
# Table that supplies the 'CONVERTED' target column.
df_features = pd.read_sql_table(
    table_name='synth_customer_target_classifier_data',
    con=engine,
)

In [None]:
# Features are the whole behaviour table; the label is the CONVERTED column.
X, y = df_normalised_features, df_features['CONVERTED']

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline model with a single neighbour, just to prove the pipeline works.
num_neighbors = 1
knn = KNeighborsClassifier(n_neighbors=num_neighbors).fit(X_train, y_train)
pred = knn.predict(X_test)

In [None]:
# Summarise the current predictions against the held-out labels.
print(f'WITH K={num_neighbors}\n')
print('\nCONFUSION MATRIX:', confusion_matrix(y_test, pred), sep='\n')
print('\nCLASSIFICATION REPORT:', classification_report(y_test, pred), sep='\n')

#### Improve the model with a better K value
The initial K value was arbitrary to prove the functioning of the model. <br>
Let's try to improve accuracy by finding a better K value for KNN.

In [None]:
# Sweep K over 1..39 and record, for each K, the fraction of test rows the
# fitted classifier gets wrong.
# NOTE(review): K is scored against the test split here, so the later test
# metrics for the chosen K are optimistic -- consider a validation split or
# cross-validation for the selection step.
error_rate = []

for k in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    misclassified = knn.predict(X_test) != y_test
    error_rate.append(np.mean(misclassified))

In [None]:
# Plot the error rate recorded for each K so a good value can be read off.
k_values = range(1, 40)  # same K range the error-rate sweep iterated over
line_style = dict(
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)

plt.figure(figsize=(20, 10))
plt.plot(k_values, error_rate, **line_style)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

In [None]:
# Refit with a larger K and predict on the held-out rows.
# NOTE(review): the error-rate sweep stops at K=39, so K=40 was not part of
# it -- confirm this value is the intended pick.
num_neighbors = 40
knn = KNeighborsClassifier(n_neighbors=num_neighbors).fit(X_train, y_train)
pred = knn.predict(X_test)

In [None]:
# Print the same two evaluation summaries for the refitted model.
print(f'WITH K={num_neighbors}\n')
for heading, metric in (
    ('\nCONFUSION MATRIX:', confusion_matrix),
    ('\nCLASSIFICATION REPORT:', classification_report),
):
    print(heading)
    print(metric(y_test, pred))

In [None]:
# Exhaustive search over neighbour count, vote weighting and distance metric,
# scored with 3-fold cross-validation on the training split only.
grid_parameters = {
    'n_neighbors': [5, 10, 18, 20, 25, 35, 40],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}

gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid_parameters,
    verbose=1,
    cv=3,
    n_jobs=-1,  # use every available core
)

# fit() returns the fitted search object itself, so gs_results is gs.
gs_results = gs.fit(X_train, y_train)

print(f'\n\nBest scores found are {gs_results.best_score_}\n\n')
print(f'Best results are found using estimator {gs_results.best_estimator_}\n\n')

In [None]:
# Refit with fixed hyperparameters and evaluate on the held-out rows.
# NOTE(review): these values are hard-coded -- presumably copied from the
# grid-search output; re-check them whenever the data or the grid changes.
num_neighbors = 10
knn = KNeighborsClassifier(
    n_neighbors=num_neighbors,
    weights='distance',
    metric='manhattan',
).fit(X_train, y_train)
pred = knn.predict(X_test)

print(f'WITH K={num_neighbors}\n')
print('\nCONFUSION MATRIX:')
print(confusion_matrix(y_true=y_test, y_pred=pred))
print('\nCLASSIFICATION REPORT:')
print(classification_report(y_true=y_test, y_pred=pred))

### PCA

Let's reduce the dimensions to see if we can improve the outcomes and visualise the model for analysis.

In [None]:
# Project the features onto their first two principal components, refit the
# existing `knn` classifier on that 2-D representation and evaluate it.
pca = PCA(n_components=2)

# Learn the projection from the training rows only ...
X_train_n2 = pca.fit_transform(X_train)
# ... and apply that same fitted projection to the held-out rows, so both
# sets live in the same 2-D coordinate system.
X_test_n2 = pca.transform(X_test)

knn.fit(X_train_n2, y_train)
# Predict with the refitted model. Without this step the report below would
# print the stale predictions of the earlier full-dimensional model.
pred = knn.predict(X_test_n2)

print(f'WITH K={num_neighbors}\n')
print('\nCONFUSION MATRIX:')
print(confusion_matrix(y_test, pred))
print('\nCLASSIFICATION REPORT:')
print(classification_report(y_test, pred))

In [None]:
# Draw the decision regions of the classifier over the two PCA components,
# overlaid with the training points.
# The labels are converted to a NumPy array before being handed to mlxtend.
y_npa = y_train.to_numpy()

plt.rcParams['figure.figsize'] = [25, 20]
plt.xlabel('X')
plt.ylabel('Y')
plt.title(f'Knn with K={num_neighbors}')
plot_decision_regions(
    X_train_n2,
    y_npa,
    clf=knn,
    legend=2,
    colors='blue,grey,darkblue,darkgrey',
    markers='^s',
)
plt.show()