### K Nearest Neighbors with Python

### Import Libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

### Get the Data

Set index_col=0 to use the first column as the index.

In [2]:
# Load the dataset; index_col=0 makes the CSV's first column the row index
# instead of a regular feature column.
df = pd.read_csv('ClassifiedData.csv', index_col=0)

In [3]:
df.head()

Unnamed: 0,WTT,PTI,EQW,SBI,LQE,QWG,FDJ,PJF,HQE,NXJ,TARGET CLASS
0,0.913917,1.162073,0.567946,0.755464,0.780862,0.352608,0.759697,0.643798,0.879422,1.231409,1
1,0.635632,1.003722,0.535342,0.825645,0.924109,0.64845,0.675334,1.013546,0.621552,1.492702,0
2,0.72136,1.201493,0.92199,0.855595,1.526629,0.720781,1.626351,1.154483,0.957877,1.285597,0
3,1.234204,1.386726,0.653046,0.825624,1.142504,0.875128,1.409708,1.380003,1.522692,1.153093,1
4,1.279491,0.94975,0.62728,0.668976,1.232537,0.703727,1.115596,0.646691,1.463812,1.419167,1


### Standardize the Variables

Because the KNN classifier predicts the class of a given test observation by identifying the observations that are nearest to it, the scale of the variables matters. Any variables that are on a large scale will have a much larger effect on the distance between the observations, and hence on the KNN classifier, than variables that are on a small scale.



In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
# Scaler with default settings; it is fitted on the feature columns in the next cell.
scaler = StandardScaler()

In [20]:
# Learn the scaling statistics from the feature columns only
# (the 'TARGET CLASS' label is excluded so it is never scaled).
scaler.fit(df.drop(columns='TARGET CLASS'))

StandardScaler()

In [27]:
# Apply the fitted scaler to the same feature columns; the result is a NumPy
# array, so the column labels are lost at this point.
feature_frame = df.drop(columns='TARGET CLASS')
scaled_features = scaler.transform(feature_frame)

In [28]:
scaled_features

array([[-0.12354188,  0.18590747, -0.91343069, ..., -1.48236813,
        -0.9497194 , -0.64331425],
       [-1.08483602, -0.43034845, -1.02531333, ..., -0.20224031,
        -1.82805088,  0.63675862],
       [-0.78870217,  0.33931821,  0.30151137, ...,  0.28570652,
        -0.68249379, -0.37784986],
       ...,
       [ 0.64177714, -0.51308341, -0.17920486, ..., -2.36249443,
        -0.81426092,  0.11159651],
       [ 0.46707241, -0.98278576, -1.46519359, ..., -0.03677699,
         0.40602453, -0.85567   ],
       [-0.38765353, -0.59589427, -1.4313981 , ..., -0.56778932,
         0.3369971 ,  0.01034996]])

In [29]:
## Convert array into dataframe
# Column labels come from the same frame that was passed to the scaler (every
# column except 'TARGET CLASS'), so they match the array's columns even if the
# target is not the last column. The original row index is carried over as well,
# keeping df_feat aligned with df.
df_feat = pd.DataFrame(
    scaled_features,
    columns=df.drop('TARGET CLASS', axis=1).columns,
    index=df.index,
)

In [30]:
df_feat.head()

Unnamed: 0,WTT,PTI,EQW,SBI,LQE,QWG,FDJ,PJF,HQE,NXJ
0,-0.123542,0.185907,-0.913431,0.319629,-1.033637,-2.308375,-0.798951,-1.482368,-0.949719,-0.643314
1,-1.084836,-0.430348,-1.025313,0.625388,-0.444847,-1.152706,-1.129797,-0.20224,-1.828051,0.636759
2,-0.788702,0.339318,0.301511,0.755873,2.031693,-0.870156,2.599818,0.285707,-0.682494,-0.37785
3,0.982841,1.060193,-0.621399,0.625299,0.45282,-0.26722,1.750208,1.066491,1.241325,-1.026987
4,1.139275,-0.640392,-0.709819,-0.057175,0.822886,-0.936773,0.596782,-1.472352,1.040772,0.27651


### Train Test Split

In [31]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. random_state pins the shuffle so the
# split, and every metric computed from it, is reproducible when the notebook
# is re-run from a fresh kernel.
# NOTE(review): the scaler was fit on the full dataset before this split, so the
# test rows influenced the scaling statistics. For a strict evaluation, split
# first and fit the scaler on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_features,
    df['TARGET CLASS'],
    test_size=0.30,
    random_state=42,
)

In [32]:
X_train, X_test

(array([[ 0.16570026,  0.35292484, -0.30151154, ...,  1.27534145,
          0.39529289,  0.30070103],
        [-0.5583311 , -1.57695199, -1.10316603, ..., -1.23227084,
         -0.4879403 ,  1.04860715],
        [ 0.30975918, -0.00621119,  0.22537485, ...,  1.09118609,
         -0.54895705,  1.08965761],
        ...,
        [ 0.6956278 ,  0.42602327, -1.19059731, ...,  0.78527952,
          0.67816209,  0.5949835 ],
        [ 1.11235188, -0.70990792, -0.20754235, ..., -0.52409787,
          0.09625454,  1.82545449],
        [-0.01728437,  0.73970028,  0.32134498, ..., -0.2066558 ,
         -0.18431576,  2.46094366]]),
 array([[ 1.34666813, -1.35580295, -0.21303272, ..., -1.65703742,
         -0.61716949,  0.69072876],
        [ 0.28582207,  1.08602777, -0.27064964, ..., -0.68773227,
          0.67979206, -1.04821401],
        [-0.5630683 , -0.98269676, -0.88946787, ..., -1.03353612,
          0.2434662 ,  0.7864487 ],
        ...,
        [-0.18774442,  1.81926156,  0.44770277, ...,  

In [33]:
X_train.shape, X_test.shape

((700, 10), (300, 10))

### Using KNN

Remember that we are trying to build a model that predicts whether an observation belongs to the TARGET CLASS (1) or not (0). We'll start with k=1.

In [34]:
from sklearn.neighbors import KNeighborsClassifier

In [35]:
# Baseline model: classify each point by its single nearest neighbour (k=1).
knn = KNeighborsClassifier(n_neighbors=1)

In [36]:
# Fit the classifier on the training split only.
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=1)

In [37]:
# Predict labels for the held-out test rows.
pred = knn.predict(X_test)

In [38]:
from sklearn.metrics import confusion_matrix, classification_report

In [40]:
# Rows are the actual classes and columns the predicted classes (0 first, then 1),
# so the off-diagonal entries are the misclassified test rows.
print(confusion_matrix(y_test, pred))

[[149  11]
 [  7 133]]


In [41]:
# Per-class precision, recall and F1 on the test split; print() is needed here
# because the report is returned as a preformatted string.
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94       160
           1       0.92      0.95      0.94       140

    accuracy                           0.94       300
   macro avg       0.94      0.94      0.94       300
weighted avg       0.94      0.94      0.94       300



### Choosing a K Value

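Let's use the elbow method to pick a good value of K: we train the model for a range of K values, plot the test error rate against K, and choose the K at which the error stops improving noticeably.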