## K- Nearest Neighbors

In [1]:
#Reading train data
import pandas as pd
train = pd.read_csv('/Users/jigyasasachdeva/Desktop/Data/train.csv', index_col = 0)
train.head()

Unnamed: 0,AWAPART,AWABEDR,AWALAND,APERSAUT,ABESAUT,AMOTSCO,AVRAAUT,AAANHANG,ATRACTOR,AWERKT,...,PPERSONG,PGEZONG,PWAOREG,PBRAND,PZEILPL,PPLEZIER,PFIETS,PINBOED,PBYSTAND,CARAVAN
1,1,1,1,2,1,1,1,1,1,1,...,1,1,1,6,1,1,1,1,1,0
2,3,1,1,1,1,1,1,1,1,1,...,1,1,1,3,1,1,1,1,1,0
3,2,1,1,2,1,1,1,1,1,1,...,1,1,1,3,1,1,1,1,1,0
4,1,1,1,2,1,1,1,1,1,1,...,1,1,1,3,1,1,1,1,1,0
5,1,1,1,1,1,1,1,1,1,1,...,1,1,1,7,1,1,1,1,1,0


In [2]:
#Train data being split in X1 and Y1
X1 = train.loc[:, train.columns != 'CARAVAN']
y1 = train['CARAVAN'].astype('category')
y1

1       0
2       0
3       0
4       0
5       0
       ..
5818    0
5819    0
5820    1
5821    0
5822    0
Name: CARAVAN, Length: 5822, dtype: category
Categories (2, int64): [0, 1]

In [3]:
#Reading test data and splitting into X and y
test = pd.read_csv('/Users/jigyasasachdeva/Desktop/Data/test.csv', index_col = 0)
X = test.loc[:, test.columns != 'CARAVAN']
Y = test['CARAVAN'].astype('category')
Y

5823    0
5824    1
5825    0
5826    0
5827    0
       ..
9818    0
9819    1
9820    0
9821    0
9822    0
Name: CARAVAN, Length: 4000, dtype: category
Categories (2, int64): [0, 1]

In [4]:
#importing train and test split to split the train data into training and validation
#this is to perform hold out cross validation
from sklearn.model_selection import train_test_split
#Splitting data into 80-20 split
X_train, X_val, y_train, y_val = train_test_split(X1, y1, test_size=0.20)

In [7]:
#Importing KNNclassifier to perform a random classification on train (of X1)
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)
y_pred = knn_classifier.predict(X_val)

In [12]:
#From sklearn importing confusion matrix and other metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve
from sklearn.metrics import f1_score

print(f'Confusion Matrix for KNN: \n{confusion_matrix(y_val, y_pred)}')
print(f'Accuracy for KNN: {accuracy_score(y_val,y_pred)}')
print(f'Area Under Curve for KNN: {roc_auc_score(y_val, y_pred)}')
print(f'Recall score for KNN: {recall_score(y_val,y_pred)}')
print(f'Precision score for KNN: {precision_score(y_val,y_pred)}')
print(f'F1 score for KNN: {metrics.f1_score(y_val,y_pred)}')
#Recall can be seen as a very low value as only 1 TP exists 
#hence tuning the parameters

Confusion Matrix for KNN: 
[[1085   11]
 [  68    1]]
Accuracy for KNN: 0.9321888412017167
Area Under Curve for KNN: 0.5022281286364118
Recall score for KNN: 0.014492753623188406
Precision score for KNN: 0.08333333333333333
F1 score for KNN: 0.02469135802469136


In [13]:
from sklearn import metrics

#Optimal number of neighbors = k
optimal_k = 0
#Optimal score
optimal_score = 0
#Number of neighbors
sequence =  list(range(1, 39, 2))

#Optimizing F1
for k in sequence:
    #Fitting the model
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_train, y_train)
    #Predicting the model on validation data
    knn_y_pred = knn_model.predict(X_val)
    
    #F1 score 
    f1 = metrics.f1_score(y_val, knn_y_pred)
    #Checking if f1 score is greater than previous optimal score, updating k
    if f1 > optimal_score:
        optimal_k = k
        optimal_score = f1
        
knn_model = KNeighborsClassifier(n_neighbors=optimal_k)
knn_model.fit(X_train, y_train)
best_knn_pred = knn_model.predict(X_val)
print(confusion_matrix(y_val, best_knn_pred))

[[1033   63]
 [  64    5]]


In [14]:
optimal_k

1

In [20]:
from sklearn import metrics
#running the best model with optimal k
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(X_train, y_train)
#Predicting on test data
best_knn_pred = knn_model.predict(X)
print(confusion_matrix(Y, best_knn_pred))
metrics.f1_score(Y, best_knn_pred)

[[3396  366]
 [ 186   52]]


0.15853658536585366

### SMOTE data

In [16]:
#Performing the same operations on SMOTE data
import pandas as pd
smote_data = pd.read_csv('/Users/jigyasasachdeva/Desktop/Data/SMOTE_traindata.csv', index_col = 0)

In [17]:
X1 = smote_data.loc[:, smote_data.columns != 'CARAVAN']
y1 = smote_data['CARAVAN'].astype('category')
y1

3711.0     0
1038.0     0
2325.0     0
2864.0     0
4921.0     0
          ..
5564.0     1
55651.0    1
55661.0    1
55671.0    1
55681.0    1
Name: CARAVAN, Length: 17052, dtype: category
Categories (2, int64): [0, 1]

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X1, y1, test_size=0.20)

In [19]:
from sklearn import metrics

#Optimal number of neighbors = k
optimal_k = 0
#Optimal score
optimal_score = 0
#Number of neighbors
sequence =  list(range(1, 39, 2))

#Optimizing F1
for k in sequence:
    #Fitting the model
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_train, y_train)
    #Predicting the model on validation data
    knn_y_pred = knn_model.predict(X_val)
    
    #F1 score 
    f1 = metrics.f1_score(y_val, knn_y_pred)
    #Checking if f1 score is greater than previous optimal score, updating k
    if f1 > optimal_score:
        optimal_k = k
        optimal_score = f1
        
knn_model = KNeighborsClassifier(n_neighbors=optimal_k)
knn_model.fit(X_train, y_train)
best_knn_pred = knn_model.predict(X_val)
print(confusion_matrix(y_val, best_knn_pred))

[[2158   49]
 [  43 1161]]


In [21]:
optimal_k

1

In [22]:
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(X1, y1)
best_knn_pred = knn_model.predict(X)
print(confusion_matrix(Y, best_knn_pred))

[[3385  377]
 [ 186   52]]


In [23]:
metrics.f1_score(Y, best_knn_pred)

0.15592203898050977