# Simple and K-Fold cross-validation

In [1]:
# Loading libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [2]:
# data preprocessing

# define column names
names = ['x', 'y', 'class']

# loading training data
df = pd.read_csv('3.concertriccir2.csv', header=None, names=names)
df.head()

Unnamed: 0,x,y,class
0,0.700335,-0.247068,0.0
1,-3.950019,2.74008,1.0
2,0.150222,-2.157638,1.0
3,-1.67205,-0.941519,1.0
4,2.560483,-1.846577,1.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
x        500 non-null float64
y        500 non-null float64
class    500 non-null float64
dtypes: float64(3)
memory usage: 11.8 KB


In [4]:
df['class'].value_counts()

1.0    250
0.0    250
Name: class, dtype: int64

In [6]:
# create design matrix X and target vector y
X = np.array(df.iloc[:,:2]) # end index is exclusive
y = np.array(df['class']) # another way of indexing pandas dataframe column
print(X.shape)
print(y.shape)

(500, 2)
(500,)


## Simple Cross Validation

In [14]:
# split the dataset into train and test
X_1, X_test, y_1, y_test = train_test_split(X, y, test_size=0.3)

# split the train dataset into crossvalidation train and crossvalidation test
X_tr, X_cv, y_tr, y_cv = train_test_split(X_1, y_1, test_size=0.3)

for i in range(1,30,2):
    # instantiate learning model (k=i)
    knn = KNeighborsClassifier(n_neighbors=i)
    
    # fitting the model on train data
    knn.fit(X_tr, y_tr)
    
    # predict the response on the crossvalidation data
    pred = knn.predict(X_cv)
    
    # evaluate CV accuracy
    acc = accuracy_score(y_cv, pred, normalize=True) * float(100)
    print(f'\nCV accuracy for k = {i} is {round(acc)}%')


CV accuracy for k = 1 is 89.0%

CV accuracy for k = 3 is 89.0%

CV accuracy for k = 5 is 88.0%

CV accuracy for k = 7 is 84.0%

CV accuracy for k = 9 is 84.0%

CV accuracy for k = 11 is 83.0%

CV accuracy for k = 13 is 84.0%

CV accuracy for k = 15 is 83.0%

CV accuracy for k = 17 is 81.0%

CV accuracy for k = 19 is 81.0%

CV accuracy for k = 21 is 80.0%

CV accuracy for k = 23 is 78.0%

CV accuracy for k = 25 is 78.0%

CV accuracy for k = 27 is 74.0%

CV accuracy for k = 29 is 75.0%


In [19]:
# training optimal model
optimal_k = 1
knn = KNeighborsClassifier(n_neighbors=optimal_k)
knn.fit(X_tr, y_tr)
pred = knn.predict(X_test)
acc = accuracy_score(y_test, pred, normalize=True) * float(100)
print('Test accuracy for k = {} is {}%'.format(optimal_k, round(acc)))

Test accuracy for k = 1 is 87.0%
