# Import Packages

In [1]:
import os
import numpy as np
import pandas as pd
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

# Load Dataset

In [2]:
# Column labels are supplied explicitly through `names`, so pandas treats the
# first line of the file as data rather than as a header row.
column_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
data = pd.read_csv('car.data', names=column_names)
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


# Check Information of Dataset

In [3]:
# Summarise the frame: number of rows, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null object
maint       1728 non-null object
doors       1728 non-null object
persons     1728 non-null object
lug_boot    1728 non-null object
safety      1728 non-null object
class       1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


# Identify The Target Variable

In [4]:
# Encode the string class labels as integer codes and keep the label lookup in
# `class_names` (class_names[code] gives back the original label).
# The dtype guard makes this cell safe to re-run: factorizing the already
# encoded column a second time would replace `class_names` with the integer
# codes themselves and lose the original labels.
if not pd.api.types.is_numeric_dtype(data['class']):
    data['class'], class_names = pd.factorize(data['class'])

- The target variable is the `class` column of the data frame.
- Its values are stored as strings.
- However, the algorithm requires the values to be encoded as integer codes.
- We can convert the string categories into integer codes using the `factorize` function of the pandas library.

In [5]:
# Show the label lookup first, then the distinct integer codes that are now
# stored in the target column.
encoded_classes = data['class'].unique()
print(class_names)
print(encoded_classes)

Index(['unacc', 'acc', 'vgood', 'good'], dtype='object')
[0 1 2 3]


# Identify the Predictor Variables and Encode Any String Variables to Equivalent Integer Codes

In [6]:
# Encode every predictor column as integer codes. pd.factorize numbers the
# values in order of first appearance, so the codes follow the order in which
# the categories occur in the file.
# A loop replaces six copy-pasted statements and avoids binding `_`, which
# would shadow IPython's "last output" variable.
feature_columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
for column in feature_columns:
    data[column] = pd.factorize(data[column])[0]
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0
2,0,0,0,0,0,2,0
3,0,0,0,0,1,0,0
4,0,0,0,0,1,1,0


Check the data types: every column has now been converted to integer codes.

In [7]:
# Re-check the column dtypes after the encoding step.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null int64
maint       1728 non-null int64
doors       1728 non-null int64
persons     1728 non-null int64
lug_boot    1728 non-null int64
safety      1728 non-null int64
class       1728 non-null int64
dtypes: int64(7)
memory usage: 94.6 KB


# Select The Predictor Feature and The Target Variable

In [8]:
# The target is the last column; every column before it is a predictor.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set and train on the remaining 70%.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Training / Model Fitting

In [10]:
model = KNeighborsClassifier(n_neighbors=5)   # Instantiate a k-nearest-neighbours classifier with 5 neighbors.
model.fit(X_train, y_train)                   # Fit the model on the training data.

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

# Model Evaluation on the Test Set

In [11]:
import sklearn.metrics as metrics

# Predict labels for the held-out rows with the fitted model.
y_pred = model.predict(X_test)

# Number of test rows whose predicted label differs from the true label.
count_misclassified = (y_test != y_pred).sum()
print('Misclassified samples: {}'.format(count_misclassified))

# Fraction of test rows that were predicted correctly.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy: {:.2f}'.format(accuracy))

Misclassified samples: 32
Accuracy: 0.94


# How to Decide the Value of k-neighbors
- A large value of K increases prediction time and tends to underfit.
- A small value of K tends to overfit. There is no guaranteed way to know the best value of K in advance, so we compare several values empirically.

In [12]:
from sklearn.metrics import accuracy_score

# Fit one classifier per candidate K (1..25) and report its test accuracy.
# Predictions are stored in a loop-local name so that `y_pred` (the
# predictions of `model`) is not overwritten by the last classifier tried.
# NOTE(review): K is compared on the test set here, so the best score is an
# optimistic estimate - consider cross-validating on the training data instead.
for K_value in range(1, 26):
    neigh = KNeighborsClassifier(n_neighbors=K_value)
    neigh.fit(X_train, y_train)
    y_pred_k = neigh.predict(X_test)
    print("Accuracy is ", accuracy_score(y_test, y_pred_k)*100, "% for K-Value:", K_value)

Accuracy is  83.62235067437379 % for K-Value: 1
Accuracy is  80.15414258188824 % for K-Value: 2
Accuracy is  89.21001926782274 % for K-Value: 3
Accuracy is  88.82466281310212 % for K-Value: 4
Accuracy is  93.83429672447014 % for K-Value: 5
Accuracy is  92.8709055876686 % for K-Value: 6
Accuracy is  92.8709055876686 % for K-Value: 7
Accuracy is  89.78805394990366 % for K-Value: 8
Accuracy is  90.94412331406551 % for K-Value: 9
Accuracy is  88.82466281310212 % for K-Value: 10
Accuracy is  89.40269749518305 % for K-Value: 11
Accuracy is  88.6319845857418 % for K-Value: 12
Accuracy is  88.82466281310212 % for K-Value: 13
Accuracy is  89.01734104046243 % for K-Value: 14
Accuracy is  89.78805394990366 % for K-Value: 15
Accuracy is  88.6319845857418 % for K-Value: 16
Accuracy is  88.82466281310212 % for K-Value: 17
Accuracy is  88.4393063583815 % for K-Value: 18
Accuracy is  88.6319845857418 % for K-Value: 19
Accuracy is  88.6319845857418 % for K-Value: 20
Accuracy is  88.2466281310212 % for 

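The highest accuracy, 93.83%, is obtained at K = 5, so we use K = 5 for this example. Note that K was selected using the test set, so this figure is an optimistic estimate; choosing K by cross-validation on the training data would give a less biased result.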