In [1]:
# import required libraries
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/content/car_evaluation.csv')
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,outcome
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
df.shape

(1728, 7)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   int64 
 3   persons   1728 non-null   int64 
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   outcome   1728 non-null   object
dtypes: int64(2), object(5)
memory usage: 94.6+ KB


In [5]:
df.outcome.value_counts()

Unnamed: 0_level_0,count
outcome,Unnamed: 1_level_1
unacc,1210
acc,384
good,69
vgood,65


In [6]:
df.isnull().sum()

Unnamed: 0,0
buying,0
maint,0
doors,0
persons,0
lug_boot,0
safety,0
outcome,0


In [7]:
df.head(3)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,outcome
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc


In [8]:
# encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [9]:
df.buying = le.fit_transform(df.buying)
df.maint = le.fit_transform(df.maint)
df['lug_boot'] = le.fit_transform(df['lug_boot'])
df.safety = le.fit_transform(df.safety)

In [10]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,outcome
0,3,3,2,2,2,1,unacc
1,3,3,2,2,2,2,unacc
2,3,3,2,2,2,0,unacc
3,3,3,2,2,1,1,unacc
4,3,3,2,2,1,2,unacc


In [11]:
# X and y split
X = df.iloc[:,:-1]
X.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,3,3,2,2,2,1
1,3,3,2,2,2,2
2,3,3,2,2,2,0
3,3,3,2,2,1,1
4,3,3,2,2,1,2


In [12]:
y = df.outcome
y.head()

Unnamed: 0,outcome
0,unacc
1,unacc
2,unacc
3,unacc
4,unacc


In [13]:
# train test split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 47)

In [14]:
x_train.shape

(1209, 6)

In [15]:
x_test.shape

(519, 6)

In [16]:
y_train.value_counts()

Unnamed: 0_level_0,count
outcome,Unnamed: 1_level_1
unacc,848
acc,264
good,50
vgood,47


In [17]:
# model building

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

In [18]:
model.fit(x_train,y_train)

In [19]:
y_pred = model.predict(x_test)


In [22]:
y_pred1 = model.predict(x_train)

In [25]:
from sklearn.metrics import accuracy_score, classification_report

In [21]:
accuracy_score(y_test,y_pred)

0.8978805394990366

In [23]:
accuracy_score(y_train,y_pred1)

0.9511993382961125

In [24]:
pd.crosstab(y_test,y_pred)

col_0,acc,good,unacc,vgood
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,97,0,22,1
good,9,9,1,0
unacc,11,0,351,0
vgood,8,1,0,9


In [26]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         acc       0.78      0.81      0.79       120
        good       0.90      0.47      0.62        19
       unacc       0.94      0.97      0.95       362
       vgood       0.90      0.50      0.64        18

    accuracy                           0.90       519
   macro avg       0.88      0.69      0.75       519
weighted avg       0.90      0.90      0.89       519



In [27]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,outcome
0,3,3,2,2,2,1,unacc
1,3,3,2,2,2,2,unacc
2,3,3,2,2,2,0,unacc
3,3,3,2,2,1,1,unacc
4,3,3,2,2,1,2,unacc


In [29]:
model.predict([[1,2,2,2,1,1]])



array(['unacc'], dtype=object)

In [30]:
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,outcome
0,3,3,2,2,2,1,unacc
1,3,3,2,2,2,2,unacc
2,3,3,2,2,2,0,unacc
3,3,3,2,2,1,1,unacc
4,3,3,2,2,1,2,unacc
...,...,...,...,...,...,...,...
1723,1,1,5,6,1,2,good
1724,1,1,5,6,1,0,vgood
1725,1,1,5,6,0,1,unacc
1726,1,1,5,6,0,2,good


In [31]:
model.predict([[1,1,5,6,0,2]])



array(['good'], dtype=object)

In [32]:
model.predict([[1,1,5,6,0,0]])



array(['vgood'], dtype=object)

In [33]:
model.predict([[1,3,3,1,4,0]])



array(['unacc'], dtype=object)

## Balancing Technique

In [34]:
from imblearn.over_sampling import SMOTE

In [35]:
smote = SMOTE()

In [36]:
x_train_smote, y_train_smote = smote.fit_resample(x_train,y_train)

In [37]:
y_train.value_counts()

Unnamed: 0_level_0,count
outcome,Unnamed: 1_level_1
unacc,848
acc,264
good,50
vgood,47


In [38]:
y_train_smote.value_counts()

Unnamed: 0_level_0,count
outcome,Unnamed: 1_level_1
unacc,848
acc,848
good,848
vgood,848


In [39]:
knn =KNeighborsClassifier()

In [40]:
knn.fit(x_train_smote,y_train_smote)

In [42]:
y_predict = knn.predict(x_test)

In [43]:
accuracy_score(y_test,y_predict)

0.8420038535645472

In [44]:
pd.crosstab(y_test,y_predict)

col_0,acc,good,unacc,vgood
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,101,10,7,2
good,4,14,1,0
unacc,46,4,311,1
vgood,6,1,0,11


In [45]:
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

         acc       0.64      0.84      0.73       120
        good       0.48      0.74      0.58        19
       unacc       0.97      0.86      0.91       362
       vgood       0.79      0.61      0.69        18

    accuracy                           0.84       519
   macro avg       0.72      0.76      0.73       519
weighted avg       0.87      0.84      0.85       519

