In [1]:
import numpy as np
import pandas as pd

In [2]:
url= 'http://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'
# 설명: https://rstudio-pubs-static.s3.amazonaws.com/118220_5a7997d6b0aa493c878d661968fc1f08.html
car_df = pd.read_csv(url, names=['buying','maint','doors','persons','lug_boot','safety','class'], sep=",")

car_df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


종속변수 => Car acceptability (unacc, acc, good, vgood)

In [3]:
features = car_df.columns.tolist()
features.remove('class')
features 

['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']

In [4]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

In [5]:
o_encoder = OrdinalEncoder()
data_encoded = o_encoder.fit_transform(car_df[features])
car_df_encoded = pd.DataFrame(data_encoded, columns=features)
data_encoded

array([[3., 3., 0., 0., 2., 1.],
       [3., 3., 0., 0., 2., 2.],
       [3., 3., 0., 0., 2., 0.],
       ...,
       [1., 1., 3., 2., 0., 1.],
       [1., 1., 3., 2., 0., 2.],
       [1., 1., 3., 2., 0., 0.]])

In [7]:
# o_encoder.categories_

[array(['high', 'low', 'med', 'vhigh'], dtype=object),
 array(['high', 'low', 'med', 'vhigh'], dtype=object),
 array(['2', '3', '4', '5more'], dtype=object),
 array(['2', '4', 'more'], dtype=object),
 array(['big', 'med', 'small'], dtype=object),
 array(['high', 'low', 'med'], dtype=object)]

In [11]:
l_encoder = LabelEncoder()
target_encoded = l_encoder.fit_transform(car_df['class'])
car_df_encoded['class'] = target_encoded

In [14]:
# l_encoder.classes_

array(['acc', 'good', 'unacc', 'vgood'], dtype=object)

In [15]:
car_df_encoded.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,3.0,3.0,0.0,0.0,2.0,1.0,2
1,3.0,3.0,0.0,0.0,2.0,2.0,2
2,3.0,3.0,0.0,0.0,2.0,0.0,2
3,3.0,3.0,0.0,0.0,1.0,1.0,2
4,3.0,3.0,0.0,0.0,1.0,2.0,2


In [16]:
data_np = car_df_encoded.values

In [17]:
X=data_np[:,:-1]
y=data_np[:,-1]

In [18]:
X.shape

(1728, 6)

In [13]:
y.shape

(1728,)

In [14]:
y

array([2., 2., 2., ..., 2., 1., 3.])

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=0)

In [20]:
from sklearn.naive_bayes import CategoricalNB

In [21]:
model = CategoricalNB()
model.fit(X_train, y_train)

CategoricalNB()

In [22]:
model.score(X_test,y_test)

0.8208092485549133

In [23]:
y_predictions = model.predict(X_test)

In [24]:
from sklearn.metrics import accuracy_score, classification_report

In [25]:
accuracy_score(y_test, y_predictions)

0.8208092485549133

In [26]:
print(classification_report(y_test, y_predictions))

              precision    recall  f1-score   support

         0.0       0.60      0.63      0.61       115
         1.0       0.38      0.12      0.18        25
         2.0       0.90      0.95      0.92       363
         3.0       0.78      0.44      0.56        16

    accuracy                           0.82       519
   macro avg       0.66      0.53      0.57       519
weighted avg       0.80      0.82      0.81       519

