In [1]:
import pandas as pd
import numpy as np

In [2]:
# The file is read with header=None, so pandas assigns integer column labels
# (0..6) and the first line of the file is treated as data.
DATA_PATH = "car.data"
df = pd.read_csv(DATA_PATH, header=None)

In [3]:
# Preview the raw frame; at this point the columns still carry the default
# integer labels because no names have been assigned yet.
df

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [4]:
# Column labels, assigned positionally to the seven unnamed columns.
# Note: despite its name, `features` also contains the target label ("class").
features = [
    "buying", "maint", "doors", "persons", "lug_boot", "safety",  # inputs
    "class",  # target
]
df.columns = features

In [5]:
# Separate the six input columns (X) from the target column (y).
# The target holds string labels ("unacc", "acc", "good", "vgood"); they are
# passed to the classifiers below as-is, without mapping to integer codes.
target_col = "class"
X = df.drop(columns=[target_col])
y = df[target_col]

In [6]:
# Inspect the feature frame: the "class" column has been dropped, six inputs remain.
X

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high
3,vhigh,vhigh,2,2,med,low
4,vhigh,vhigh,2,2,med,med
...,...,...,...,...,...,...
1723,low,low,5more,more,med,med
1724,low,low,5more,more,med,high
1725,low,low,5more,more,big,low
1726,low,low,5more,more,big,med


In [7]:
# Check dtypes and null counts. In the recorded output every column is object
# dtype with no missing values, which is why the features are one-hot encoded
# further down before being fed to a model.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
dtypes: object(6)
memory usage: 81.1+ KB


In [8]:
# Level frequencies for "buying": the recorded output shows four levels, 432 rows each.
X["buying"].value_counts()

vhigh    432
med      432
high     432
low      432
Name: buying, dtype: int64

In [9]:
# Level frequencies for "maint": same four levels as "buying", 432 rows each.
X["maint"].value_counts()

vhigh    432
med      432
high     432
low      432
Name: maint, dtype: int64

In [10]:
# "doors" mixes digit strings with the label "5more", so the column is held
# as object dtype rather than as a numeric column.
X["doors"].value_counts()

5more    432
4        432
2        432
3        432
Name: doors, dtype: int64

In [11]:
# "persons" likewise contains the non-numeric level "more" next to "2" and "4".
X["persons"].value_counts()

4       576
more    576
2       576
Name: persons, dtype: int64

In [12]:
# Level frequencies for "lug_boot": three levels, 576 rows each in the recorded output.
X["lug_boot"].value_counts()

big      576
med      576
small    576
Name: lug_boot, dtype: int64

In [13]:
# Level frequencies for "safety": three levels, 576 rows each in the recorded output.
X["safety"].value_counts()

med     576
high    576
low     576
Name: safety, dtype: int64

In [14]:
# First three target labels (plain strings, object dtype).
y.head(3)

0    unacc
1    unacc
2    unacc
Name: class, dtype: object

In [15]:
# First three feature rows, matching the three labels shown for y.
X.head(3)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  # not used in the cells visible here

# Hold out a test set. test_size=0.25 is the value scikit-learn applies when
# neither size is given; it is spelled out here so the 75/25 split is explicit.
# random_state pins the shuffle so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [17]:
# Confirm the training split: the recorded output shows 1296 of the 1728 rows,
# with a shuffled (non-contiguous) index and all columns still object dtype.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1296 entries, 520 to 684
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1296 non-null   object
 1   maint     1296 non-null   object
 2   doors     1296 non-null   object
 3   persons   1296 non-null   object
 4   lug_boot  1296 non-null   object
 5   safety    1296 non-null   object
dtypes: object(6)
memory usage: 70.9+ KB


In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Every input column is a categorical string, so all six are one-hot encoded.
categorical_cols = ["buying", "maint", "doors", "persons", "lug_boot", "safety"]

# `OneHotEncoder(sparse=False)` only works on older scikit-learn: the keyword
# was deprecated in 1.2 (renamed to `sparse_output`) and removed in 1.4, where
# it raises a TypeError. Leaving the encoder at its default and asking the
# ColumnTransformer for dense output via `sparse_threshold=0` produces the same
# dense float array on both old and new releases.
ct = ColumnTransformer(
    [("onehot", OneHotEncoder(), categorical_cols)],
    sparse_threshold=0,
)

# Learn the category levels from the training split only, then apply the same
# fitted encoding to the test split so both share one column layout.
X_train_trans = ct.fit_transform(X_train)
X_test_trans = ct.transform(X_test)

In [19]:
# First three training rows before encoding, for comparison with the encoded array.
X_train.head(3)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
520,high,vhigh,5more,2,big,med
621,high,high,5more,2,small,low
1017,med,high,3,more,small,low


In [20]:
# The same three rows after one-hot encoding: 21 indicator columns
# (4 + 4 + 4 + 3 + 3 + 3 levels), with exactly one 1 per original feature.
X_train_trans[:3]

array([[1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 1.,
        0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0.,
        0., 1., 0., 1., 0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.,
        0., 1., 0., 1., 0.]])

In [21]:
# First three held-out rows before encoding.
X_test.head(3)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
1318,low,vhigh,2,more,med,med
124,vhigh,high,2,4,big,med
648,high,med,2,2,small,low


In [42]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression on the one-hot features.
model_lr = LogisticRegression(random_state=0)
model_lr.fit(X_train_trans, y_train)

# score() on a classifier returns mean accuracy; report it as a percentage
# measured on the held-out split.
lr_accuracy = model_lr.score(X_test_trans, y_test)
print(f"{100.0 * lr_accuracy:.2f}%")

88.43%


In [43]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the same one-hot features; random_state fixes the
# randomness used while building the trees so the result is repeatable.
model_rf = RandomForestClassifier(random_state=0)
model_rf.fit(X_train_trans, y_train)

# Held-out mean accuracy, reported as a percentage.
rf_accuracy = model_rf.score(X_test_trans, y_test)
print(f"{100.0 * rf_accuracy:.2f}%")

96.76%


In [48]:
# Spot-check the random forest on a handful of held-out rows, printing the
# true label next to the model's prediction for each one.
spot_check_rows = [105, 200, 210, 50, 150]  # positional offsets into the test split
for k in spot_check_rows:
    true_label = y_test.iloc[k]
    # Indexing with [[k]] keeps the 2-D (1, n_features) shape predict() expects.
    predicted = model_rf.predict(X_test_trans[[k]])
    print(true_label, "\t", predicted)

acc 	 ['acc']
unacc 	 ['unacc']
unacc 	 ['unacc']
unacc 	 ['unacc']
acc 	 ['acc']
