In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [27]:
# Load the mushroom dataset from CSV; the path is relative to the notebook's working directory.
mushroom_data = pd.read_csv('./data/mushroom.csv')

In [28]:
mushroom_data.head()  # Show the first 5 rows of the dataset

Unnamed: 0,poisonous,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [29]:
# Count the missing (null) values in each column.
mushroom_data.isnull().sum()

poisonous                   0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [30]:
# List the column labels of the loaded frame.
mushroom_data.columns

Index(['poisonous', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [31]:
# Inspect the value counts of every feature
def value_counts(mushroom_data):
    """Print the value distribution of each column of ``mushroom_data``.

    For every column, a banner line naming the column is printed (preceded by
    a blank line), followed by the text rendering of that column's
    ``value_counts()`` result.

    Parameters
    ----------
    mushroom_data : pandas.DataFrame
        Frame whose columns are summarised one by one.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    for column_name in mushroom_data.columns:
        counts = mushroom_data[column_name].value_counts()
        # The run of spaces after the banner is part of the emitted text and
        # is kept exactly as the notebook has always printed it.
        print(f'\n[[[[[ {column_name}의 value_count ]]]]]            \n{counts}')

In [32]:
# Print the per-column value distribution for the whole dataset.
value_counts(mushroom_data)


[[[[[ poisonous의 value_count ]]]]]            
e    4208
p    3916
Name: poisonous, dtype: int64

[[[[[ cap-shape의 value_count ]]]]]            
x    3656
f    3152
k     828
b     452
s      32
c       4
Name: cap-shape, dtype: int64

[[[[[ cap-surface의 value_count ]]]]]            
y    3244
s    2556
f    2320
g       4
Name: cap-surface, dtype: int64

[[[[[ cap-color의 value_count ]]]]]            
n    2284
g    1840
e    1500
y    1072
w    1040
b     168
p     144
c      44
u      16
r      16
Name: cap-color, dtype: int64

[[[[[ bruises의 value_count ]]]]]            
f    4748
t    3376
Name: bruises, dtype: int64

[[[[[ odor의 value_count ]]]]]            
n    3528
f    2160
y     576
s     576
a     400
l     400
p     256
c     192
m      36
Name: odor, dtype: int64

[[[[[ gill-attachment의 value_count ]]]]]            
f    7914
a     210
Name: gill-attachment, dtype: int64

[[[[[ gill-spacing의 value_count ]]]]]            
c    6812
w    1312
Name: gill-spacing, dtype: int6

In [33]:
# Display the target column as loaded, before it is converted to integers.
mushroom_data['poisonous']

0       p
1       e
2       e
3       p
4       e
       ..
8119    e
8120    e
8121    e
8122    p
8123    e
Name: poisonous, Length: 8124, dtype: object

In [34]:
# Encode the target as integers: 1 where the label is 'p', 0 for any other
# value (the loaded data contains only 'p' and 'e').
# An already-encoded 1 is accepted as well so the cell is idempotent: with a
# plain `== 'p'` test, executing the cell a second time compared the integers
# against 'p' and silently turned every label into 0.
# `isin` is vectorized, replacing the per-element Python lambda.
mushroom_data['poisonous'] = mushroom_data['poisonous'].isin(['p', 1]).astype('int64')

In [38]:
# Display the target column again to check that it now holds integer labels.
mushroom_data['poisonous']

0       1
1       0
2       0
3       1
4       0
       ..
8119    0
8120    0
8121    0
8122    1
8123    0
Name: poisonous, Length: 8124, dtype: int64

In [35]:
# One-hot encode every feature column, leaving the target untouched.
# The target is excluded by name rather than by position, so a change in
# column order can never cause the label itself to be encoded; a missing
# 'poisonous' column fails loudly with a KeyError.
feature_columns = mushroom_data.columns.drop('poisonous')

# drop_first=True omits the first level of each feature.
# dtype=int pins the indicator columns to integer 0/1; the default indicator
# dtype is bool on pandas >= 2.0, which would not match the 0/1 values this
# notebook displays.
mushroom_data = pd.get_dummies(
    mushroom_data,
    columns=feature_columns,
    drop_first=True,
    dtype=int,
)

In [39]:
# Display the frame after one-hot encoding.
mushroom_data

Unnamed: 0,poisonous,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
8120,0,0,0,0,0,1,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0
8121,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
8122,1,0,0,1,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,0,0


In [36]:
# Separate the target vector from the feature matrix.
y = mushroom_data['poisonous']
X = mushroom_data.drop(columns='poisonous')

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [40]:
# Display the training feature matrix.
X_train

Unnamed: 0,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,cap-color_e,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
7873,0,0,1,0,0,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,0
6515,0,0,0,0,1,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
6141,0,1,0,0,0,0,0,1,0,1,...,0,0,1,0,0,1,0,0,0,0
2764,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
438,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,0,0,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
5390,0,0,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
860,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
7603,0,0,1,0,0,0,1,0,0,1,...,0,0,1,0,0,0,0,1,0,0


In [42]:
# Display the held-out (test) feature matrix.
X_test

Unnamed: 0,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,cap-color_e,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
1971,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
6654,0,1,0,0,0,0,1,0,0,1,...,0,0,1,0,0,1,0,0,0,0
5606,0,0,0,0,1,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0
3332,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
6988,0,1,0,0,0,0,1,0,0,1,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7374,0,0,1,0,0,0,1,0,0,1,...,0,0,1,0,0,0,0,1,0,0
1149,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4999,0,0,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
7497,0,0,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0


In [41]:
# Display the training labels.
y_train

7873    1
6515    1
6141    1
2764    0
438     0
       ..
5226    1
5390    0
860     0
7603    1
7270    0
Name: poisonous, Length: 6499, dtype: int64

In [43]:
# Display the held-out (test) labels.
y_test

1971    0
6654    1
5606    1
3332    0
6988    1
       ..
7374    1
1149    0
4999    1
7497    1
3341    1
Name: poisonous, Length: 1625, dtype: int64

In [37]:
# Fit a decision tree on the training split (fit() returns the estimator itself).
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and report the accuracy score against the true labels.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0
