#  문제정의
- 버섯의 특징을 사용해서 독버섯/식용 버섯을 분류

## 목표
- Decision Tree 과대적합 제어/ 시각화/ 특성 선택

# 데이터 수집

In [1]:
import pandas as pd
# Load the mushroom dataset from the local CSV file into a DataFrame.
data = pd.read_csv('./data/mushroom.csv')
# Preview the first rows to check that the file was parsed as expected.
data.head()

Unnamed: 0,poisonous,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
data.shape

(8124, 23)

# 데이터 전처리 

##  결측치

In [4]:
# Print each column's non-null count and dtype to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   poisonous                 8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

#  탐색적 데이터 분석

# 모델 선택 및 하이퍼 파라미터 튜닝

In [6]:
# Target (answer): the 'poisonous' label column.
y = data['poisonous']
# Features (question): every column from 'cap-shape' through the last one.
X = data.loc[:, 'cap-shape':]

## 인코딩
- 글자 데이터를 수치 데이터로 변환하는 작업
- label encoding : 각 범주를 수치값으로 mapping하는 작업(딕셔너리를 사용해 키에 해당하는 데이터를 밸류값으로 바꿔준다)
- one-hot encoding : 0 or 1 의 값을 가진 여러 개의 새로운 특성으로 변경하는 작업

### label 인코딩

In [7]:
# Take an independent (deep) copy for the label-encoding experiment, so that
# later changes to X do not show up in X1 and vice versa.
X1 = X.copy(deep=True)

In [9]:
# List the distinct category codes present in the 'cap-shape' column;
# these are the keys the label-encoding dictionary must cover.
X1['cap-shape'].unique()

array(['x', 'b', 's', 'f', 'k', 'c'], dtype=object)

In [11]:
# Display the 'cap-shape' column as it currently stands in X1.
X1['cap-shape']

0       x
1       x
2       b
3       x
4       x
       ..
8119    k
8120    x
8121    f
8122    k
8123    x
Name: cap-shape, Length: 8124, dtype: object

In [10]:
# Preview the label encoding: map each cap-shape letter to an integer code.
# The result is only displayed here, not stored back into X1.
X1['cap-shape'].map({"x":0, "f":1, "k":2, "b":3, "s":4, "c":5})

0       0
1       0
2       3
3       0
4       0
       ..
8119    2
8120    0
8121    1
8122    2
8123    0
Name: cap-shape, Length: 8124, dtype: int64

In [12]:
# Label-encode 'cap-shape': replace each letter code with its integer code.
# Values that are not keys of the mapping become NaN, so running this cell a
# second time (on already-encoded integers) would wipe the column.
cap_shape_codes = {"x": 0, "f": 1, "k": 2, "b": 3, "s": 4, "c": 5}
X1['cap-shape'] = X1['cap-shape'].map(cap_shape_codes)

###  onehot 인코딩

In [14]:
# Take a separate deep copy of the features for the one-hot-encoding experiment.
X2 = X.copy(deep=True)

In [16]:
# One-hot encode the features. With no `columns` argument, get_dummies expands
# every object/string/category column, which here is the whole frame.
# dtype=int keeps the new indicator columns as 0/1 integers on every pandas
# version (pandas >= 2.0 would otherwise produce True/False booleans).
X_one_hot = pd.get_dummies(X2, dtype=int)
# Preview the expanded indicator columns.
X_one_hot.head()

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


## 훈련과 평가로 데이터 분리

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 30% of the rows for evaluation.
# random_state fixes the shuffle so the split (and the scores computed from it)
# is reproducible between runs; stratify=y keeps the poisonous/edible ratio
# the same in the training and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_one_hot,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

## 모델 불러오기

In [19]:
from sklearn.tree import DecisionTreeClassifier
# Create a decision tree classifier with the library defaults; no depth limit
# or other complexity-limiting hyperparameters are passed here.
# NOTE(review): consider passing random_state for reproducible trees — confirm
# against the scikit-learn version in use.
tree_model = DecisionTreeClassifier()

# 학습

In [20]:
# Fit the tree on the training split only; the test split stays unseen.
tree_model.fit(X_train, y_train)

DecisionTreeClassifier()

# 평가

In [21]:
# Accuracy on the training split (the data the model was fitted on).
tree_model.score(X_train, y_train)

1.0

In [22]:
# Accuracy on the held-out test split, used to judge generalization.
tree_model.score(X_test, y_test)

1.0