# Decision Tree

In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [58]:
data_path = "data/abalone.csv"
label_path = "data/abalone_attributes.txt"

col = list()

with open( label_path,'r') as f:
    col = f.read().split('\n') 

abalone_data = pd.read_csv(data_path , names=col)
abalone_data.head(10)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
5,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20
7,F,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16
8,M,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9
9,F,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,19


In [59]:
np_abalone_data = np.array(abalone_data)
print(np_abalone_data)

[['M' 0.455 0.365 ... 0.10099999999999999 0.15 15]
 ['M' 0.35 0.265 ... 0.0485 0.07 7]
 ['F' 0.53 0.42 ... 0.1415 0.21 9]
 ...
 ['M' 0.6 0.475 ... 0.2875 0.308 9]
 ['F' 0.625 0.485 ... 0.261 0.29600000000000004 10]
 ['M' 0.71 0.555 ... 0.3765 0.495 12]]


In [60]:
datax = np_abalone_data[:,1:]
datay = np_abalone_data[:,0]

In [61]:
print('datax\n', datax)
print('--------------------------------')
print('datay\n', datay)

datax
 [[0.455 0.365 0.095 ... 0.10099999999999999 0.15 15]
 [0.35 0.265 0.09 ... 0.0485 0.07 7]
 [0.53 0.42 0.135 ... 0.1415 0.21 9]
 ...
 [0.6 0.475 0.205 ... 0.2875 0.308 9]
 [0.625 0.485 0.15 ... 0.261 0.29600000000000004 10]
 [0.71 0.555 0.195 ... 0.3765 0.495 12]]
--------------------------------
datay
 ['M' 'M' 'F' ... 'M' 'F' 'M']


In [62]:
from sklearn.model_selection import train_test_split
trnx, tstx, trny, tsty = train_test_split(datax,datay,test_size=0.3)
print(trnx.shape, tstx.shape, trny.shape, tsty.shape)

(2923, 8) (1254, 8) (2923,) (1254,)


In [63]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(trnx)
trnx_scale = scaler.transform(trnx)
tstx_scale = scaler.transform(tstx)
print(np.min(trnx_scale[:,0]), np.max(trnx_scale[:,0]))
print(np.min(tstx_scale[:,0]), np.max(tstx_scale[:,0]))

0.0 1.0
0.04827586206896553 1.0206896551724136




In [64]:
from sklearn import tree
tree_model = tree.DecisionTreeClassifier(max_depth = 4, min_samples_split=3)
tree_model.fit(X = trnx, y=trny)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [65]:
tree_pred = tree_model.predict(X=tstx)

In [66]:
tree_model.feature_importances_

array([0.01520074, 0.00489924, 0.00354886, 0.05804017, 0.03711232,
       0.67037772, 0.        , 0.21082095])

In [67]:
#시각화 할 수 있는 트리 생성
from sklearn.tree import export_graphviz
export_graphviz(tree_model, out_file='abalone_tree.dot')

In [68]:
from sklearn.metrics import accuracy_score
print('accuracy:', accuracy_score(tsty, tree_pred) * 100,'(%)')

accuracy: 53.827751196172244 (%)
