## 作業

1. 試著調整 DecisionTreeClassifier(...) 中的參數，並觀察是否會改變結果？
2. 改用其他資料集 (boston, wine)，並與回歸模型的結果進行比較

In [1]:
from sklearn import datasets, metrics

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [2]:
iris = datasets.load_iris()

x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=4)

clf = DecisionTreeClassifier()

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

acc = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy:{acc}\n')

print(f'Feature Name:\n{iris.feature_names}\n')

print("Feature importance: ", clf.feature_importances_)

Accuracy:0.9736842105263158

Feature Name:
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

Feature importance:  [0.         0.01796599 0.05992368 0.92211033]


In [3]:
# 可調整參數：1. criterion
clf_cri_gini = DecisionTreeClassifier(criterion='gini')
clf_cri_entropy = DecisionTreeClassifier(criterion='entropy')

clf_cri_gini.fit(x_train, y_train)
clf_cri_entropy.fit(x_train, y_train)

y_gini = clf_cri_gini.predict(x_test)
y_entropy = clf_cri_entropy.predict(x_test)

acc_gini = metrics.accuracy_score(y_test, y_gini)
acc_entropy = metrics.accuracy_score(y_test, y_entropy)
print(f'Accuracy by using Gini:{acc_gini}\n')
print(f'Accuracy by using Entropy:{acc_entropy}\n')

Accuracy by using Gini:0.9736842105263158

Accuracy by using Entropy:0.9736842105263158



In [4]:
# 可調整參數：2.Max_depth
clf_cri_depth_none = DecisionTreeClassifier(max_depth=None)
clf_cri_depth_5 = DecisionTreeClassifier(max_depth=5)
clf_cri_depth_50 = DecisionTreeClassifier(max_depth=50)
clf_cri_depth_500 = DecisionTreeClassifier(max_depth=500)

clf_cri_depth_none.fit(x_train, y_train)
clf_cri_depth_5.fit(x_train, y_train)
clf_cri_depth_50.fit(x_train, y_train)
clf_cri_depth_500.fit(x_train, y_train)

y_depth_none = clf_cri_depth_none.predict(x_test)
y_depth_5 = clf_cri_depth_5.predict(x_test)
y_depth_50 = clf_cri_depth_50.predict(x_test)
y_depth_500 = clf_cri_depth_500.predict(x_test)

acc_depth_none = metrics.accuracy_score(y_test, y_depth_none)
acc_depth_5 = metrics.accuracy_score(y_test, y_depth_5)
acc_depth_50 = metrics.accuracy_score(y_test, y_depth_50)
acc_depth_500 = metrics.accuracy_score(y_test, y_depth_500)

print(f'Accuracy by using Max_Depth None:{acc_depth_none}\n')
print(f'Accuracy by using Max_Depth 5:{acc_depth_5}\n')
print(f'Accuracy by using Max_Depth 50:{acc_depth_50}\n')
print(f'Accuracy by using Max_Depth 500:{acc_depth_500}\n')

Accuracy by using Max_Depth None:0.9736842105263158

Accuracy by using Max_Depth 5:0.9736842105263158

Accuracy by using Max_Depth 50:0.9736842105263158

Accuracy by using Max_Depth 500:0.9736842105263158



In [5]:
# 可調整參數：3.Min_samples_split
clf_cri_min_samples_split_2 = DecisionTreeClassifier(min_samples_split=2)
clf_cri_min_samples_split_5 = DecisionTreeClassifier(min_samples_split=5)
clf_cri_min_samples_split_50 = DecisionTreeClassifier(min_samples_split=50)
clf_cri_min_samples_split_500 = DecisionTreeClassifier(min_samples_split=500)

clf_cri_min_samples_split_2.fit(x_train, y_train)
clf_cri_min_samples_split_5.fit(x_train, y_train)
clf_cri_min_samples_split_50.fit(x_train, y_train)
clf_cri_min_samples_split_500.fit(x_train, y_train)

y_min_samples_split_2 = clf_cri_min_samples_split_2.predict(x_test)
y_min_samples_split_5 = clf_cri_min_samples_split_5.predict(x_test)
y_min_samples_split_50 = clf_cri_min_samples_split_50.predict(x_test)
y_min_samples_split_500 = clf_cri_min_samples_split_500.predict(x_test)

acc_min_samples_split_2 = metrics.accuracy_score(y_test, y_min_samples_split_2)
acc_min_samples_split_5 = metrics.accuracy_score(y_test, y_min_samples_split_5)
acc_min_samples_split_50 = metrics.accuracy_score(y_test, y_min_samples_split_50)
acc_min_samples_split_500 = metrics.accuracy_score(y_test, y_min_samples_split_500)

print(f'Accuracy by using min_samples_split = 2:{acc_min_samples_split_2}\n')
print(f'Accuracy by using min_samples_split = 5:{acc_min_samples_split_5}\n')
print(f'Accuracy by using min_samples_split = 50:{acc_min_samples_split_50}\n')
print(f'Accuracy by using min_samples_split = 500:{acc_min_samples_split_500}\n')

Accuracy by using min_samples_split = 2:0.9736842105263158

Accuracy by using min_samples_split = 5:0.9736842105263158

Accuracy by using min_samples_split = 50:0.9736842105263158

Accuracy by using min_samples_split = 500:0.21052631578947367



In [6]:
# 可調整參數：4.Min_samples_leaf
clf_cri_min_samples_leaf_2 = DecisionTreeClassifier(min_samples_leaf=2)
clf_cri_min_samples_leaf_5 = DecisionTreeClassifier(min_samples_leaf=5)
clf_cri_min_samples_leaf_50 = DecisionTreeClassifier(min_samples_leaf=50)
clf_cri_min_samples_leaf_500 = DecisionTreeClassifier(min_samples_leaf=500)

clf_cri_min_samples_leaf_2.fit(x_train, y_train)
clf_cri_min_samples_leaf_5.fit(x_train, y_train)
clf_cri_min_samples_leaf_50.fit(x_train, y_train)
clf_cri_min_samples_leaf_500.fit(x_train, y_train)

y_min_samples_leaf_2 = clf_cri_min_samples_leaf_2.predict(x_test)
y_min_samples_leaf_5 = clf_cri_min_samples_leaf_5.predict(x_test)
y_min_samples_leaf_50 = clf_cri_min_samples_leaf_50.predict(x_test)
y_min_samples_leaf_500 = clf_cri_min_samples_leaf_500.predict(x_test)

acc_min_samples_leaf_2 = metrics.accuracy_score(y_test, y_min_samples_leaf_2)
acc_min_samples_leaf_5 = metrics.accuracy_score(y_test, y_min_samples_leaf_5)
acc_min_samples_leaf_50 = metrics.accuracy_score(y_test, y_min_samples_leaf_50)
acc_min_samples_leaf_500 = metrics.accuracy_score(y_test, y_min_samples_leaf_500)

print(f'Accuracy by using min_samples_leaf = 2:{acc_min_samples_leaf_2}\n')
print(f'Accuracy by using min_samples_leaf = 5:{acc_min_samples_leaf_5}\n')
print(f'Accuracy by using min_samples_leaf = 50:{acc_min_samples_leaf_50}\n')
print(f'Accuracy by using min_samples_leaf = 500:{acc_min_samples_leaf_500}\n')

Accuracy by using min_samples_leaf = 2:0.9736842105263158

Accuracy by using min_samples_leaf = 5:0.9736842105263158

Accuracy by using min_samples_leaf = 50:0.7894736842105263

Accuracy by using min_samples_leaf = 500:0.21052631578947367



In [7]:
# 調整參數：2.Max_depth, 3.Min_samples_split, 4.Min_samples_leaf
clf_test = DecisionTreeClassifier(min_samples_split=500, min_samples_leaf=500)

clf_test.fit(x_train, y_train)

y_test_pred = clf_test.predict(x_test)

acc_test = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy:{acc_test}')

Accuracy:0.9736842105263158


* 1. 嘗試個別調整 criterion, max_depth, min_samples_split 與 min_samples_leaf，發現此例子只有調整 min_samples_split 與 min_samples_leaf 到 500 精準度才會下降。但是同時將這兩個數值「同時」調整為 500 的時候，精準度又會回到原本的數值。

In [8]:
from sklearn import linear_model

wine = datasets.load_wine()

x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.25, random_state=4)

linear = linear_model.LinearRegression()

linear.fit(x_train, y_train)

y_pred = linear.predict(x_test)

acc = metrics.mean_squared_error(y_test, y_pred)

print("LinearRegression Mean Squared error: %.3f" % acc)

LinearRegression Mean Squared error: 0.065


In [9]:
clf = DecisionTreeRegressor()

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

acc = metrics.mean_squared_error(y_test, y_pred)
print("LinearRegression Mean Squared error: %.3f" %acc)
print("Gain: %.3f" % (0.044/0.065))

LinearRegression Mean Squared error: 0.022
Gain: 0.677


* 使用 DecisionTreeRegressor 進行 wine 的回歸分析，其 MSE = 0.044，相較於一般的線性回歸分析的 MSE=0.065，其精準度上升 33 %.