# Decision Tree

In [1]:
from tree import tree, classify

car = ['med', 'med', '4', '4', 'big', 'med']
print(classify(car, tree))

acc


# 計算分類好壞
**是否在松山區?**分類，仍無法將2種類別資料完全區分開來，若依**是否近捷運?**分類，則可完全分出2種類別的資料，

可透過計算分類後 "不純度"，決定要以哪個**feature**來分類

<img src="https://github.com/MiaZhang17/MachineLearning/blob/main/piecures/gini_impurity.png?raw=true" alt="drawing" style="width:500px;"/>

## 計算 Gini Impurity
group {A, A, A, B}

$Impurity = 1 - (\frac{3}{4})^2 - (\frac{1}{4})^2 = 0.375$

## 計算 Information gain
一開始Impurity = 0.5

1.依 是否近捷運分類

- Information gain： $ 0.5 - 0 - 0 = 0.5$

2.依 是否在松山區分類
 - Information gain： $ 0.5 - 0.75 - 0.44 = -0.69$


## 增加權重
分類後各Subset內含有多少資料也很重要，即使是一樣的Impurity，數量愈多愈有意義

若一樣都是Impurity = 0，數量多代表此分類不是個**意外或不小心**分出來的結果，代表有很多筆資料都具有此種特性

<img src="https://github.com/MiaZhang17/MachineLearning/blob/main/piecures/gini_impurity_number.png?raw=true" alt="drawing" style="width:300px;"/>

依Subset數量計算權重

<img src="https://github.com/MiaZhang17/MachineLearning/blob/main/piecures/gini_impurity_weight.png?raw=true" alt="drawing" style="width:500px;"/>


一開始Impurity = 0.5

1.依 **是否近捷運**分類

 - Weight Information Gain = $0.5 - (\frac{5}{10} \times 0) - (\frac{5}{10} \times 0) = 0.5$

2.依 **是否在松山區**分類

 - Weight Information Gain = $0.5 - (\frac{4}{10} \times 0.375) - (\frac{6}{10} \times 0.44) = 0.086$



In [2]:
0.5 - (0.4*0.375) - (0.6*0.44)

0.08599999999999997

In [3]:
# Inpurity
from collections import Counter

labels = ["unacc", "unacc", "acc", "acc", "good", "good"]
#labels = ["unacc","unacc","unacc", "good", "vgood", "vgood"]
#labels = ["unacc", "unacc", "unacc", "unacc", "unacc", "unacc"]

impurity = 1
label_counts = Counter(labels)
print(label_counts)
for label in label_counts:
    probability_of_label = label_counts[label] / len(labels)
    impurity -= probability_of_label ** 2
    print(label, probability_of_label)
print(f'impurity:{impurity}')

Counter({'unacc': 2, 'acc': 2, 'good': 2})
unacc 0.3333333333333333
acc 0.3333333333333333
good 0.3333333333333333
impurity:0.6666666666666665


In [4]:
# Information gain
from collections import Counter

unsplit_labels = ["unacc", "unacc", "unacc", "unacc", "unacc", "unacc", "good", "good", "good", "good", "vgood", "vgood", "vgood"]

split_labels_1 = [
  ["unacc", "unacc", "unacc", "unacc", "unacc", "unacc", "good", "good", "vgood"], 
  [ "good", "good"], 
  ["vgood", "vgood"]
]

split_labels_2 = [
  ["unacc", "unacc", "unacc", "unacc","unacc", "unacc", "good", "good", "good", "good"], 
  ["vgood", "vgood", "vgood"]
]

def gini(dataset):
    impurity = 1
    label_counts = Counter(dataset)
    for label in label_counts:
        prob_of_label = label_counts[label] / len(dataset)
        impurity -= prob_of_label ** 2
    return impurity

info_gain = gini(unsplit_labels)
for sub in split_labels_1:
    info_gain -= gini(sub)
print(info_gain)

info_gain = gini(unsplit_labels)
for sub in split_labels_2:
    info_gain -= gini(sub)
print(info_gain)

0.14522609394404257
0.15905325443786977


In [5]:
# weight Information gain

from collections import Counter

cars = [['med', 'low', '3', '4', 'med', 'med'], ['med', 'vhigh', '4', 'more', 'small', 'high'], ['high', 'med', '3', '2', 'med', 'low'], ['med', 'low', '4', '4', 'med', 'low'], ['med', 'low', '5more', '2', 'big', 'med'], ['med', 'med', '2', 'more', 'big', 'high'], ['med', 'med', '2', 'more', 'med', 'med'], ['vhigh', 'vhigh', '2', '2', 'med', 'low'], ['high', 'med', '4', '2', 'big', 'low'], ['low', 'low', '2', '4', 'big', 'med']]

car_labels = ['acc', 'acc', 'unacc', 'unacc', 'unacc', 'vgood', 'acc', 'unacc', 'unacc', 'good']

def split(dataset, labels, column):
    data_subsets = []
    label_subsets = []
    counts = list(set([data[column] for data in dataset]))
    counts.sort()
    for k in counts:
        new_data_subset = []
        new_label_subset = []
        for i in range(len(dataset)):
            if dataset[i][column] == k:
                new_data_subset.append(dataset[i])
                new_label_subset.append(labels[i])
        data_subsets.append(new_data_subset)
        label_subsets.append(new_label_subset)
    return data_subsets, label_subsets

def gini(dataset):
    impurity = 1
    label_counts = Counter(dataset)
    for label in label_counts:
        prob_of_label = label_counts[label] / len(dataset)
        impurity -= prob_of_label ** 2
    return impurity

def information_gain(starting_labels, split_labels):
    info_gain = gini(starting_labels)
    count = len(starting_labels)
    for subset in split_labels:
        # Multiply gini(subset) by the correct percentage below
        info_gain -= gini(subset) * len(subset) / count
    return info_gain

split_data, split_labels = split(cars, car_labels, 3)
print(len(split_data))
print(split_data[0])
print(split_data[1])

result = information_gain(car_labels, split_labels)
print(result)

for i in range(6):
    split_data, split_labels = split(cars, car_labels, i)
    result = information_gain(car_labels, split_labels)
    print(result)

3
[['high', 'med', '3', '2', 'med', 'low'], ['med', 'low', '5more', '2', 'big', 'med'], ['vhigh', 'vhigh', '2', '2', 'med', 'low'], ['high', 'med', '4', '2', 'big', 'low']]
[['med', 'low', '3', '4', 'med', 'med'], ['med', 'low', '4', '4', 'med', 'low'], ['low', 'low', '2', '4', 'big', 'med']]
0.3066666666666667
0.27333333333333343
0.04000000000000001
0.10666666666666669
0.3066666666666667
0.15000000000000002
0.29000000000000004


## Recursive Create Tree 

In [6]:
from tree2 import *

car_data = [['med', 'low', '3', '4', 'med', 'med'], ['med', 'vhigh', '4', 'more', 'small', 'high'], ['high', 'med', '3', '2', 'med', 'low'], ['med', 'low', '4', '4', 'med', 'low'], ['med', 'low', '5more', '2', 'big', 'med'], ['med', 'med', '2', 'more', 'big', 'high'], ['med', 'med', '2', 'more', 'med', 'med'], ['vhigh', 'vhigh', '2', '2', 'med', 'low'], ['high', 'med', '4', '2', 'big', 'low'], ['low', 'low', '2', '4', 'big', 'med']]

car_labels = ['acc', 'acc', 'unacc', 'unacc', 'unacc', 'vgood', 'acc', 'unacc', 'unacc', 'good']

def find_best_split(dataset, labels):
    best_gain = 0
    best_feature = 0
    for feature in range(len(dataset[0])):
        data_subsets, label_subsets = split(dataset, labels, feature)
        gain = information_gain(labels, label_subsets)
        if gain > best_gain:
            best_gain, best_feature = gain, feature
    return best_feature, best_gain


best_feature, best_gain = find_best_split(car_data, car_labels)
print(best_feature, best_gain)

def build_tree(data, labels):
    best_feature, best_gain = find_best_split(data, labels)
    if best_gain == 0:
        return Counter(labels)

    data_subsets, label_subsets = split(data, labels, best_feature)
    branches = []
    for i in range(len(data_subsets)):
        result = build_tree(data_subsets[i], label_subsets[i])
        branches.append(result)

    return branches

tree = build_tree(car_data, car_labels)
print_tree(tree)

3 0.3066666666666667
Splitting
--> Branch 0:
  Counter({'unacc': 4})
--> Branch 1:
  Splitting
  --> Branch 0:
    Counter({'good': 1})
  --> Branch 1:
    Counter({'acc': 1})
  --> Branch 2:
    Counter({'unacc': 1})
--> Branch 2:
  Splitting
  --> Branch 0:
    Counter({'vgood': 1})
  --> Branch 1:
    Counter({'acc': 1})
  --> Branch 2:
    Counter({'acc': 1})


### 對新的Data進行分類

In [7]:
from tree3 import *
import operator

test_point = ['vhigh', 'low', '3', '4', 'med', 'med']

# print_tree(tree)
def classify(datapoint, tree):
    if isinstance(tree, Leaf):
        print(tree.labels.items())
        label = max(tree.labels.items(), key=operator.itemgetter(1))[0]
        return label
  
    value = datapoint[tree.feature]
    for branch in tree.branches:
        if branch.value == value:
            return classify(datapoint, branch)

result = classify(test_point, tree)
print(result)

dict_items([('unacc', 1)])
unacc


## scikit-learn

scikit learn 可直接建立DecisionTree

In [8]:
from cars import training_points, training_labels, testing_points, testing_labels
from sklearn.tree import DecisionTreeClassifier

print(training_points[0])
print(training_labels[0])

classifier = DecisionTreeClassifier()

classifier.fit(training_points, training_labels)

score = classifier.score(testing_points, testing_labels)
print(score)

[2.0, 4.0, 4.0, 2.0, 3.0, 3.0]
acc
0.9653179190751445


### Decision Tree限制
#### 1.深度
在每一層選擇分類時，是以**Greddy**方式選擇

都是選Impurity最低的特徵進行分類，但這種選擇最終不一定是最好

有可能在這一層選擇較差的分類結果，但到下一層能得到更好的分類結果

所以要自行測試，選擇出最佳的決策數深度

#### 2.容易Overfitting訓練集

決策數是完全依訓練集進行分類，有可能符合訓練集的分類並不符合實際情況

愈深的決策樹愈容易Overfitting

In [9]:
from cars import training_points, training_labels, testing_points, testing_labels
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(random_state = 0)
classifier.fit(training_points, training_labels)
score = classifier.score(testing_points, testing_labels)
print(f"Depth:{classifier.tree_.max_depth}, {score}")

classifier = DecisionTreeClassifier(max_depth=11, random_state = 0)
classifier.fit(training_points, training_labels)
score = classifier.score(testing_points, testing_labels)
print(f"Depth:{classifier.tree_.max_depth}, {score}")

Depth:12, 0.976878612716763
Depth:11, 0.9826589595375722
