# Decision Tree exercise
You should use decision tree to classify. 

Design your DecisionTree. Do binary classification or multiclass classification (selected by yourself)

In [190]:
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import sklearn

%matplotlib inline

## Kaggle in-class Competetion
请先前往 Kaggle 下载本次比赛的数据集

比赛页面：https://inclass.kaggle.com/c/hdu-cama/leaderboard

本次比赛可使用的 Package: Pandas, Numpy 以及系统内置库如 math 等

完成下面代码后，使用 predict 函数对 test.csv 中的数据做出预测并将结果保存至一个 .csv 文件，然后 submit 至 Kaggle，可参考示例文件 sample.csv

__请务必仔细阅读 Kaggle 页面的各项信息__

__请务必仔细阅读 Kaggle 页面的各项信息__

__请务必仔细阅读 Kaggle 页面的各项信息__

## 请务必仔细阅读文件 “ID3 Algorithm for Decision Trees.pdf”
## 请务必仔细阅读文件 “ID3 Algorithm for Decision Trees.pdf”
## 请务必仔细阅读文件 “ID3 Algorithm for Decision Trees.pdf”
### Calculate Shannon Entropy

熵是对不确定性的测量，熵越高，代表信息量越高，这里你需要使用熵来选择作为节点的特征。（选择能够最小化两边熵的特征）

$$Entropy(S) = - P_+ \log_2{P_+} - P_- \log_2{P_-}$$

### Calculate Information Gain
$$Gain(S, A) = Entropy(S) - \sum_{v\in Values(A)}{\frac{|S_v|}{|S|}Entropy(S_v)}$$

In [191]:
# Read data from train.csv and y_train.csv
train_feature = pd.read_csv('train.csv')
train_label = pd.read_csv('y_train.csv')

train_dataSet = pd.merge(train_feature, train_label, on = 'ID')

featureNames = train_dataSet.columns.tolist()[1:10]
featureNames_copy = train_dataSet.columns.tolist()[1:10]

dataSet_inArray = train_dataSet.values
dataSet_inArray_noSerial = dataSet_inArray[:,1:]
dataSet_totalSplit = dataSet_inArray_noSerial.tolist()

In [192]:
def calculateShannonEntropy(dataSet):
    # Todo 1: calculate the entropy given a dataset
   
    numEntries = len(dataSet) # There are n rows inside
    labelCounts = {} # Create dictionary for classification

    for featureVector in dataSet:
        
    	currentLabel = featureVector[-1] # Get the last-row data
    	if currentLabel not in labelCounts.keys():
    		labelCounts[currentLabel] = 0
    	labelCounts[currentLabel] += 1

    total_entropy = 0.0
    for key in labelCounts:
    	proportion_k = float(labelCounts[key]) / numEntries
    	total_entropy -= (proportion_k * math.log(proportion_k, 2))

    return total_entropy

In [193]:
# TODO 1 Test:
calculateShannonEntropy(dataSet_totalSplit)

0.9278532379384186

In [194]:
def choose_best_feature_to_split(dataSet):
    # Todo 2: return the best feature based on the maximum number of information gain
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calculateShannonEntropy(dataSet)
    bestInfoGain = 0
    best_feature = -1

    for i in range(numFeatures):
    	featureList = [number[i] for number in dataSet] # enum for one attribute
    	uniqualValues = set(featureList) # no-relace attribute
    	newEntropy = 0

    	for value in uniqualValues:
    		sub_dataset = split_dataset(dataSet, i, value)
    		proportion_k = len(sub_dataset) / float(len(dataSet))
    		newEntropy += proportion_k * calculateShannonEntropy(sub_dataset) # sum(ShannonEntropy)
    	infoGain = baseEntropy - newEntropy # infoGain

    	# bestInfoGain
    	if (infoGain > bestInfoGain):
    		bestInfoGain = infoGain
    		best_feature = i

    return best_feature

In [195]:
# TODO 2 Test:
choose_best_feature_to_split(dataSet_totalSplit)

4

In [196]:
def split_dataset(dataSet, axis, value):
    # Todo 3: Split the dataset via current selected feature and it's value
    # For example, when current_feature is TLS(top-left-square), and the value is 'o', 
    # the task is that return the subdataset in which all "TLS" is equal to 'o'
    sub_dataset = []

    for featureVector in dataSet:
    	if featureVector[axis] == value:
    		reduceFeatureVector = featureVector[ :axis]
    		reduceFeatureVector.extend(featureVector[axis+1: ])  
    		sub_dataset.append(reduceFeatureVector)

    return sub_dataset

In [197]:
import operator
def majorityCnt(classList):
    classCount={}
    for vote in classList:
        if vote not in classCount.keys():classCount[vote] = 0
        classCount[vote]+=1
    sortedClassCount=sorted(classCount.items(),key = operator.itemgetter(1),reverse = True)
    return sortedClassCount[0][0]

def create_decision_tree(dataSet, featureNames):
    # Todo 4: Create a decision tree by recursion
    #
    # Tips: Set appropriate boundary conditions; 
    #       think about the values one by one; 
    #       Use the three functions defined before.
    
    classList = [example[-1] for example in dataSet]
    #类别相同，停止划分
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    #长度为1，返回出现次数最多的类别
    if len(classList[0]) == 1:
        return majorityCnt(classList)

    best_feature = choose_best_feature_to_split(dataSet) #返回分类的特征序号
    bestFeatureName = featureNames[best_feature] #该特征的label
    decision_tree = {bestFeatureName: { } }
    del(featureNames[best_feature]) #从labels的list中删除该label
    
    featureValues = [example[best_feature] for example in dataSet]
    uniqualValues = set(featureValues)
    for value in uniqualValues:
    	subFeatureNames = featureNames[ : ] #子集合

    	#构建数据的子集合，并进行递归
    	decision_tree[bestFeatureName][value] = create_decision_tree(split_dataset(dataSet, best_feature, value), subFeatureNames)
    
    return decision_tree

In [198]:
# TODO 3&4 Test:
myDecisionTree = create_decision_tree(dataSet_totalSplit, featureNames)
myDecisionTree

{'MMS': {'b': {'BLS': {'b': {'TRS': {'o': 'negative', 'x': 'positive'}},
    'o': {'TRS': {'b': 'negative',
      'o': 'negative',
      'x': {'TLS': {'b': {'MLS': {'o': 'positive', 'x': 'negative'}},
        'o': {'TMS': {'b': 'negative', 'o': 'positive', 'x': 'negative'}},
        'x': {'TMS': {'o': 'negative', 'x': 'positive'}}}}}},
    'x': {'TRS': {'b': 'positive',
      'o': {'TMS': {'b': 'positive',
        'o': {'TLS': {'b': 'positive', 'o': 'negative', 'x': 'positive'}},
        'x': {'BRS': {'b': 'positive',
          'o': {'MRS': {'b': 'positive', 'o': 'negative'}},
          'x': 'positive'}}}},
      'x': 'positive'}}}},
  'o': {'BRS': {'b': {'TLS': {'b': 'negative',
      'o': 'negative',
      'x': {'MLS': {'b': {'TRS': {'b': 'negative',
          'o': 'negative',
          'x': {'TMS': {'o': 'negative', 'x': 'positive'}}}},
        'o': {'MRS': {'b': 'positive',
          'o': 'negative',
          'x': {'BLS': {'b': 'positive',
            'o': {'TRS': {'o': 'negative'

In [199]:
# Todo 5
# Func: classify

def classify(inputTree, featureNames, testVector):
    classLabel = []
    firstStr = inputTree.keys()[0] #获取树的第一个特征属性
    secondDict = inputTree[firstStr] #树的分支，子集合Dict
    featureIndex = featureNames.index(firstStr) #获取决策树第一层在featLables中的位置
    for key in secondDict.keys():
        if testVector[featureIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                classLabel = classify(secondDict[key], featureNames, testVector)
            else:
            	classLabel = secondDict[key]
    
    return classLabel


In [200]:
# TODO 5 Test
# Func: classify Test:
classLabel = classify(myDecisionTree, featureNames_copy, ['o', 'x', 'o', 'b', 'o', 'x', 'o', 'x', 'x'] )
classLabel

'negative'

In [201]:
# Get test_dataset from test.csv
test_feature = pd.read_csv('test.csv')

#test_dataSet = test_feature.columns.tolist()[1:10]

#tset_dataSet_inArray = test_dataset.values
#test_dataSet_inArray_noSerial = tset_dataSet_inArray[:,1:]
#tset_dataSet_totalSplit = dataSet_inArray_noSerial.tolist()
test_dataSet_totalSplit = test_feature.values[:,1:]


# Todo 5

def predict(myDecisionTree, featureNames, test_dataset):

    #print test
    myCount = 0
    myClassLabels = []
    ID = []
    for feature in test_dataSet_totalSplit:
        currentClassLabel = classify(myDecisionTree, featureNames, feature)
        print myCount
        print currentClassLabel
    
        myClassLabels.append(currentClassLabel)
        ID.append(myCount)
        myCount +=1
        
    return myClassLabels

In [207]:
# TODO 5 Test:

myPredictions = predict(myDecisionTree, featureNames_copy, test_dataSet_totalSplit)


0
positive
1
negative
2
positive
3
positive
4
negative
5
negative
6
positive
7
positive
8
positive
9
negative
10
negative
11
positive
12
negative
13
negative
14
negative
15
positive
16
positive
17
negative
18
negative
19
positive
20
negative
21
positive
22
negative
23
positive
24
positive
25
positive
26
negative
27
[]
28
negative
29
positive
30
positive
31
positive
32
positive
33
negative
34
positive
35
positive
36
positive
37
positive
38
positive
39
negative
40
negative
41
positive
42
positive
43
negative
44
positive
45
positive
46
negative
47
negative
48
positive
49
positive
50
positive
51
negative
52
negative
53
positive
54
positive
55
positive
56
positive
57
positive
58
positive
59
positive
60
positive
61
positive
62
positive
63
positive
64
positive
65
positive
66
positive
67
positive
68
positive
69
[]
70
negative
71
negative
72
positive
73
negative
74
positive
75
positive
76
positive
77
positive
78
negative
79
positive
80
negative
81
positive
82
[]
83
negative
84
negative
85
posit

### 优化（可选）
在上面这些步骤完成后，你可以优化 create_decision_tree 函数以防止过拟合

- 对决策树进行剪枝
- 也推荐两个更简单又十分有效的办法
    - 设置树的最大深度 max_depth
    - 设置每个叶节点的最小 samples 数
    - 这里可以参考 [decision tree in scikit-learn](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier) 中的参数设置以及其原理

In [208]:
# mySubmit = pd.read_csv('my_submit.csv')

# column_ID = pd.Series(ID, name='ID') 
# column_Category = pd.Series(classLabels, name = 'Category')
# predictions = pd.concat([column_ID, column_Category], axis = 1)

# predictions.to_csv('my_submit.csv', index = False)


# 得到最终答案 y_pred, 是一个 1维的array
# 存储为要求格式的文件

df = pd.DataFrame(np.stack( (range(len(myPredictions)), myPredictions) ).T) 
df.to_csv('result.csv', index = None, header=['ID', 'Category'])

## 评估

下面的数据可在你的 predict 文件提交至 Kaggle 后获得。

- Kaggle 昵称：
- 模型目前 Public Leaderboard 得分：
- 排名：

### 反思
请对你的模型进行一定的分析，说出你模型的不足之处，或者可以提高的地方。

回答：

In [204]:
# To improve the Model:
import sklearn

train_feature = pd.read_csv('train.csv')
train_label = pd.read_csv('y_train.csv')

train_feature_inArray = train_feature.values
train_feature_inArray_noSerial = train_feature_inArray[:,1:]
train_feature_totalSplit = train_feature_inArray_noSerial.tolist()

train_label_inArray = train_label.values

test_feature = pd.read_csv('test.csv')
test_feature_inArray = test_feature.values
test_feature_inArray_noSerial = test_feature_inArray[:,1:]
test_feature_totalSplit = test_feature_inArray_noSerial.tolist()

In [205]:
for i in range(len(train_feature_totalSplit)):
    for j in range(len(train_feature_totalSplit[i]))
        if train_feature_totalSplit[i][j] == 'x':
            train_feature_totalSplit[i][j] = 2
        elif train_feature_totalSplit[i][j] == 'o':
            train_feature_totalSplit[i][j] = 0
        else:
            train_feature_totalSplit[i][j] = 1
            
for i in range(len(test_feature_totalSplit)):
    if test_feature_totalSplit[i] == 'x':
        test_feature_totalSplit[i] = 2
    elif i == 'o':
        test_feature_totalSplit[i] = 0
    else:
        test_feature_totalSplit[i] = 1
        
train_feature_totalSplit

SyntaxError: invalid syntax (<ipython-input-205-bdbcf92026a0>, line 2)

In [None]:
myPredictModel = sklearn.neighbors.KNeighborsClassifier()
myPredictModel.fit(train_feature, train_label)

print myPredictionModel


# 输出测试效果
# print metrics.classification_report(expected, predicted)
# print metrics.confusion_matrix(expected, predicted)