# 手算ID3决策树

In [37]:
import numpy as np
import pandas as pd
import xlwings as xw

In [3]:
data = pd.read_csv('data.csv')
data

Unnamed: 0,age,income,student,credit_rating,label
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middle_age,high,no,fair,yes
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
6,middle_age,low,yes,excellent,yes
7,youth,medium,no,fair,no
8,youth,low,yes,fair,yes
9,senior,medium,yes,fair,yes


In [49]:
wb = xw.Book('决策树.xlsx')
sht = wb.sheets[0]
sht.range('a1').value = '数据'
sht.range('a2').options(index=False).value = data

## y的信息量

### 定义信息量函数 

In [35]:
def information(p):    
    p_mask = np.ma.masked_values(p, 0)
    I_mask = - p_mask * (np.ma.log2(p_mask))
    I  = I_mask.filled()
    return  I

### y的类别数

In [36]:
y = data['label']
y_label_num = len(np.unique(y))
y_label_num

2

### 计算各类别占比

In [62]:
y_1 = y[y == np.unique(data['label'])[0]]
y_2 = y[y == np.unique(data['label'])[1]]

p_y1 = len(y_1) / len(y)
p_y2 = len(y_2) / len(y)

print('p_y1:' , p_y1)
print('p_y2:' , p_y2)

p_y1: 0.35714285714285715
p_y2: 0.6428571428571429


### 计算y的信息量

In [67]:
I_y= information(np.array([p_y1])) + information(np.array([p_y2]))
print(I_y)

array([ 0.94028596])

## 计算信息增益

In [68]:
data_2 = data[['age','label']].sort_values(by='age')

### 计算每个类别的占比

In [70]:
data_2_retio = pd.DataFrame(np.unique(data['age']))
from statsmodels.tools import categorical
data_2_retio['num'] = np.sum(categorical(data['age'].values, drop=True), axis=0)
data_2_retio['retiao'] = np.sum(categorical(data['age'].values, drop=True), axis=0) / len(data)

for y_value in np.unique(y):
    data_2_retio[y_value] = [len(data.loc[data.age==x_value].loc[data.label==y_value])/len(data.loc[data.age==x_value]) for x_value in np.unique(data['age']) ] 
    
print(data_2_retio)

            0  num    retiao   no  yes
0  middle_age  4.0  0.285714  0.0  1.0
1      senior  5.0  0.357143  0.4  0.6
2       youth  5.0  0.357143  0.6  0.4


In [71]:
from statsmodels.tools import categorical
data_2 = {}
for column in data.columns[:-1]:
    data_2_df = pd.DataFrame(np.unique(data[column]))
    
    data_2_df['num'] = np.sum(categorical(data[column].values, drop=True), axis=0)
    data_2_df['retiao'] = np.sum(categorical(data[column].values, drop=True), axis=0) / len(data)

    for y_value in np.unique(y):
        data_2_df[y_value] = [len(data.loc[data[column]==x_value].loc[data.label==y_value])/len(data.loc[data[column]==x_value]) for x_value in np.unique(data[column]) ] 
    
        data_2_df['I'+'_'+y_value] = information(data_2_df[y_value].values)
    
    data_2_df['I'] = data_2_df['I_yes'] + data_2_df['I_no'] 
       
    print(column + ':')
    print(data_2_df)
    print('\n\n')
    data_2[column] = data_2_df

age:
            0  num    retiao   no      I_no  yes     I_yes         I
0  middle_age  4.0  0.285714  0.0  0.000000  1.0 -0.000000  0.000000
1      senior  5.0  0.357143  0.4  0.528771  0.6  0.442179  0.970951
2       youth  5.0  0.357143  0.6  0.442179  0.4  0.528771  0.970951



income:
        0  num    retiao        no      I_no       yes     I_yes         I
0    high  4.0  0.285714  0.500000  0.500000  0.500000  0.500000  1.000000
1     low  4.0  0.285714  0.250000  0.500000  0.750000  0.311278  0.811278
2  medium  6.0  0.428571  0.333333  0.528321  0.666667  0.389975  0.918296



student:
     0  num  retiao        no      I_no       yes     I_yes         I
0   no  7.0     0.5  0.571429  0.461346  0.428571  0.523882  0.985228
1  yes  7.0     0.5  0.142857  0.401051  0.857143  0.190622  0.591673



credit_rating:
           0  num    retiao    no  I_no   yes     I_yes         I
0  excellent  6.0  0.428571  0.50   0.5  0.50  0.500000  1.000000
1       fair  8.0  0.571429  0.25   

In [72]:
index = 1
for column in data.columns[:-1]:    
    sht['g'+str(index)].value = column
    index += 1

    sht['g'+str(index)].options(index=False).value = data_2[column]

    index +=len(data_2[column]) +2

### 计算信息增益

In [73]:
for column in data.columns[:-1]:
    entropy = np.sum(data_2[column]['I'] * data_2[column]['retiao'])
    gain = I_y -entropy
    print(column , '的信息增益值为', gain, '\n')

age 的信息增益值为 [ 0.24674982] 

income 的信息增益值为 [ 0.02922257] 

student 的信息增益值为 [ 0.1518355] 

credit_rating 的信息增益值为 [ 0.04812703] 



In [74]:
data_2_gain = pd.DataFrame([I_y - np.sum(data_2[column]['I'] * data_2[column]['retiao']) for column in data.columns[:-1]], index=data.columns[:-1], columns=['Gain'])
data_2_gain

Unnamed: 0,Gain
age,0.24675
income,0.029223
student,0.151836
credit_rating,0.048127


In [76]:
sht['a20'].options(index=True).value = data_2_gain

可以看出age的信息增益最大，所以按age划分

In [83]:
sht_2 = wb.sheets['age_split']

index = 1
data_3 = dict(list(data.groupby(by='age')))
for key, value in data_3.items():
    sht_2['a'+str(index)].value = key
    index += 1
    sht_2['a'+str(index)].options(index=False).value=value
    index += len(key) + 2
    
    print(key)
    print(value)
    print()

middle_age
           age  income student credit_rating label
2   middle_age    high      no          fair   yes
6   middle_age     low     yes     excellent   yes
11  middle_age  medium      no     excellent   yes
12  middle_age    high     yes          fair   yes

senior
       age  income student credit_rating label
3   senior  medium      no          fair   yes
4   senior     low     yes          fair   yes
5   senior     low     yes     excellent    no
9   senior  medium     yes          fair   yes
13  senior  medium      no     excellent    no

youth
      age  income student credit_rating label
0   youth    high      no          fair    no
1   youth    high      no     excellent    no
7   youth  medium      no          fair    no
8   youth     low     yes          fair   yes
10  youth  medium     yes     excellent   yes



In [125]:
sht = wb.sheets('youth')
data3 = sht.range('a1').options(pd.DataFrame, expand='table', index=False).value
data3

Unnamed: 0,income,student,credit_rating,label
0,high,no,fair,no
1,high,no,excellent,no
2,medium,no,fair,no
3,low,yes,fair,yes
4,medium,yes,excellent,yes


In [126]:
data = data3

In [127]:
y_1 = y[y == np.unique(data['label'])[0]]
y_2 = y[y == np.unique(data['label'])[1]]

p_y1 = len(y_1) / len(y)
p_y2 = len(y_2) / len(y)

print('p_y1:' , p_y1)
print('p_y2:' , p_y2)

I_y= information(np.array([p_y1])) + information(np.array([p_y2]))
print(I_y)

p_y1: 0.35714285714285715
p_y2: 0.6428571428571429
[ 0.94028596]


In [128]:
sht.range('a8').value = np.array([['p_y1:' , p_y1],['p_y2:' , p_y2], ['gain:', I_y[0]]])

In [129]:
data_2 = {}
for column in data.columns[:-1]:
    data_2_df = pd.DataFrame(np.unique(data[column]))
    
    data_2_df['num'] = np.sum(categorical(data[column].values, drop=True), axis=0)
    data_2_df['retiao'] = np.sum(categorical(data[column].values, drop=True), axis=0) / len(data)

    for y_value in np.unique(y):
        data_2_df[y_value] = [len(data.loc[data[column]==x_value].loc[data.label==y_value])/len(data.loc[data[column]==x_value]) for x_value in np.unique(data[column]) ] 
    
        data_2_df['I'+'_'+y_value] = information(data_2_df[y_value].values)
    
    data_2_df['I'] = data_2_df['I_yes'] + data_2_df['I_no'] 
       
    print(column + ':')
    print(data_2_df)
    print('\n\n')
    data_2[column] = data_2_df

income:
        0  num  retiao   no  I_no  yes  I_yes    I
0    high  2.0     0.4  1.0  -0.0  0.0    0.0  0.0
1     low  1.0     0.2  0.0   0.0  1.0   -0.0  0.0
2  medium  2.0     0.4  0.5   0.5  0.5    0.5  1.0



student:
     0  num  retiao   no  I_no  yes  I_yes    I
0   no  3.0     0.6  1.0  -0.0  0.0    0.0  0.0
1  yes  2.0     0.4  0.0   0.0  1.0   -0.0  0.0



credit_rating:
           0  num  retiao        no      I_no       yes     I_yes         I
0  excellent  2.0     0.4  0.500000  0.500000  0.500000  0.500000  1.000000
1       fair  3.0     0.6  0.666667  0.389975  0.333333  0.528321  0.918296





In [130]:
index = 1
for column in data.columns[:-1]:    
    sht['f'+str(index)].value = column
    index += 1

    sht['g'+str(index)].options(index=False).value = data_2[column]

    index +=len(data_2[column]) +2

In [131]:
data_2_gain = pd.DataFrame([I_y - np.sum(data_2[column]['I'] * data_2[column]['retiao']) for column in data.columns[:-1]], index=data.columns[:-1], columns=['Gain'])
data_2_gain

Unnamed: 0,Gain
income,0.540286
student,0.940286
credit_rating,-0.010692


In [132]:
sht['a12'].options(index=True).value = data_2_gain