In [45]:
import csv
from math import sqrt
from math import log2
from collections import Counter

In [4]:
# 讀取 txt 檔案
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        data = [row for row in reader]
    return data
# 讀取資料
data = load_data('glass.txt')

In [5]:
# 定義欄位名稱
columns = ["Id","RI","Na","Mg","Al","Si","K","Ca","Ba","Fe","class"]
# 轉換成字典
df = [dict(zip(columns, row)) for row in data]

In [6]:
X = columns.copy()
X.remove("Id")
X.remove("class")
y = 'class'
X

['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']

### Equal Width

In [None]:
def equal_width(df,feature,bin_num):
    # 計算每組 bin 區間
    att_value = [row[feature] for row in df]
    max_value = float(max(att_value))
    min_value = float(min(att_value))
    # 計算每組區間寬度
    width = (max_value - min_value) / bin_num

    bins = []  # 儲存每組區間範圍
    # 計算每組區間數值範圍
    for i in range(bin_num):
        start = min_value + i * width
        end = start + width
        bins.append((start,end)) 
    print(f'{feature} with equal width discretization => width = {width}')
    for i, (start, end) in enumerate(bins):
        if i == bin_num - 1:
            print(f'bin{i + 1}: {start} < x <= {end}')
            break
        if i == 0:
            print(f'bin{i + 1}: {start} <= x <= {end}')
        else:
            print(f'bin{i + 1}: {start} < x <= {end}')
    print("=========================================================")
    return bins

# 針對每個 attribute 做 equal width 離散化
for feature in X:
    equal_width(df,feature,10)


RI with equal width discretization => width = 0.0022780000000000022
bin1: 1.51115 <= x <= 1.513428
bin2: 1.513428 < x <= 1.515706
bin3: 1.515706 < x <= 1.517984
bin4: 1.517984 < x <= 1.520262
bin5: 1.520262 < x <= 1.52254
bin6: 1.52254 < x <= 1.524818
bin7: 1.524818 < x <= 1.527096
bin8: 1.527096 < x <= 1.529374
bin9: 1.529374 < x <= 1.531652
bin10: 1.531652 < x <= 1.53393
Na with equal width discretization => width = 0.6649999999999998
bin1: 10.73 <= x <= 11.395
bin2: 11.395 < x <= 12.059999999999999
bin3: 12.06 < x <= 12.725
bin4: 12.725 < x <= 13.389999999999999
bin5: 13.39 < x <= 14.055
bin6: 14.055 < x <= 14.719999999999999
bin7: 14.719999999999999 < x <= 15.384999999999998
bin8: 15.384999999999998 < x <= 16.049999999999997
bin9: 16.049999999999997 < x <= 16.714999999999996
bin10: 16.715 < x <= 17.38
Mg with equal width discretization => width = 0.449
bin1: 0.0 <= x <= 0.449
bin2: 0.449 < x <= 0.898
bin3: 0.898 < x <= 1.347
bin4: 1.347 < x <= 1.796
bin5: 1.796 < x <= 2.245
bin6: 2

### Equal Frequency

In [60]:
def equal_frequency(df,feature,bin_num):
    # 計算每個區間應包含的 instances 個數
    frequency = len(df) // bin_num
    # 先記錄每個 instance 的原始索引及 value
    att_value = [(i, float(row[feature])) for i, row in enumerate(df)]
    # 依照 value 值排序
    att_value.sort(key = lambda x : x[1])

    bins = [] # 儲存每個 bin 範圍
    start = att_value[0][1]
    bin_index = 1    # 記錄目前 bin 
    cur_bin_cnt = 0  # 記錄目前 bin 所分配到的 value 個數
    # att_value 已排序過，故會由小到大遍歷
    for i,(org_index,value) in enumerate(att_value):
        # 目前 bin 的 value 數量超過一個 bin 所應該分配到的 frequency
        if cur_bin_cnt >= frequency and bin_index <= bin_num:
            # 且當前 value 不等於前一個 value 值
            if i < len(att_value) and value != att_value[i - 1][1]:
                if bin_index == 10:
                    bins.append((start,att_value[-1][1]))
                    break
                end = value  # 該 bin 的區間最大值
                bins.append((start,end))

                # 切換到下一個 bin
                bin_index += 1
                cur_bin_cnt = 0
                start = att_value[i][1]  # att_value[i][1] 為下個 bin 的起點
        # 該 bin 裡的 instances 個數加一
        cur_bin_cnt += 1

    # 上述設定在切換下個 bin 時才將 (start,end) 進 bins
    # 有可能迴圈結束，最後一個 bin 的值個數不足一個 frequency，不會切換 bin，因此需要額外判斷防止最後一個 bin 消失
    if len(bins) < bin_num:
        end = att_value[-1][1]   # end 為最後一個元素(最大值)
        bins.append((start, end))
    print(f'{feature} with equal frequency discretization:')
    for i, (start, end) in enumerate(bins):
        if i == bin_num - 1:
            print(f'bin{i + 1}: {start} < x <= {end}')
            break
        if i == 0:
            print(f'bin{i + 1}: {start} <= x <= {end}')
        else:
            print(f'bin{i + 1}: {start} < x <= {end}')
    print("=========================================================")
    return bins

# 針對每個 attribute 做 equal width 離散化
for feature in X:
    equal_frequency(df,feature,10)

RI with equal frequency discretization:
bin1: 1.51115 <= x <= 1.51592
bin2: 1.51592 < x <= 1.51631
bin3: 1.51631 < x <= 1.5167
bin4: 1.5167 < x <= 1.51735
bin5: 1.51735 < x <= 1.51768
bin6: 1.51768 < x <= 1.51811
bin7: 1.51811 < x <= 1.5186
bin8: 1.5186 < x <= 1.51994
bin9: 1.51994 < x <= 1.52196
bin10: 1.52196 < x <= 1.53393
Na with equal frequency discretization:
bin1: 10.73 <= x <= 12.68
bin2: 12.68 < x <= 12.86
bin3: 12.86 < x <= 13.01
bin4: 13.01 < x <= 13.21
bin5: 13.21 < x <= 13.34
bin6: 13.34 < x <= 13.5
bin7: 13.5 < x <= 13.75
bin8: 13.75 < x <= 14.15
bin9: 14.15 < x <= 14.7
bin10: 14.7 < x <= 17.38
Mg with equal frequency discretization:
bin1: 0.0 <= x <= 0.33
bin2: 0.33 < x <= 2.76
bin3: 2.76 < x <= 3.37
bin4: 3.37 < x <= 3.49
bin5: 3.49 < x <= 3.55
bin6: 3.55 < x <= 3.6
bin7: 3.6 < x <= 3.67
bin8: 3.67 < x <= 3.83
bin9: 3.83 < x <= 4.49
Al with equal frequency discretization:
bin1: 0.29 <= x <= 0.87
bin2: 0.87 < x <= 1.15
bin3: 1.15 < x <= 1.24
bin4: 1.24 < x <= 1.31
bin5: 

### Entropy Based

In [73]:
def entropy(df,class_label):
    class_value = [row[class_label] for row in df]
    counter = Counter(class_value)
    prob = [count / len(df) for i,count in counter.items()]
    entropy = -sum(p * log2(p) for p in prob) 
    return entropy

# cut_index 為資料要切成兩半時，「右側區間的起始 index」，此函式計算切割成左右 subset 後的資訊增益
def info_gain(df,cut_point,class_label):
    class_value = [row[class_label] for row in df]
    total = len(class_value)
    left = df[:cut_point]
    right = df[cut_point:]
    Ent_T = len(left) / total * entropy(left,class_label) + len(right) / total * entropy(right,class_label)

    info_gain = entropy(df,class_label) - Ent_T
    return info_gain


def find_cut_point(df, feature, class_label):
    best_info_gain = -1
    best_cut_index = -1
    best_cut_value = None
    # 針對選定的 attribute 依照 attribute value 由小到大排序
    sorted_df = sorted(df, key = lambda row: float(row[feature]))
    
    for i in range(len(sorted_df)):
        if sorted_df[i][class_label] != sorted_df[i - 1][class_label]:
            cut_value = (float(sorted_df[i][class_label]) + float(sorted_df[i - 1][class_label])) / 2
            cur_gain = info_gain(df,i,class_label)
            if cur_gain > best_info_gain:
                best_info_gain = cur_gain
                best_cut_index = i
                best_cut_value = cut_value
    return best_info_gain, best_cut_index, best_cut_value

find_cut_point(df,"Ba",'class')



(0.8132111061444658, 67, 1.5)