In [87]:
import csv
from math import sqrt
from math import log2
from math import log
from collections import Counter

In [88]:
# 讀取 txt 檔案
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        data = [row for row in reader]
    return data
# 讀取資料
data = load_data('glass.txt')

In [89]:
# 定義欄位名稱
columns = ["Id","RI","Na","Mg","Al","Si","K","Ca","Ba","Fe","class"]
# 轉換成字典
df = [dict(zip(columns, row)) for row in data]

In [90]:
X = columns.copy()
X.remove("Id")
X.remove("class")
y = 'class'
X

['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']

### Equal Width

In [122]:
def equal_width(df,feature,bin_num):
    # 計算每組 bin 區間
    att_value = [row[feature] for row in df]
    max_value = float(max(att_value))
    min_value = float(min(att_value))
    # 計算每組區間寬度
    width = (max_value - min_value) / bin_num
    bins = []
    # 計算每組區間數值範圍
    for i in range(bin_num):
        start = min_value + i * width
        end = start + width
        bins.append((start,end))  
    print(f'{feature} with equal width discretization => width = {width}')
    for i, (start, end) in enumerate(bins):
        if i == bin_num - 1:
            print(f'bin{i + 1}: {start} <= x <= {end}')
            break
        print(f'bin{i + 1}: {start} <= x < {end}')
    # 切割 attribute value
    discretization = []
    for value in att_value:
        value = float(value)
        for i, (start, end) in enumerate(bins):
            # 判斷 value 位於哪個區間
            if i != bin_num - 1:
                if value >= start and value < end:
                    discretization.append(i + 1)
            else:
                if value >= start and value <= end:
                    discretization.append(i + 1)
    return discretization

RI_split = equal_width(df,'RI',10)

print("The split results for the first five RI attribute values:")
for i, value in enumerate(RI_split):
    if i > 5:
        break
    print(f"{value}")

RI with equal width discretization => width = 0.0022780000000000022
bin1: 1.51115 <= x < 1.513428
bin2: 1.513428 <= x < 1.515706
bin3: 1.515706 <= x < 1.517984
bin4: 1.517984 <= x < 1.520262
bin5: 1.520262 <= x < 1.52254
bin6: 1.52254 <= x < 1.524818
bin7: 1.524818 <= x < 1.527096
bin8: 1.527096 <= x < 1.529374
bin9: 1.529374 <= x < 1.531652
bin10: 1.531652 <= x <= 1.53393
The split results for the first five RI attribute values:
5
3
3
3
3
3


### Equal Frequency

In [123]:
def equal_frequency(df,feature,bin_num):
    # 計算每個區間應包含的 value 個數
    frequency = len(df) // bin_num
    # 先記錄每個 instance 的原始索引及 value
    att_value = [(i, float(row[feature])) for i, row in enumerate(df)]
    # 依照 value 值排序
    att_value.sort(key = lambda x : x[1])

    # 字典儲存原始索引分配到的區間
    bin_assign = {}
    bin_index = 1    # 記錄目前 bin 
    cur_bin_cnt = 0  # 記錄目前 bin 所分配到的 value 個數
    for i,(org_index,value) in enumerate(att_value):
        # 目前 bin 的 value 數量超過一個 bin 所應該分配到的 frequency
        if cur_bin_cnt >= frequency and bin_index <= bin_num:
            # 且當前 value 不等於前一個 value 值
            if i < len(att_value) and value != att_value[i - 1][1]:
                # 切換到下一個 bin
                bin_index += 1
                cur_bin_cnt = 0
        # 因為先前有排序過，故可以直接照順序分配 bin_index，但須指派給原始 index
        bin_assign[org_index] = bin_index
        cur_bin_cnt += 1
    return bin_assign

equal_frequency(df,'Na',10)

 

{106: 1,
 111: 1,
 166: 1,
 110: 1,
 105: 1,
 165: 1,
 201: 1,
 149: 1,
 97: 1,
 107: 1,
 101: 1,
 55: 1,
 89: 1,
 28: 1,
 33: 1,
 14: 1,
 102: 1,
 112: 1,
 167: 1,
 30: 1,
 98: 1,
 16: 2,
 34: 2,
 41: 2,
 94: 2,
 10: 2,
 44: 2,
 164: 2,
 37: 2,
 100: 2,
 22: 2,
 5: 2,
 40: 2,
 138: 2,
 11: 2,
 15: 2,
 23: 2,
 20: 2,
 79: 2,
 31: 2,
 32: 2,
 142: 2,
 145: 2,
 174: 2,
 13: 3,
 80: 3,
 154: 3,
 168: 3,
 27: 3,
 57: 3,
 139: 3,
 12: 3,
 137: 3,
 91: 3,
 90: 3,
 121: 3,
 125: 3,
 126: 3,
 99: 3,
 175: 3,
 25: 3,
 56: 3,
 144: 3,
 9: 3,
 77: 3,
 136: 3,
 143: 3,
 172: 3,
 88: 4,
 19: 4,
 74: 4,
 75: 4,
 96: 4,
 171: 4,
 155: 4,
 66: 4,
 67: 4,
 29: 4,
 72: 4,
 83: 4,
 114: 4,
 68: 4,
 92: 4,
 53: 4,
 150: 4,
 7: 4,
 46: 4,
 135: 4,
 51: 4,
 124: 4,
 141: 4,
 3: 5,
 26: 5,
 42: 5,
 48: 5,
 54: 5,
 122: 5,
 86: 5,
 93: 5,
 116: 5,
 148: 5,
 81: 5,
 120: 5,
 4: 5,
 169: 5,
 35: 5,
 6: 5,
 118: 5,
 69: 5,
 134: 5,
 140: 5,
 147: 5,
 160: 5,
 73: 6,
 85: 6,
 95: 6,
 24: 6,
 173: 6,
 59: 6,
 87: 