In [9]:
import csv
from math import sqrt
from collections import Counter

In [10]:
# 讀取 txt 檔案
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        data = [row for row in reader]
    return data
# 讀取資料
data = load_data('glass.txt')

In [11]:
# 定義欄位名稱
columns = ["Id","RI","Na","Mg","Al","Si","K","Ca","Ba","Fe","class"]
# 轉換成字典
df = [dict(zip(columns, row)) for row in data]

In [12]:
X = columns.copy()
X.remove("Id")
X.remove("class")
y = 'class'
X

['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']

### Equal Width

In [None]:
def equal_width(df,feature,bin_num):
    # 計算每組 bin 區間
    att_value = [row[feature] for row in df]
    max_value = float(max(att_value))
    min_value = float(min(att_value))
    # 計算每組區間寬度
    width = (max_value - min_value) / bin_num
    bins = []
    # 計算每組區間數值範圍
    for i in range(bin_num):
        start = min_value + i * width
        end = start + width
        bins.append((start,end)) 
    print(f'{feature} with equal width discretization => width = {width}')
    for i, (start, end) in enumerate(bins):
        if i == bin_num - 1:
            print(f'bin{i + 1}: {start} <= x <= {end}')
            break
        print(f'bin{i + 1}: {start} <= x < {end}')
    # 切割 attribute value
    discretization = []
    for value in att_value:
        value = float(value)
        for i, (start, end) in enumerate(bins):
            # 判斷 value 位於哪個區間
            if i != bin_num - 1:
                if value >= start and value < end:
                    discretization.append(i + 1)
            else:
                if value >= start and value <= end:
                    discretization.append(i + 1)
    return discretization

RI_split = equal_width(df,'Na',10)

Na with equal width discretization => width = 0.6649999999999998
bin1: 10.73 <= x < 11.395
bin2: 11.395 <= x < 12.059999999999999
bin3: 12.06 <= x < 12.725
bin4: 12.725 <= x < 13.389999999999999
bin5: 13.39 <= x < 14.055
bin6: 14.055 <= x < 14.719999999999999
bin7: 14.719999999999999 <= x < 15.384999999999998
bin8: 15.384999999999998 <= x < 16.049999999999997
bin9: 16.049999999999997 <= x < 16.714999999999996
bin10: 16.715 <= x <= 17.38


[5,
 5,
 5,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 3,
 4,
 4,
 4,
 3,
 4,
 3,
 6,
 5,
 4,
 4,
 7,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 3,
 4,
 4,
 3,
 3,
 4,
 5,
 4,
 6,
 6,
 4,
 3,
 4,
 5,
 4,
 5,
 4,
 5,
 4,
 5,
 5,
 4,
 5,
 4,
 4,
 3,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 6,
 5,
 5,
 4,
 4,
 4,
 4,
 7,
 5,
 4,
 4,
 4,
 4,
 5,
 4,
 5,
 4,
 4,
 4,
 5,
 4,
 6,
 4,
 4,
 5,
 4,
 3,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 3,
 3,
 4,
 4,
 3,
 3,
 5,
 5,
 2,
 1,
 3,
 6,
 5,
 1,
 1,
 3,
 5,
 4,
 5,
 4,
 5,
 4,
 5,
 4,
 4,
 4,
 5,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 4,
 4,
 3,
 4,
 6,
 5,
 5,
 4,
 4,
 5,
 5,
 5,
 5,
 4,
 5,
 6,
 5,
 4,
 2,
 1,
 3,
 4,
 4,
 5,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 6,
 6,
 6,
 7,
 6,
 6,
 10,
 5,
 6,
 5,
 7,
 8,
 5,
 7,
 6,
 7,
 6,
 6,
 5,
 6,
 6,
 7,
 7,
 2,
 7,
 7,
 7,
 7,
 7,
 6,
 6,
 6,
 7,
 6,
 6,
 6]

### Equal Frequency

In [30]:
def equal_frequency(df,feature,bin_num):
    # 計算每個區間應包含的 value 個數
    frequency = len(df) // bin_num
    # 先記錄每個 instance 的原始索引及 value
    att_value = [(i, float(row[feature])) for i, row in enumerate(df)]

    # 依照 value 值排序
    att_value.sort(key = lambda x : x[1])
    bins = []
    start = att_value[0][1]
    # 字典儲存原始索引分配到的區間
    bin_assign = {}
    bin_index = 1    # 記錄目前 bin 
    cur_bin_cnt = 0  # 記錄目前 bin 所分配到的 value 個數
    for i,(org_index,value) in enumerate(att_value):
        
        # 目前 bin 的 value 數量超過一個 bin 所應該分配到的 frequency
        if cur_bin_cnt >= frequency and bin_index <= bin_num:
            # 且當前 value 不等於前一個 value 值
            if i < len(att_value) and value != att_value[i - 1][1]:
                end = value
                bins.append((start,end))
                # 切換到下一個 bin
                bin_index += 1
                cur_bin_cnt = 0
                start = att_value[i][1]

        # 因為先前有排序過，故可以直接照順序分配 bin_index，但須指派給原始 index
        bin_assign[org_index] = bin_index
        cur_bin_cnt += 1
    
    return bins

equal_frequency(df,'RI',10)
 

[(1.51115, 1.51592),
 (1.51592, 1.51631),
 (1.51631, 1.5167),
 (1.5167, 1.51735),
 (1.51735, 1.51768),
 (1.51768, 1.51811),
 (1.51811, 1.5186),
 (1.5186, 1.51994),
 (1.51994, 1.52196),
 (1.52196, 1.52777)]