In [216]:
import csv
from math import sqrt
from math import log2
from collections import Counter

In [217]:
# Read the comma-separated text file
def load_data(file_path):
    """Read a comma-separated text file into a list of rows.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.

    Returns:
        A list of rows, each row being the list of string fields produced by
        ``csv.reader``. Blank lines are skipped.
    """
    # newline='' lets the csv module do its own line-ending handling, which is
    # what the csv documentation requires for file objects given to csv.reader.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        # csv.reader yields [] for a blank line; dropping those keeps every
        # returned row aligned with the column list applied to it later.
        return [row for row in csv.reader(file) if row]
# Load the raw rows (list of lists of strings) from glass.txt
data = load_data('glass.txt')

In [218]:
# Convert row-oriented data (list of lists) into column-oriented data (dict of lists)
def table_to_column_dict(data, columns, convert_numeric = True):
    """Pivot a list of rows into a ``{column name: [values...]}`` dictionary.

    Args:
        data: Iterable of rows; each row is an iterable of cell values.
        columns: Column names, paired positionally with the cells of each row.
        convert_numeric: When true, every cell that ``float()`` accepts is
            stored as a float (this includes numeric-looking identifiers and
            labels); cells that raise ``ValueError`` are stored unchanged.

    Returns:
        A dict with one list per column name. Cells beyond ``len(columns)``
        and columns beyond the length of a row are skipped, because the
        pairing is done with ``zip``.
    """
    def _coerce(cell):
        # Best-effort numeric conversion; non-numeric text is kept as is.
        if not convert_numeric:
            return cell
        try:
            return float(cell)
        except ValueError:
            return cell

    table = {name: [] for name in columns}
    for record in data:
        for name, cell in zip(columns, record):
            table[name].append(_coerce(cell))
    return table
# Column names, in the order the fields appear in each row of the file
columns = ["Id","RI","Na","Mg","Al","Si","K","Ca","Ba","Fe","class"]
# Column-oriented view of the data: {column name: list of values}
df = table_to_column_dict(data,columns)

In [219]:
# Candidate feature columns: every column except the row identifier and the label
X = [name for name in columns if name not in ("Id", "class")]
# Name of the label column
y = 'class'

### Feature Selection 函式

In [220]:
# Entropy of a single column
def entropy(df,feature):
    """Return the base-2 entropy of the observed values of one column.

    Args:
        df: Mapping from column name to the list of that column's values.
        feature: Name of the column to measure.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value in ``df[feature]``.
    """
    column = df[feature]
    n_rows = len(column)
    # Relative frequency of every distinct value, in first-seen order
    probabilities = [freq / n_rows for freq in Counter(column).values()]
    return -sum(q * log2(q) for q in probabilities)

# Mutual information between columns X and Y
def mutual_information(df,X, Y):
    """Return the mutual information ``H(X) + H(Y) - H(X, Y)`` of two columns.

    Args:
        df: Mapping from column name to the list of that column's values.
        X: Name of the first column.
        Y: Name of the second column.

    Returns:
        The sum of the two column entropies minus the entropy of the
        row-wise ``(x, y)`` pairs.
    """
    xs, ys = df[X], df[Y]
    marginal_x = entropy(df, X)
    marginal_y = entropy(df, Y)

    # Joint distribution: relative frequency of each (x, y) pair observed in a row
    n_rows = len(xs)
    pair_freq = Counter(zip(xs, ys))
    pair_prob = [freq / n_rows for freq in pair_freq.values()]
    joint = -sum(q * log2(q) for q in pair_prob)

    return marginal_x + marginal_y - joint
    
# Symmetric uncertainty between columns X and Y
def cal_su(df,X,Y):
    """Return the symmetric uncertainty ``2 * I(X; Y) / (H(X) + H(Y))``.

    Args:
        df: Mapping from column name to the list of that column's values.
        X: Name of the first column.
        Y: Name of the second column.

    Returns:
        The symmetric uncertainty, or 0 when both entropies are zero (the
        ratio would otherwise divide by zero).
    """
    h_x = entropy(df, X)
    h_y = entropy(df, Y)
    if h_x != 0 or h_y != 0:
        shared = mutual_information(df, X, Y)
        return 2 * (shared / (h_x + h_y))
    return 0

# Goodness of a feature subset for predicting the class
def Goodness(df,feature_subset,label):
    """Score a feature subset by relevance to the label over redundancy.

    Args:
        df: Mapping from column name to the list of that column's values.
        feature_subset: Names of the features in the subset.
        label: Name of the class column.

    Returns:
        The sum of the feature-to-label symmetric uncertainties divided by
        the square root of the sum of the symmetric uncertainties over all
        ordered feature pairs (self-pairs included). Returns 0 when that
        pairwise sum is 0.
    """
    # Relevance: how much each feature tells us about the label
    relevance = sum(cal_su(df, name, label) for name in feature_subset)

    # Redundancy: how much the features tell us about each other
    redundancy = 0
    for first in feature_subset:
        for second in feature_subset:
            redundancy += cal_su(df, first, second)

    return relevance / sqrt(redundancy) if redundancy != 0 else 0

def forward_selection(df, X, y):
    """Greedy forward feature selection driven by ``Goodness``; prints its trace.

    Starting from an empty subset, each pass tries adding every remaining
    feature, keeps the addition with the highest Goodness, and stops as soon
    as the best addition does not strictly improve on the current score.

    Args:
        df: Mapping from column name to the list of that column's values.
        X: List of candidate feature names (not modified).
        y: Name of the class column.

    Returns:
        None. The per-pass and final results are printed.
    """
    chosen = []           # best subset found so far
    top_goodness = 0.0    # Goodness of `chosen`
    pool = X.copy()       # features that have not been selected yet
    round_no = 1
    while pool:
        # Each trial is (subset tried, feature added, Goodness of that subset)
        trials = []
        for name in pool:
            subset = chosen + [name]
            trials.append((subset, name, Goodness(df, subset, y)))

        # Stable descending sort: on ties the earliest candidate wins
        ranked = sorted(trials, key=lambda trial: trial[2], reverse=True)
        winner_subset, winner_name, winner_raw = ranked[0]
        winner_goodness = float(winner_raw)
        print("Forward Selection:")
        # No candidate beats the previous pass: stop
        if not winner_goodness > top_goodness:
            break
        top_goodness = winner_goodness
        chosen = winner_subset
        if winner_name in pool:
            pool.remove(winner_name)
        print(f"Pass{round_no}: best_feature_subset = {chosen} , Goodness = {top_goodness}")
        round_no += 1
    print(f"Final select features: {chosen}, Goodness = {top_goodness}")

def backward_selection(df, X, y):
    """Greedy backward feature elimination driven by ``Goodness``; prints its trace.

    Starting from the full feature list, each pass tries removing every
    feature in turn and keeps the removal with the highest Goodness, as long
    as that score is at least as good as the current one. The search stops
    when every removal makes the score worse or one feature is left.

    Args:
        df: Mapping from column name to the list of that column's values.
        X: List of candidate feature names (not modified).
        y: Name of the class column.

    Returns:
        None. The per-pass and final results are printed.
    """
    # Work on a copy so the caller's list can never be touched.
    select_features = list(X)
    # Baseline is the Goodness of the full set. Starting from 0.0 would make
    # the first removal unconditional and would report a score of 0.0 when
    # the loop never runs (zero or one feature).
    best_score = float(Goodness(df, select_features, y))
    i = 1
    # Keep going until a single feature is left
    while len(select_features) > 1:
        scores = []
        for feature in select_features:
            # Try the current subset with one feature removed
            temp_features = select_features.copy()
            temp_features.remove(feature)
            score = Goodness(df, temp_features, y)
            # (subset tried, feature removed, Goodness of that subset)
            scores.append((temp_features, feature, score))

        # Stable descending sort: on ties the earliest candidate wins
        scores.sort(key = lambda x: x[2], reverse = True)
        best_subset, removed_feature, best_raw = scores[0]
        best_new_score = float(best_raw)
        print("Backward Selection:")
        # Every removal is worse than the previous pass: stop
        if not best_new_score >= best_score:
            break
        best_score = best_new_score
        select_features = best_subset  # already excludes removed_feature
        print(f"Pass{i}: best_feature_subset = {select_features} , Goodness = {best_score}")
        print(f"remove feature: {removed_feature}")
        i += 1
    print(f"Final select features: {select_features}, Goodness = {best_score}")

### Equal Width

In [221]:
def equal_width(df,feature,bin_num):
    """Discretize one numeric column into ``bin_num`` equal-width bins.

    The range ``[min, max]`` of the column is divided by ``bin_num - 1``
    evenly spaced cut points. Bin 1 holds the values up to and including the
    first cut point, bin ``k`` the values in ``(cut[k-2], cut[k-1]]``, and the
    last bin everything above the last cut point. The width and the cut
    points are printed.

    Args:
        df: Mapping from column name to the list of that column's values.
        feature: Name of the column to discretize.
        bin_num: Number of bins; must be at least 1.

    Returns:
        A list of 1-based bin numbers, one per input value, in input order.

    Raises:
        ValueError: If ``bin_num`` is smaller than 1 or the column is empty.
    """
    if bin_num < 1:
        raise ValueError("bin_num must be a positive integer")

    # Convert first so min/max compare numbers even if the cells are strings
    att_value = [float(value) for value in df[feature]]
    max_value = max(att_value)
    min_value = min(att_value)
    # Width of every bin
    width = (max_value - min_value) / bin_num

    # Interior cut points separating neighbouring bins
    bins = [min_value + i * width for i in range(1, bin_num)]
    print(f'{feature} with equal width discretization => width = {width}')
    print(bins)
    print("=========================================================")

    # The bin number is one more than the count of cut points strictly below
    # the value. Every value gets exactly one bin, including values that sit
    # exactly on a cut point and the cases of one or two bins or zero width.
    return [1 + sum(1 for cut_value in bins if cut_value < value)
            for value in att_value]

def discretize_equal_width(df, features, bin_num):
    """Apply equal-width discretization to several columns.

    Args:
        df: Mapping from column name to the list of that column's values;
            it must contain a 'class' column.
        features: Names of the columns to discretize.
        bin_num: Number of bins used for every column.

    Returns:
        A new dict holding the bin numbers of each listed feature plus the
        original 'class' list (the same list object, not a copy).
    """
    discretized = {name: equal_width(df, name, bin_num) for name in features}
    discretized['class'] = df['class']
    return discretized

# Discretize every continuous feature of the raw data into 10 equal-width bins
# and keep the result in a new dict of lists
equal_width_df = discretize_equal_width(df, X, 10)

RI with equal width discretization => width = 0.0022780000000000022
[1.513428, 1.515706, 1.517984, 1.520262, 1.52254, 1.524818, 1.527096, 1.529374, 1.531652]
Na with equal width discretization => width = 0.6649999999999998
[11.395, 12.06, 12.725, 13.39, 14.055, 14.719999999999999, 15.384999999999998, 16.049999999999997, 16.715]
Mg with equal width discretization => width = 0.449
[0.449, 0.898, 1.347, 1.796, 2.245, 2.694, 3.1430000000000002, 3.592, 4.041]
Al with equal width discretization => width = 0.321
[0.611, 0.9319999999999999, 1.2530000000000001, 1.574, 1.895, 2.216, 2.537, 2.858, 3.1790000000000003]
Si with equal width discretization => width = 0.5599999999999994
[70.37, 70.93, 71.49, 72.05, 72.61, 73.17, 73.73, 74.28999999999999, 74.85]
K with equal width discretization => width = 0.621
[0.621, 1.242, 1.863, 2.484, 3.105, 3.726, 4.3469999999999995, 4.968, 5.589]
Ca with equal width discretization => width = 1.076
[6.506, 7.582, 8.658, 9.734, 10.81, 11.886, 12.962, 14.038, 15.11

In [222]:
# Run forward and backward selection on the equal-width discretized data set
forward_selection(equal_width_df, X, 'class')
print("=================================================================================================")
backward_selection(equal_width_df, X, 'class')

Forward Selection:
Pass1: best_feature_subset = ['Ba'] , Goodness = 0.3042912813744425
Forward Selection:
Pass2: best_feature_subset = ['Ba', 'Mg'] , Goodness = 0.39168312637244257
Forward Selection:
Pass3: best_feature_subset = ['Ba', 'Mg', 'Ca'] , Goodness = 0.3973212810619197
Forward Selection:
Pass4: best_feature_subset = ['Ba', 'Mg', 'Ca', 'Na'] , Goodness = 0.405125062147965
Forward Selection:
Pass5: best_feature_subset = ['Ba', 'Mg', 'Ca', 'Na', 'Al'] , Goodness = 0.4111914788502334
Forward Selection:
Final select features: ['Ba', 'Mg', 'Ca', 'Na', 'Al'], Goodness = 0.4111914788502334
Backward Selection:
Pass1: best_feature_subset = ['Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe'] , Goodness = 0.3929712216460038
remove feature: RI
Backward Selection:
Pass2: best_feature_subset = ['Na', 'Mg', 'Al', 'K', 'Ca', 'Ba', 'Fe'] , Goodness = 0.40295299406768176
remove feature: Si
Backward Selection:
Pass3: best_feature_subset = ['Na', 'Mg', 'Al', 'K', 'Ca', 'Ba'] , Goodness = 0.4094585798

### Equal Frequency

In [223]:
def equal_frequency(df,feature,bin_num):
    """Discretize one numeric column into at most ``bin_num`` equal-frequency bins.

    The sorted values are walked in order; a bin is closed once it holds at
    least ``len(values) // bin_num`` values and the next value differs from
    the previous one, so equal values always land in the same bin. Fewer than
    ``bin_num`` bins are produced when the data has too few distinct values.
    The upper bounds of all bins except the last are printed.

    Args:
        df: Mapping from column name to the list of that column's values.
        feature: Name of the column to discretize.
        bin_num: Requested number of bins; must be at least 1.

    Returns:
        A list of 1-based bin numbers, one per input value, in input order.

    Raises:
        ValueError: If ``bin_num`` is smaller than 1.
        IndexError: If the column is empty.
    """
    if bin_num < 1:
        raise ValueError("bin_num must be a positive integer")

    # Only the sorted values are needed to place the bin boundaries
    sorted_values = sorted(float(value) for value in df[feature])
    # Minimum number of instances a bin must hold before it may be closed
    frequency = len(sorted_values) // bin_num
    bins = []                  # (start, end) per bin; end is inclusive
    start = sorted_values[0]
    bin_index = 1              # 1-based number of the bin being filled
    cur_bin_cnt = 0            # instances placed in the current bin so far
    for cur, value in enumerate(sorted_values):
        # cur > 0: the first value has no predecessor. Without this guard a
        # frequency of 0 (bin_num > number of rows) would compare it with
        # sorted_values[-1] and open a bogus bin spanning the whole range.
        if (cur > 0 and cur_bin_cnt >= frequency
                and value != sorted_values[cur - 1]):
            if bin_index == bin_num:
                # Last allowed bin: it absorbs everything up to the maximum
                bins.append((start, sorted_values[-1]))
                break
            end = sorted_values[cur - 1]   # inclusive upper bound of this bin
            bins.append((start, end))

            # Move on to the next bin, which starts just above `end`
            bin_index += 1
            cur_bin_cnt = 0
            start = end
        cur_bin_cnt += 1

    # A bin is only stored when the next one is opened, so the bin that was
    # still being filled when the loop ended has to be added here.
    if len(bins) < bin_num:
        bins.append((start, sorted_values[-1]))

    print(f'{feature} with equal frequency discretization:')
    print([end for (_, end) in bins[:-1]])
    print("=========================================================")

    # Assign each original value to its bin: the first bin is [start, end],
    # every later bin is (start, end].
    bin_result = []
    for value in df[feature]:
        value = float(value)
        for i, (start, end) in enumerate(bins):
            above_start = value >= start if i == 0 else value > start
            if above_start and value <= end:
                bin_result.append(i + 1)
                break
    return bin_result


def discretize_equal_frequency(df, features, bin_num):
    """Apply equal-frequency discretization to several columns.

    Args:
        df: Mapping from column name to the list of that column's values;
            it must contain a 'class' column.
        features: Names of the columns to discretize.
        bin_num: Requested number of bins for every column.

    Returns:
        A new dict holding the bin numbers of each listed feature plus the
        original 'class' list (the same list object, not a copy).
    """
    discretized = {name: equal_frequency(df, name, bin_num) for name in features}
    discretized['class'] = df['class']
    return discretized

# Discretize every continuous feature of the raw data into 10 equal-frequency bins
equal_frequency_df = discretize_equal_frequency(df,X,10)


RI with equal frequency discretization:
[1.5159, 1.51629, 1.51667, 1.51732, 1.51766, 1.51808, 1.51852, 1.51977, 1.52177]
Na with equal frequency discretization:
[12.67, 12.85, 13.0, 13.2, 13.33, 13.49, 13.73, 14.14, 14.56]
Mg with equal frequency discretization:
[0.0, 2.72, 3.36, 3.48, 3.54, 3.59, 3.66, 3.82]
Al with equal frequency discretization:
[0.83, 1.14, 1.23, 1.3, 1.38, 1.51, 1.58, 1.8, 2.12]
Si with equal frequency discretization:
[71.77, 72.12, 72.38, 72.65, 72.78, 72.89, 73.01, 73.11, 73.28]
K with equal frequency discretization:
[0.0, 0.11, 0.33, 0.52, 0.56, 0.58, 0.61, 0.66, 1.41]
Ca with equal frequency discretization:
[7.96, 8.11, 8.32, 8.44, 8.6, 8.78, 9.02, 9.57, 10.88]
Ba with equal frequency discretization:
[0.0, 0.76]
Fe with equal frequency discretization:
[0.0, 0.11, 0.19, 0.32]


In [224]:
# Run forward and backward selection on the equal-frequency discretized data set
forward_selection(equal_frequency_df, X, 'class')
print("=================================================================================================")
backward_selection(equal_frequency_df, X, 'class')

Forward Selection:
Pass1: best_feature_subset = ['Mg'] , Goodness = 0.24851240850960837
Forward Selection:
Pass2: best_feature_subset = ['Mg', 'Ba'] , Goodness = 0.3223383424643906
Forward Selection:
Pass3: best_feature_subset = ['Mg', 'Ba', 'Al'] , Goodness = 0.3611469799776129
Forward Selection:
Pass4: best_feature_subset = ['Mg', 'Ba', 'Al', 'RI'] , Goodness = 0.3833761038884968
Forward Selection:
Pass5: best_feature_subset = ['Mg', 'Ba', 'Al', 'RI', 'K'] , Goodness = 0.3960303164266635
Forward Selection:
Pass6: best_feature_subset = ['Mg', 'Ba', 'Al', 'RI', 'K', 'Ca'] , Goodness = 0.39697499942233605
Forward Selection:
Pass7: best_feature_subset = ['Mg', 'Ba', 'Al', 'RI', 'K', 'Ca', 'Na'] , Goodness = 0.3973256378646354
Forward Selection:
Final select features: ['Mg', 'Ba', 'Al', 'RI', 'K', 'Ca', 'Na'], Goodness = 0.3973256378646354
Backward Selection:
Pass1: best_feature_subset = ['RI', 'Na', 'Mg', 'Al', 'K', 'Ca', 'Ba', 'Fe'] , Goodness = 0.39160854990249794
remove feature: Si
Ba

### Entropy Based

In [225]:
# Class entropy of the instances inside an interval
def Ent(df,class_label):
    """Return the base-2 entropy of the class labels in ``df``.

    Args:
        df: Mapping that holds the label list under ``class_label``.
        class_label: Name of the class column.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label.
    """
    labels = df[class_label]
    label_freq = Counter(labels)
    shares = [freq / len(labels) for freq in label_freq.values()]
    return -sum(q * log2(q) for q in shares)

# Information gain obtained by splitting the data at `midpoint`
def info_gain(df, midpoint, feature, class_label):
    """Return the class-entropy reduction of splitting ``feature`` at ``midpoint``.

    Args:
        df: Mapping from column name to the list of that column's values.
        midpoint: Cut value; instances with ``value <= midpoint`` go left,
            instances with ``value > midpoint`` go right.
        feature: Name of the column being split.
        class_label: Name of the class column.

    Returns:
        ``Ent(df)`` minus the size-weighted entropies of the two sides, or 0
        when either side is empty (not a usable cut).
    """
    labels = df[class_label]
    n_total = len(labels)

    lower, upper = [], []
    for idx in range(len(df[feature])):
        cell = float(df[feature][idx])
        if cell <= midpoint:
            lower.append(labels[idx])
        elif cell > midpoint:
            upper.append(labels[idx])

    # An empty side means the cut does not separate anything
    if not (lower and upper):
        return 0

    weighted = ((len(lower) / n_total) * Ent({class_label: lower}, class_label)
                + (len(upper) / n_total) * Ent({class_label: upper}, class_label))
    return Ent(df, class_label) - weighted

# Find the best cut point of a feature
def find_cut_point(df, feature, class_label):
    """Search the cut value of ``feature`` with the highest information gain.

    Candidate cuts are the midpoints between consecutive instances (sorted by
    feature value) whose class labels differ.

    Args:
        df: Mapping from column name to the list of that column's values.
        feature: Name of the column to cut.
        class_label: Name of the class column.

    Returns:
        A ``(best gain, best cut value)`` tuple; ``(-1, None)`` when there is
        no candidate cut.
    """
    # (value, label) pairs ordered by value
    pairs = sorted(
        ((float(df[feature][i]), df[class_label][i]) for i in range(len(df[feature]))),
        key=lambda pair: pair[0],
    )

    top_gain, top_cut = -1, None
    for (prev_value, prev_label), (value, label) in zip(pairs, pairs[1:]):
        # Only a change of class between neighbours is worth cutting at
        if label != prev_label:
            candidate = (value + prev_value) / 2
            gain = info_gain(df, candidate, feature, class_label)
            if gain > top_gain:
                top_gain, top_cut = gain, candidate

    return top_gain, top_cut

# Recursively collect the entropy-based cut points of one feature
def split(df,feature,class_label,cut_points):
    """Recursively find the entropy-based cut points of ``feature``.

    The best cut of the current interval is accepted only when its
    information gain exceeds the MDLPC threshold
    ``log2(N - 1) / N + delta / N`` with
    ``delta = log2(3**k - 2) - (k*Ent(S) - k1*Ent(S1) - k2*Ent(S2))``;
    accepted cuts are appended to ``cut_points`` and both sub-intervals are
    split again.

    Args:
        df: Mapping holding the value list under ``feature`` and the label
            list under ``class_label`` for the current interval.
        feature: Name of the column to cut.
        class_label: Name of the class column.
        cut_points: List that accumulates the accepted cut values; ``None``
            is treated as a new empty list.

    Returns:
        The ``cut_points`` list (unsorted). It is returned in every case, so
        an interval without any acceptable cut yields the list unchanged
        instead of ``None``.
    """
    if cut_points is None:
        cut_points = []

    # Nothing to cut: at most one instance, a single class, or a single value.
    # These checks are cheap, so they run before the cut-point search.
    if (len(df[class_label]) <= 1 or len(set(df[class_label])) == 1
            or len(set(df[feature])) == 1):
        return cut_points

    best_info_gain, best_cut_value = find_cut_point(df, feature, class_label)
    if best_cut_value is None:
        return cut_points

    # Partition the interval at the best cut value
    left_set = {feature: [], class_label: []}
    right_set = {feature: [], class_label: []}
    for i in range(len(df[feature])):
        value = float(df[feature][i])
        target = left_set if value <= best_cut_value else right_set
        target[feature].append(value)
        target[class_label].append(df[class_label][i])

    # A cut that leaves one side empty does not divide the interval
    if not left_set[feature] or not right_set[feature]:
        return cut_points

    # MDLPC stopping criterion
    k = len(set(df[class_label]))          # classes in the whole interval
    k1 = len(set(left_set[class_label]))   # classes left of the cut
    k2 = len(set(right_set[class_label]))  # classes right of the cut
    N = len(df[feature])
    H_S = Ent(df, class_label)
    H_l = Ent(left_set, class_label)
    H_r = Ent(right_set, class_label)
    delta = log2(3 ** k - 2) - (k * H_S - k1 * H_l - k2 * H_r)
    threshold = log2(N - 1) / N + delta / N

    # The best cut does not pay for itself: stop refining this interval
    if best_info_gain <= threshold:
        return cut_points

    cut_points.append(best_cut_value)
    # Keep splitting both sides; they append to the same list
    split(left_set, feature, class_label, cut_points)
    split(right_set, feature, class_label, cut_points)
    return cut_points

In [226]:
# Discretize the features with the entropy-based cut points
def discretize_entropy_base(df,features,class_label):
    """Apply entropy-based discretization to several columns.

    For every feature the cut points returned by ``split`` are sorted and
    each value is mapped to a 1-based bin: bin 1 for values up to and
    including the first cut, bin ``k`` for values in ``(cut[k-2], cut[k-1]]``
    and the last bin for values above the last cut. A feature without cut
    points is mapped entirely to bin 1. The cut points are printed.

    Args:
        df: Mapping from column name to the list of that column's values.
        features: Names of the columns to discretize.
        class_label: Name of the class column; it drives the cut search and
            is copied into the result under the same name.

    Returns:
        A new dict holding the bin numbers of each listed feature plus the
        original label list (the same list object, not a copy).
    """
    bin_results = {}
    for feature in features:
        print(f'{feature} with entropy base discretization:')
        # `or []` also covers a split() that reports "no cut" as None
        cut_points = sorted(split(df, feature, class_label, []) or [])
        # Bin number is one more than the count of cut points strictly below
        # the value; with no cut points every value lands in bin 1.
        ent_base_res = []
        for value in df[feature]:
            value = float(value)
            ent_base_res.append(1 + sum(1 for cut_point in cut_points if cut_point < value))
        bin_results[feature] = ent_base_res
        print(f"{cut_points}")
        print("=========================================================")
    bin_results[class_label] = df[class_label]
    return bin_results
# Discretize every continuous feature of the raw data with entropy-based cut points
entropy_base_df = discretize_entropy_base(df,X,'class')

RI with entropy base discretization:
[1.517335, 1.517985]
Na with entropy base discretization:
[14.065]
Mg with entropy base discretization:
[2.6950000000000003]
Al with entropy base discretization:
[1.38, 1.76]
Si with entropy base discretization:
[]
K with entropy base discretization:
[0.055, 0.61, 0.745]
Ca with entropy base discretization:
[7.02, 8.28, 10.075]
Ba with entropy base discretization:
[0.335]
Fe with entropy base discretization:
[]


In [227]:
# Run forward and backward selection on the entropy-based discretized data set
forward_selection(entropy_base_df, X, 'class')
print("=================================================================================================")
backward_selection(entropy_base_df, X, 'class')

Forward Selection:
Pass1: best_feature_subset = ['Mg'] , Goodness = 0.37040112253047947
Forward Selection:
Pass2: best_feature_subset = ['Mg', 'Al'] , Goodness = 0.44111979963245035
Forward Selection:
Pass3: best_feature_subset = ['Mg', 'Al', 'Ca'] , Goodness = 0.4684573941293161
Forward Selection:
Pass4: best_feature_subset = ['Mg', 'Al', 'Ca', 'Ba'] , Goodness = 0.4923013144646698
Forward Selection:
Pass5: best_feature_subset = ['Mg', 'Al', 'Ca', 'Ba', 'K'] , Goodness = 0.5084334732749055
Forward Selection:
Final select features: ['Mg', 'Al', 'Ca', 'Ba', 'K'], Goodness = 0.5084334732749055
Backward Selection:
Pass1: best_feature_subset = ['RI', 'Na', 'Mg', 'Al', 'K', 'Ca', 'Ba', 'Fe'] , Goodness = 0.5106082375130458
remove feature: Si
Backward Selection:
Pass2: best_feature_subset = ['RI', 'Na', 'Mg', 'Al', 'K', 'Ca', 'Ba'] , Goodness = 0.5106082375130458
remove feature: Fe
Backward Selection:
Final select features: ['RI', 'Na', 'Mg', 'Al', 'K', 'Ca', 'Ba'], Goodness = 0.510608237513