In [1]:
import csv
from math import sqrt
from math import log2
from collections import Counter,defaultdict
import random

In [2]:
# 讀取 txt 檔案
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        data = [row for row in reader]
    return data
# 讀取資料
data = load_data('glass.txt')       

In [3]:
def table_to_column_dict(data, columns):
    # 初始化一個空的 dictionary，key 是 attributes 名稱，value 是空 list：
    df_dict = {col: [] for col in columns}
    # 對每一列資料逐欄掃描，同時把欄位名稱 (col) 跟對應值 (val) 配對起來
    for row in data:
        for col, val in zip(columns, row):
                val = float(val)
                df_dict[col].append(val)
    return df_dict
# 定義欄位名稱
columns = ["Id","RI","Na","Mg","Al","Si","K","Ca","Ba","Fe","class"]
features = columns.copy()
features.remove('Id')
features.remove("class")
df = table_to_column_dict(data,columns)
# 移除 Id 欄位
del df["Id"]

In [4]:
def equal_width(df, feature, bin_num):
    # 計算每組 bin 區間
    att_value = df[feature]
    max_value = float(max(att_value))
    min_value = float(min(att_value))
    
    # 計算每組區間寬度
    width = (max_value - min_value) / bin_num
    
    # 計算切分點
    bins = [min_value + i * width for i in range(1, bin_num)]
    
    print(f'{feature} with equal width discretization => width = {width}')
    print(bins)
    print("=========================================================")
    # 儲存 discretization 後的值
    bin_result = []
    for value in att_value:
        value = float(value)
        # 找到第一個大於 value 的切分點索引
        bin_index = 0
        for i, cut_value in enumerate(bins):
            if value <= cut_value:
                bin_index = i + 1
                break
            elif i == len(bins) - 1:  # 如果是最後一個切分點
                bin_index = i + 2
        bin_result.append(bin_index)
    return bin_result
def discretize_equal_width(df, features, bin_num):
    new_df = []
    bin_results = {}
    for feature in features:
        bin_results[feature] = equal_width(df, feature, bin_num)
    bin_results['class'] = df['class']
    return bin_results

# 對原始資料所有連續型變數做離散化並存到新的 dict of list 裡

equal_width_df = discretize_equal_width(df, features, 10)

RI with equal width discretization => width = 0.0022780000000000022
[1.513428, 1.515706, 1.517984, 1.520262, 1.52254, 1.524818, 1.527096, 1.529374, 1.531652]
Na with equal width discretization => width = 0.6649999999999998
[11.395, 12.06, 12.725, 13.39, 14.055, 14.719999999999999, 15.384999999999998, 16.049999999999997, 16.715]
Mg with equal width discretization => width = 0.449
[0.449, 0.898, 1.347, 1.796, 2.245, 2.694, 3.1430000000000002, 3.592, 4.041]
Al with equal width discretization => width = 0.321
[0.611, 0.9319999999999999, 1.2530000000000001, 1.574, 1.895, 2.216, 2.537, 2.858, 3.1790000000000003]
Si with equal width discretization => width = 0.5599999999999994
[70.37, 70.93, 71.49, 72.05, 72.61, 73.17, 73.73, 74.28999999999999, 74.85]
K with equal width discretization => width = 0.621
[0.621, 1.242, 1.863, 2.484, 3.105, 3.726, 4.3469999999999995, 4.968, 5.589]
Ca with equal width discretization => width = 1.076
[6.506, 7.582, 8.658, 9.734, 10.81, 11.886, 12.962, 14.038, 15.11

In [90]:
# 建立 naive bayse 模型

def NaiveBayes(df, training_set, class_name):
   
    # 計算每個類別的先驗機率 P(class_name = C_j)
    N = len(training_set[class_name])
    class_values = list(set(df[class_name]))
    prior_C = {c : training_set[class_name].count(c) / N for c in class_values}

    features = list(df.keys())
    features.remove(class_name)
    # 建立一個巢狀的 defaultdict 來儲存條件機率 P(Xi|Cj)
    # 第一層 key 是類別 Cj
    # 第二層 key 是特徵 Xi 
    # 第三層 key 是特徵值，value 是該特徵值在類別 Cj 下的條件機率
    p_Xi_Cj_dict = defaultdict(lambda: defaultdict(dict))
    # 取得所有特徵的可能的特徵值
    possible_Xi_values = {Xi: set(df[Xi]) for Xi in features}
    total_Cj = len(set(df[class_name]))
    for Cj in class_values:
        # 取出 training_set 中類別 == Cj 的樣本的位址
        Cj_index = [i for i, val in enumerate(training_set[class_name]) if val == Cj]

        for Xi in features:
            # 計算類別為 Cj 的訓練資料中，Xi 特徵可能出現的值的個數
            Xi_value_count = Counter([training_set[Xi][i] for i in Cj_index])

            # 計算 Xi 特徵每種可能的值出現在 Cj 類別的機率 (包括 training_set 未出現過的可能值)
            for value in possible_Xi_values[Xi]:
                # 若 value 不存在則預設為 0
                count = Xi_value_count.get(value, 0)
                # 做 laplace 平滑化
                p_Xi_Cj_dict[Cj][Xi][value] = (count + 1) / (total_Cj + 10)
    return prior_C, p_Xi_Cj_dict

def ensemble_model_with_bagging(df, m, training_set, test_set, class_name, test_indices):
   
    N = len(training_set[class_name])
    # 儲存 training_set 所計算出的 P(Xi | Cj) 機率
    p_Xi_Cj_dict = defaultdict(lambda: defaultdict(dict))
    prediction = defaultdict(list)
    # 集成 m 個 base models 的預測結果  
    for _ in range(m):
        # 生成 0 ~ N-1 範圍內的隨機亂數，總共生成 N 個
        # 這步驟代表 bagging 的取後放回抽樣，陣列裡的每個元素即為抽到的訓練集樣本索引
        sampled_indices = [random.randint(0, N - 1) for _ in range(N)]
     
        # 每次的 bagging sample 會成為每個 base model 所使用到的訓練集 
        bagging_sample = {key: [training_set[key][i] for i in sampled_indices] for key in training_set.keys()}
        print(sampled_indices)
        # 計算該 base model 的先驗機率 P(Cj) 與 P(Xi | Cj)
        prior_C, p_Xi_Cj_dict = NaiveBayes(df, bagging_sample, class_name)
        features = list(df.keys())
        features.remove(class_name)
        class_values = list(set(df[class_name]))

        # 預測 test_set
        for i in range(len(test_set[class_name])):
            max_prob = 0
            # 取出第 i 個 test_set 樣本的所有特徵的值
            instance = {f : test_set[f][i] for f in features} 
            predict_j = None
            # 計算該特徵值組合在 Cj 類別下發生機率
            for Cj in class_values:
                p_Xi_Cj = 1
                for Xi in features:  
                    p_Xi_Cj *=  p_Xi_Cj_dict[Cj][Xi][instance[Xi]]
                posterior_prob = prior_C[Cj] * p_Xi_Cj
                if posterior_prob > max_prob:
                    predict_j = Cj
                    max_prob = posterior_prob
            # 在原始資料中第 test_indices[i] 個位址預測為 predict_j
            prediction[test_indices[i]].append(predict_j)
            
 
    final_prediction = {}
    for index, value in prediction.items():
        # most_common() 只取出現次數最多的元素，取第 1 個 tutple 的第一個元素 (即類別值)
        # most_common() 輸出會像 'Cj' : 5
        final_prediction[index] = Counter(value).most_common()[0][0]
    
    return final_prediction

def five_folds_cv(df, base, class_name):
    n = 214
    indices = list(range(n))
     # 隨機打亂索引
    rng = random.Random(49)
    rng.shuffle(indices)
    fold_sizes = [43,43,43,43,42]
    # 儲存每個 folds 所包含的樣本索引值
    folds = [[] for _ in range(5)]
    
    # 將打亂的 index  一一分配到 folds 中
    current_index = 0
    for fold_index, fold_size in enumerate(fold_sizes):
        folds[fold_index] = indices[current_index : current_index + fold_size]
        current_index += fold_size

    # 對應到實際資料
    fold_data = []  # 儲存 five-folds 中每個 fold 的資料
    for fold_indice in folds:
        fold = {key : [df[key][i] for i in fold_indice] for key in df.keys()}
        fold_data.append(fold)
    # 進行 5-folds 預測，all_prediction 儲存五次預測的 test_set 結果
    all_prediction = {}
    for k in range(5):
        # fold_data 是一個陣列，fold_data[k] 即代表第 k 個 fold 的資料
        test_set = fold_data[k]
        # k - 1 個 folds 合併成 training_set
        training_set = {key: [] for key in df.keys()}
        for i in range(5):
            if i != k:
                # 將其餘四個fold的個別特徵值合併
                for key in df.keys():
                    training_set[key].extend(fold_data[i][key])
        # 用集成模型預測
        final_prediction = ensemble_model_with_bagging(df, base, training_set, test_set, class_name, folds[k])
        
        all_prediction[k] = final_prediction
    print(f"Ensemble model with {base} base models:")
    avg = 0
    # 計算每個 fold 的準確率
    for k in range(5):
        correct = 0
        for key, value in all_prediction[k].items():
            if value == df[class_name][key]:
                correct += 1
        print(f"Fold {k + 1} => accuracy = {correct}/{len(all_prediction[k])} = {correct/len(all_prediction[k])}")
        avg += correct/len(all_prediction[k])
    print(avg / 5)
    return

five_folds_cv(equal_width_df, 5 ,'class')

[7, 2, 86, 119, 121, 154, 80, 7, 15, 24, 126, 62, 34, 85, 144, 3, 6, 118, 55, 157, 118, 22, 48, 8, 167, 115, 43, 41, 23, 27, 52, 138, 97, 144, 10, 130, 120, 19, 167, 99, 126, 36, 47, 30, 6, 64, 80, 71, 104, 28, 126, 28, 145, 42, 140, 17, 121, 1, 116, 65, 78, 62, 104, 13, 37, 57, 77, 75, 33, 47, 92, 36, 23, 165, 101, 3, 36, 87, 46, 54, 150, 69, 21, 36, 170, 4, 149, 69, 78, 114, 121, 46, 96, 157, 5, 53, 46, 107, 164, 160, 90, 80, 24, 57, 33, 120, 104, 20, 166, 119, 154, 76, 142, 6, 34, 69, 88, 91, 149, 138, 89, 28, 97, 9, 33, 27, 27, 147, 170, 28, 81, 137, 148, 23, 43, 110, 138, 103, 73, 155, 137, 105, 65, 122, 142, 109, 131, 107, 32, 54, 118, 97, 15, 2, 128, 134, 68, 79, 116, 66, 6, 130, 59, 112, 168, 142, 22, 144, 28, 146, 8]
[125, 74, 158, 145, 33, 21, 111, 76, 11, 12, 103, 146, 135, 51, 100, 155, 160, 32, 50, 83, 58, 71, 138, 68, 130, 111, 69, 114, 10, 129, 40, 30, 57, 146, 135, 105, 81, 70, 110, 47, 37, 2, 43, 16, 42, 51, 125, 37, 3, 91, 84, 0, 53, 55, 0, 15, 5, 4, 140, 132, 96, 63,