In [1]:
import csv
from math import sqrt
from math import log2
from collections import Counter,defaultdict
import random


In [2]:
# Read the comma-separated txt data file
def load_data(file_path):
    """Read a comma-separated text file into a list of rows.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.

    Returns:
        A list with one entry per record. Each entry is the list of that
        record's fields as strings; no type conversion is done here.
    """
    # newline='' hands line-ending handling to the csv module, as the csv
    # documentation requires. Without it, line endings embedded in quoted
    # fields are rewritten by the text layer before csv sees them.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        return list(csv.reader(file))
# Load the raw rows (lists of strings) from the glass data file
data = load_data('glass.txt')

In [3]:
def table_to_column_dict(data, columns):
    """Turn row-oriented records into a column-oriented dict of float lists.

    Args:
        data: Iterable of rows; each row is a sequence of values that
            ``float()`` can convert.
        columns: Column names, paired positionally with each row's fields.

    Returns:
        A dict mapping every name in ``columns`` to the list of float values
        found in that position. Because the pairing uses ``zip``, fields
        beyond ``len(columns)`` are ignored and a short row contributes only
        to the leading columns.
    """
    # Start with one empty list per column name.
    column_store = {}
    for name in columns:
        column_store[name] = []

    # Walk the rows and route each field to its column as a float.
    for record in data:
        for name, raw in zip(columns, record):
            column_store[name].append(float(raw))
    return column_store
# Column names; table_to_column_dict pairs them positionally with each row's fields
columns = ["Id","RI","Na","Mg","Al","Si","K","Ca","Ba","Fe","class"]
# Column-oriented view of the data: column name -> list of float values
df = table_to_column_dict(data,columns)

In [4]:
# Feature names: a copy of the column list with "Id" and "class" removed
X = columns.copy()
X.remove("Id")
X.remove("class")

In [5]:
def equal_width(df,feature,bin_num):
    """Discretize one numeric column into ``bin_num`` equal-width bins.

    The range [min, max] of the column is divided into ``bin_num`` intervals
    of equal width, giving ``bin_num - 1`` interior cut points. Bins are
    numbered from 1: a value goes to bin 1 when it is <= the first cut point,
    to bin k + 1 when it lies in (cut[k - 1], cut[k]], and to bin ``bin_num``
    when it is greater than the last cut point.

    The feature name, the rounded width and the cut points are printed.

    Args:
        df: Dict mapping column names to lists of numeric values.
        feature: Key of the column in ``df`` to discretize.
        bin_num: Number of bins; must be an integer >= 1.

    Returns:
        A list of bin numbers (ints in 1..bin_num), one per value of the
        column and in the same order.

    Raises:
        ValueError: If ``bin_num`` is smaller than 1, or the column is empty
            (raised by ``max``).
    """
    if bin_num < 1:
        raise ValueError(f'bin_num must be a positive integer, got {bin_num}')

    # Convert once so min/max and the comparisons below all work on floats.
    att_value = [float(value) for value in df[feature]]
    max_value = max(att_value)
    min_value = min(att_value)
    # Width of every interval
    width = (max_value - min_value) / bin_num

    # Interior cut points that separate neighbouring bins (bin_num - 1 of them)
    bins = [min_value + i * width for i in range(1, bin_num)]

    print(f'{feature} with equal width discretization => width = {round(width, 5)}')
    print(bins)
    print("=========================================================")

    # The bin number is one more than the number of cut points the value
    # exceeds. This single rule covers the first, middle and last bins and
    # also works when there are zero or one cut points (bin_num of 1 or 2).
    return [
        sum(1 for cut_point in bins if value > cut_point) + 1
        for value in att_value
    ]

def discretize_equal_width(df, features, bin_num, class_name='class'):
    """Apply equal-width discretization to several columns of ``df``.

    Args:
        df: Dict mapping column names to lists of values.
        features: Names of the columns to discretize with ``equal_width``.
        bin_num: Number of bins used for every feature.
        class_name: Key of the class column, which is carried over as is.
            Defaults to ``'class'``.

    Returns:
        A new dict holding, for each feature, its list of bin numbers, plus
        the class column under ``class_name``. The class list is the same
        list object as in ``df`` (it is not copied).
    """
    bin_results = {
        feature: equal_width(df, feature, bin_num) for feature in features
    }
    bin_results[class_name] = df[class_name]
    return bin_results

# Discretize every feature column of the raw data into 10 equal-width bins and store the result in a new dict of lists
equal_width_df = discretize_equal_width(df, X, 10)

RI with equal width discretization => width = 0.00228
[1.513428, 1.515706, 1.517984, 1.520262, 1.52254, 1.524818, 1.527096, 1.529374, 1.531652]
Na with equal width discretization => width = 0.665
[11.395, 12.06, 12.725, 13.39, 14.055, 14.719999999999999, 15.384999999999998, 16.049999999999997, 16.715]
Mg with equal width discretization => width = 0.449
[0.449, 0.898, 1.347, 1.796, 2.245, 2.694, 3.1430000000000002, 3.592, 4.041]
Al with equal width discretization => width = 0.321
[0.611, 0.9319999999999999, 1.2530000000000001, 1.574, 1.895, 2.216, 2.537, 2.858, 3.1790000000000003]
Si with equal width discretization => width = 0.56
[70.37, 70.93, 71.49, 72.05, 72.61, 73.17, 73.73, 74.28999999999999, 74.85]
K with equal width discretization => width = 0.621
[0.621, 1.242, 1.863, 2.484, 3.105, 3.726, 4.3469999999999995, 4.968, 5.589]
Ca with equal width discretization => width = 1.076
[6.506, 7.582, 8.658, 9.734, 10.81, 11.886, 12.962, 14.038, 15.114]
Ba with equal width discretization => 

In [6]:
# Build the naive Bayes model
def NaiveBayes(df, training_set, class_name):
    """Estimate naive Bayes parameters from a discretized training set.

    Args:
        df: Dict (column name -> list of values) for the full data set. Its
            keys other than ``class_name`` are the features, and the distinct
            values of each column define the possible values of that feature.
        training_set: Dict with the same columns holding the rows to fit.
            Feature values are expected to also occur in ``df``; values that
            do not are not given a probability entry.
        class_name: Key of the class column.

    Returns:
        A tuple ``(prior_C, p_Xi_Cj_dict)``:
        ``prior_C`` maps each class present in ``training_set`` to its
        relative frequency P(C_j); ``p_Xi_Cj_dict[Cj][Xi][value]`` is the
        Laplace-smoothed estimate of P(X_i = value | C_j) for every feature
        and every possible value of that feature.
    """
    labels = training_set[class_name]
    N = len(labels)

    # Prior probability P(class_name = C_j) of every class in the training set
    class_counts = Counter(labels)
    prior_C = {c: count / N for c, count in class_counts.items()}

    features = [key for key in df.keys() if key != class_name]
    # All values each feature can take, collected from the full data set
    possible_Xi_values = {Xi: set(df[Xi]) for Xi in features}

    p_Xi_Cj_dict = defaultdict(lambda: defaultdict(dict))
    for Cj in class_counts:
        # Positions of the training rows whose class is Cj
        Cj_index = [i for i in range(N) if labels[i] == Cj]
        total_Cj = len(Cj_index)

        for Xi in features:
            # How often each value of Xi occurs among the rows of class Cj
            Xi_value_count = Counter(training_set[Xi][i] for i in Cj_index)
            # Laplace smoothing: add one pseudo-count per possible value. The
            # denominator uses the same value set that is enumerated below,
            # so the probabilities of a feature sum to 1.
            n_values = len(possible_Xi_values[Xi])
            # This loop must sit inside the feature loop so that every
            # feature, not only the last one, gets its probability table.
            for value in possible_Xi_values[Xi]:
                p_Xi_Cj_dict[Cj][Xi][value] = (Xi_value_count[value] + 1) / (total_Cj + n_values)
    return prior_C, p_Xi_Cj_dict

In [None]:
def ensemble_model_with_bagging(df, m, training_set, test_set,class_name):
    """Predict classes with a bagged ensemble of naive Bayes models.

    ``m`` models are trained, each on a bootstrap sample (drawn with
    replacement, same size as the training set) of ``training_set``. Every
    model votes for the class with the largest posterior score on each test
    instance, and the most frequent vote is the final prediction.

    Args:
        df: Dict (column name -> list of values) for the full discretized
            data set; passed to ``NaiveBayes`` to define the features and
            their possible values.
        m: Number of models in the ensemble; must be >= 1.
        training_set: Dict of columns with the training rows.
        test_set: Dict of columns with the rows to classify. Feature values
            are expected to occur in ``df``; an unseen value raises KeyError.
        class_name: Key of the class column.

    Returns:
        A list with one predicted class per test instance, in test-set order.

    Raises:
        ValueError: If ``m`` is smaller than 1.
    """
    if m < 1:
        raise ValueError(f'm must be a positive integer, got {m}')

    N = len(training_set[class_name])
    # Number of test instances = length of any column of the test set
    n_test = len(next(iter(test_set.values()), []))
    features = [key for key in df.keys() if key != class_name]

    # votes[t] collects the class predicted for test instance t by each model
    votes = [[] for _ in range(n_test)]

    for _ in range(m):
        # Bootstrap sample: N row positions drawn with replacement
        sampled_indices = [random.randint(0, N - 1) for _ in range(N)]
        bagging_sample = {key: [training_set[key][i] for i in sampled_indices] for key in training_set.keys()}
        prior_C, p_Xi_Cj_dict = NaiveBayes(df, bagging_sample, class_name)

        for test in range(n_test):
            instance = {f: test_set[f][test] for f in features}
            predict_j = None
            max_prob = 0
            # Only classes present in this bootstrap sample have a prior and
            # a probability table, so iterate over those.
            for Cj, prior in prior_C.items():
                p_Xi_Cj = 1
                for Xi in features:
                    p_Xi_Cj *= p_Xi_Cj_dict[Cj][Xi][instance[Xi]]
                posterior_prob = prior * p_Xi_Cj
                if posterior_prob > max_prob:
                    predict_j = Cj
                    max_prob = posterior_prob
            votes[test].append(predict_j)

    # most_common(1) returns [(value, count)]; keep the value with most votes
    return [Counter(instance_votes).most_common(1)[0][0] for instance_votes in votes]

In [16]:
def five_folds_cv(df, class_name, n_folds=5):
    """Randomly split a column dict into folds for cross-validation.

    The row positions are shuffled with ``random.shuffle`` and dealt out into
    ``n_folds`` consecutive slices whose sizes differ by at most one; the
    first ``n % n_folds`` folds get the extra row (214 rows and 5 folds give
    sizes 43, 43, 43, 43, 42).

    Args:
        df: Dict mapping column names to equally long lists of values.
        class_name: Key of the class column; its length is taken as the
            number of rows.
        n_folds: Number of folds to create. Defaults to 5.

    Returns:
        A list of ``n_folds`` dicts with the same keys as ``df``, each
        holding the rows assigned to that fold.

    Raises:
        ValueError: If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError(f'n_folds must be a positive integer, got {n_folds}')

    n = len(df[class_name])
    indices = list(range(n))
    # Shuffle the row positions
    random.shuffle(indices)

    base_size, remainder = divmod(n, n_folds)
    fold_sizes = [base_size + 1 if i < remainder else base_size for i in range(n_folds)]

    # Cut the shuffled positions into consecutive slices and map each slice
    # back to the actual data, column by column
    fold_data = []
    current_index = 0
    for fold_size in fold_sizes:
        fold_indices = indices[current_index : current_index + fold_size]
        current_index += fold_size
        fold_data.append({key: [df[key][i] for i in fold_indices] for key in df.keys()})
    return fold_data



In [17]:
# Split the discretized data into five folds; as the cell's last expression, the full list of folds is echoed below
five_folds_cv(equal_width_df,'class')

[{'RI': [3,
   4,
   3,
   4,
   3,
   5,
   3,
   3,
   7,
   4,
   3,
   3,
   1,
   3,
   4,
   3,
   3,
   4,
   3,
   4,
   3,
   4,
   3,
   3,
   3,
   5,
   6,
   3,
   4,
   5,
   3,
   5,
   3,
   3,
   7,
   4,
   5,
   3,
   5,
   3,
   5,
   4,
   3],
  'Na': [7,
   4,
   4,
   5,
   7,
   5,
   4,
   4,
   1,
   4,
   4,
   4,
   4,
   5,
   5,
   4,
   3,
   4,
   5,
   4,
   4,
   5,
   4,
   3,
   6,
   2,
   5,
   5,
   5,
   1,
   7,
   5,
   3,
   5,
   5,
   4,
   4,
   7,
   4,
   4,
   5,
   4,
   4],
  'Mg': [1,
   8,
   9,
   4,
   1,
   9,
   8,
   9,
   1,
   7,
   8,
   8,
   8,
   8,
   9,
   8,
   8,
   9,
   4,
   8,
   8,
   1,
   8,
   8,
   1,
   5,
   9,
   9,
   9,
   4,
   1,
   10,
   8,
   8,
   1,
   9,
   1,
   9,
   9,
   7,
   8,
   7,
   8],
  'Al': [7,
   4,
   3,
   5,
   5,
   2,
   4,
   4,
   2,
   4,
   4,
   3,
   3,
   5,
   3,
   5,
   3,
   4,
   5,
   4,
   4,
   1,
   5,
   4,
   6,
   4,
   1,
   3,
   3,
   4,
   6,
   3,
   4,
