In [None]:
import csv
from math import sqrt
from math import log2
from collections import Counter

In [87]:
# 讀取 txt 檔案
def load_csv(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        data = [row for row in reader]
    return data
# 讀取資料
data = load_csv('breast-cancer.txt')

In [None]:
data

[["'40-49'",
  "'premeno'",
  "'15-19'",
  "'0-2'",
  "'yes'",
  "'3'",
  "'right'",
  "'left_up'",
  "'no'",
  "'recurrence-events'"],
 ["'50-59'",
  "'ge40'",
  "'15-19'",
  "'0-2'",
  "'no'",
  "'1'",
  "'right'",
  "'central'",
  "'no'",
  "'no-recurrence-events'"],
 ["'50-59'",
  "'ge40'",
  "'35-39'",
  "'0-2'",
  "'no'",
  "'2'",
  "'left'",
  "'left_low'",
  "'no'",
  "'recurrence-events'"],
 ["'40-49'",
  "'premeno'",
  "'35-39'",
  "'0-2'",
  "'yes'",
  "'3'",
  "'right'",
  "'left_low'",
  "'yes'",
  "'no-recurrence-events'"],
 ["'40-49'",
  "'premeno'",
  "'30-34'",
  "'3-5'",
  "'yes'",
  "'2'",
  "'left'",
  "'right_up'",
  "'no'",
  "'recurrence-events'"],
 ["'50-59'",
  "'premeno'",
  "'25-29'",
  "'3-5'",
  "'no'",
  "'2'",
  "'right'",
  "'left_up'",
  "'yes'",
  "'no-recurrence-events'"],
 ["'50-59'",
  "'ge40'",
  "'40-44'",
  "'0-2'",
  "'no'",
  "'3'",
  "'left'",
  "'left_up'",
  "'no'",
  "'no-recurrence-events'"],
 ["'40-49'",
  "'premeno'",
  "'10-14'",
  "'0-

In [92]:
0# 定義欄位名稱
columns = ["age", "menopause", "tumor_size", "inv_nodes", "node_caps", 
           "deg_malig", "breast", "breast_quad", "irradiat", "class"]
# 轉換成字典
df = [dict(zip(columns, row)) for row in data]
df

[{'age': "'40-49'",
  'menopause': "'premeno'",
  'tumor_size': "'15-19'",
  'inv_nodes': "'0-2'",
  'node_caps': "'yes'",
  'deg_malig': "'3'",
  'breast': "'right'",
  'breast_quad': "'left_up'",
  'irradiat': "'no'",
  'class': "'recurrence-events'"},
 {'age': "'50-59'",
  'menopause': "'ge40'",
  'tumor_size': "'15-19'",
  'inv_nodes': "'0-2'",
  'node_caps': "'no'",
  'deg_malig': "'1'",
  'breast': "'right'",
  'breast_quad': "'central'",
  'irradiat': "'no'",
  'class': "'no-recurrence-events'"},
 {'age': "'50-59'",
  'menopause': "'ge40'",
  'tumor_size': "'35-39'",
  'inv_nodes': "'0-2'",
  'node_caps': "'no'",
  'deg_malig': "'2'",
  'breast': "'left'",
  'breast_quad': "'left_low'",
  'irradiat': "'no'",
  'class': "'recurrence-events'"},
 {'age': "'40-49'",
  'menopause': "'premeno'",
  'tumor_size': "'35-39'",
  'inv_nodes': "'0-2'",
  'node_caps': "'yes'",
  'deg_malig': "'3'",
  'breast': "'right'",
  'breast_quad': "'left_low'",
  'irradiat': "'yes'",
  'class': "'no-re

In [94]:
# 定義欄位名稱
columns = ["age", "menopause", "tumor_size", "inv_nodes", "node_caps", 
           "deg_malig", "breast", "breast_quad", "irradiat", "class"]
# 轉換成字典
structured_data = [dict(zip(columns, row)) for row in data]

column = [row['age'] for row in structured_data]

count = Counter(column)
print(count)
total = len(column)
print(count.items())
prob = [count_value / total for key,count_value in count.items()]
prob

Counter({"'50-59'": 91, "'40-49'": 89, "'60-69'": 55, "'30-39'": 36, "'70-79'": 5, "'20-29'": 1})
dict_items([("'40-49'", 89), ("'50-59'", 91), ("'60-69'", 55), ("'30-39'", 36), ("'70-79'", 5), ("'20-29'", 1)])


[0.3212996389891697,
 0.3285198555956679,
 0.19855595667870035,
 0.1299638989169675,
 0.018050541516245487,
 0.0036101083032490976]

In [96]:
entropy(df,'age')

2.033412518452412

In [None]:
# 計算 feature entropy
def entropy(df,feature):
    attribute_value = [row[feature] for row in df]  # 抓取特定 feature 的所有值 
    value_count = Counter(attribute_value)          # 計算所有可能值的個數
    total = len(df)
    prob = [count / total for key,count in value_count.items()]  # 計算每個 attribute_value 的機率
    return -sum(p * log2(p) for p in prob)
    
# 計算特徵 X、Y 間的 Mutual Information
def mutual_information(df,X, Y):
    # 計算 X 和 Y 的熵
    H_X = entropy(df,X)
    H_Y = entropy(df,Y)
    # 計算 X 和 Y 的聯合機率
    joint_counts = Counter((row[X], row[Y]) for row in df)  # 計算 X、Y 一起出現的次數
    total = len(df)
    joint_prob = [count / total for key,count in joint_counts.items()]
    H_X_Y = -sum(joint_prob * log2(joint_prob))
    return H_X + H_Y - H_X_Y
    
# 計算特徵 X、Y 的 symmetric uncertainty
def cal_su(df,X,Y):
    H_X = entropy(df,X)
    H_Y = entropy(df,Y)
    return 2 * (mutual_information(df,X,Y) / (H_X + H_Y))

# 計算選取的特徵子集對於類別預測的 Goodness
def Goodness(feature_subset,label):
    su_X_C = 0
    sum_su_X_Y = 0  
    # 計算 feature_subset 內所有特徵對於類別值的 Symmetric uncertainty
    su_X_C = sum(cal_su(X,label) for X in feature_subset)
    
    # 計算 feature_subset 內所有兩兩特徵間的 Symmetric uncertainty
    for feature_i in feature_subset:
        for feature_j in feature_subset:
            sum_su_X_Y += cal_su(feature_i,feature_j)
    return su_X_C / sqrt(sum_su_X_Y)


forward selection

In [None]:
# X 代表資料集的特徵集合，y 則是類別值
def forward_selection(df, X, y):
    select_features = []
    best_score = 0
    remaining_features = list(range(X.shpae(1)))
    while len(remaining_features) > 0:
        scores = []
        for feature in remaining_features:
            temp_features = select_features + [feature]
            score = Goodness(temp_features,y)
            scores.append((temp_features,scores))

        scores.sort(reverse = True)
        best_new_features,best_new_score = scores[0]
        [90,80,70]

        if best_new_score > best_score:
            best_score = best_new_score

9
