In [22]:
import csv
from math import sqrt
from math import log2
from math import log
from collections import Counter

In [23]:
# Read the comma-separated txt dataset.
def load_data(file_path):
    """Read a comma-separated text file into a list of rows.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        list[list[str]]: One list of field strings per non-blank line, in
        file order. Fields are parsed with the default ``csv.reader``
        dialect, so single-quote characters stay part of the value.
    """
    # newline='' lets the csv module handle line endings itself, as its
    # documentation requires for file objects passed to csv.reader.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        # csv.reader yields [] for blank lines (e.g. a trailing empty line);
        # drop them so every returned row actually carries fields.
        return [row for row in csv.reader(file) if row]
# Load the raw rows from the dataset file (path is relative to the working directory).
data = load_data('breast-cancer.txt')

In [24]:
# Column names; zip() below pairs them positionally with the fields of each row.
columns = ["age", "menopause", "tumor_size", "inv_nodes", "node_caps", 
           "deg_malig", "breast", "breast_quad", "irradiat", "class"]
# Convert every row into a dict keyed by column name.
# NOTE: despite its name, `df` is a plain list of dicts, not a pandas DataFrame.
df = [dict(zip(columns, row)) for row in data]
# Preview the first two records.
df[0:2]

[{'age': "'40-49'",
  'menopause': "'premeno'",
  'tumor_size': "'15-19'",
  'inv_nodes': "'0-2'",
  'node_caps': "'yes'",
  'deg_malig': "'3'",
  'breast': "'right'",
  'breast_quad': "'left_up'",
  'irradiat': "'no'",
  'class': "'recurrence-events'"},
 {'age': "'50-59'",
  'menopause': "'ge40'",
  'tumor_size': "'15-19'",
  'inv_nodes': "'0-2'",
  'node_caps': "'no'",
  'deg_malig': "'1'",
  'breast': "'right'",
  'breast_quad': "'central'",
  'irradiat': "'no'",
  'class': "'no-recurrence-events'"}]

### 定義各式計算公式

In [25]:
# Shannon entropy of a single feature column.
def entropy(df, feature):
    """Return the Shannon entropy, in bits, of one feature's observed values.

    Args:
        df: Sequence of row mappings; every row is indexed with ``feature``.
        feature: Key of the column whose value distribution is measured.

    Returns:
        ``-sum(p * log2(p))`` over the distinct values of the column, where
        ``p`` is the value's count divided by ``len(df)``. The result is 0
        when ``df`` is empty.
    """
    n_rows = len(df)
    # Count how often each distinct value occurs (first-seen order is kept).
    frequencies = Counter(record[feature] for record in df)
    weighted_logs = []
    for occurrences in frequencies.values():
        share = occurrences / n_rows  # empirical probability of this value
        weighted_logs.append(share * log2(share))
    return -sum(weighted_logs)
    
# Mutual information between features X and Y.
def mutual_information(df, X, Y):
    """Return the mutual information I(X; Y), in bits, of two columns.

    Computed as ``H(X) + H(Y) - H(X, Y)``, where the marginal entropies come
    from ``entropy`` and the joint entropy is estimated from how often each
    ``(row[X], row[Y])`` pair occurs in ``df``.

    Args:
        df: Sequence of row mappings indexed with ``X`` and ``Y``.
        X: Key of the first column.
        Y: Key of the second column.

    Returns:
        The value ``H(X) + H(Y) - H(X, Y)``.
    """
    # Marginal entropies of the two columns.
    marginal_x = entropy(df, X)
    marginal_y = entropy(df, Y)

    # Joint entropy from the co-occurrence counts of (X value, Y value) pairs.
    n_rows = len(df)
    pair_counts = Counter((record[X], record[Y]) for record in df)
    joint_terms = []
    for occurrences in pair_counts.values():
        joint_p = occurrences / n_rows
        joint_terms.append(joint_p * log2(joint_p))
    joint_entropy = -sum(joint_terms)

    return marginal_x + marginal_y - joint_entropy
    
# Symmetric uncertainty between features X and Y.
def cal_su(df, X, Y):
    """Return the symmetric uncertainty ``2 * I(X; Y) / (H(X) + H(Y))``.

    Args:
        df: Sequence of row mappings indexed with ``X`` and ``Y``.
        X: Key of the first column.
        Y: Key of the second column.

    Returns:
        The symmetric uncertainty of the two columns, or ``0.0`` when both
        entropies are zero (both columns are constant, or ``df`` is empty).
        In that case the ratio is undefined and would otherwise raise
        ``ZeroDivisionError``.
    """
    denominator = entropy(df, X) + entropy(df, Y)
    # Two constant columns carry no information about each other; report 0
    # instead of dividing by zero.
    if denominator == 0:
        return 0.0
    return 2 * (mutual_information(df, X, Y) / denominator)

# Goodness of a selected feature subset for predicting the class label.
def Goodness(df, feature_subset, label):
    """Score a feature subset by relevance to the label versus redundancy.

    The score is::

        sum(SU(X, label) for X in subset)
        / sqrt(sum(SU(Xi, Xj) for every ordered pair (Xi, Xj) in subset))

    where SU is ``cal_su``. The pairwise sum runs over every ordered pair,
    including each feature paired with itself.
    NOTE(review): with SU(X, X) == 1 this denominator equals
    k + k(k-1) * mean pairwise SU, which looks like the CFS merit heuristic.
    Confirm that is the intended formula.

    Args:
        df: Sequence of row mappings indexed with the feature names and ``label``.
        feature_subset: Iterable of feature names to evaluate.
        label: Name of the class column.

    Returns:
        The score as a float. ``0.0`` is returned for an empty subset or when
        the pairwise sum is not positive. Those cases previously raised
        ``ZeroDivisionError``.
    """
    # An empty subset has no relevance and no defined redundancy term.
    if not feature_subset:
        return 0.0

    # Relevance: symmetric uncertainty of every selected feature with the label.
    su_X_C = sum(cal_su(df, X, label) for X in feature_subset)

    # Redundancy: symmetric uncertainty over all ordered feature pairs.
    # Accumulated in a plain loop so the floating-point result is unchanged.
    sum_su_X_Y = 0
    for feature_i in feature_subset:
        for feature_j in feature_subset:
            sum_su_X_Y += cal_su(df, feature_i, feature_j)

    # Guard the degenerate case (e.g. only constant features) where the
    # redundancy term is zero and the ratio is undefined.
    if sum_su_X_Y <= 0:
        return 0.0
    return su_X_C / sqrt(sum_su_X_Y)


In [26]:
# Candidate attribute columns: a copy of `columns` with the class column removed.
X = columns.copy()
X.remove('class')
# Name of the class (label) column.
Y = 'class'

### forward selection

In [None]:
# X is the list of candidate features; y is the class column.
# forward_selection returns the feature subset with the best Goodness it finds.

def forward_selection(df, X, y):
    """Greedy forward selection of features driven by ``Goodness``.

    Starting from an empty subset, each pass tries adding every remaining
    feature and keeps the addition with the highest Goodness. The search
    stops when no addition strictly improves on the best score so far, or
    when no features remain.

    Args:
        df: Sequence of row mappings passed through to ``Goodness``.
        X: List of candidate feature names (copied, never modified).
        y: Name of the class column.

    Returns:
        tuple: ``(selected_features, best_score)``. Progress of each pass and
        the final result are also printed.
    """
    chosen = []              # best subset found so far
    top_score = 0.0          # Goodness of `chosen`
    candidates = X.copy()    # features not selected yet
    round_no = 1             # pass counter used in the progress output

    while candidates:
        # Evaluate `chosen` extended by each remaining feature in turn.
        trials = []
        for name in candidates:
            subset = chosen + [name]
            # (candidate subset, newly added feature, Goodness of the subset)
            trials.append((subset, name, Goodness(df, subset, y)))

        # Stable descending sort: on ties the earliest candidate wins.
        ranked = sorted(trials, key=lambda trial: trial[2], reverse=True)
        winner_subset, winner_name, winner_score = ranked[0]
        winner_score = float(winner_score)

        # No candidate beats the previous pass: stop searching.
        if not winner_score > top_score:
            break

        top_score = winner_score
        chosen = winner_subset
        candidates.remove(winner_name)
        print(f"Pass{round_no}: best_feature_subset = {chosen} , Goodness = {top_score}")
        round_no += 1

    print(f"\nFinal select features: {chosen}, Goodness = {top_score}")
    return chosen, top_score

In [28]:
# Run forward selection over all candidate features; the cell displays the
# returned (selected features, Goodness) tuple after the printed progress.
forward_selection(df,X,Y)

Pass1: best_feature_subset = ['inv_nodes'] , Goodness = 0.07664021945356746
Pass2: best_feature_subset = ['inv_nodes', 'deg_malig'] , Goodness = 0.10145042616557834
Pass3: best_feature_subset = ['inv_nodes', 'deg_malig', 'node_caps'] , Goodness = 0.10948734827994715
Pass4: best_feature_subset = ['inv_nodes', 'deg_malig', 'node_caps', 'irradiat'] , Goodness = 0.11169589476594623
Pass5: best_feature_subset = ['inv_nodes', 'deg_malig', 'node_caps', 'irradiat', 'tumor_size'] , Goodness = 0.11218633643681937

Final select features: ['inv_nodes', 'deg_malig', 'node_caps', 'irradiat', 'tumor_size'], Goodness = 0.11218633643681937


(['inv_nodes', 'deg_malig', 'node_caps', 'irradiat', 'tumor_size'],
 0.11218633643681937)

### backward selection

In [29]:
def backward_selection(df, X, y):
    """Greedy backward elimination of features driven by ``Goodness``.

    Starting from all features in ``X``, each pass tries removing every
    feature in turn and keeps the removal with the highest Goodness. The
    search stops when no removal strictly improves on the current subset's
    score, or when a single feature is left.

    Args:
        df: Sequence of row mappings passed through to ``Goodness``.
        X: Iterable of candidate feature names (copied, never modified).
        y: Name of the class column.

    Returns:
        tuple: ``(selected_features, best_score)``, where ``best_score`` is
        the Goodness of the returned subset (``0.0`` for an empty ``X``).
        Progress of each pass and the final result are also printed.
    """
    # Work on a copy so the caller's list is neither aliased nor mutated.
    select_features = list(X)
    # Baseline is the Goodness of the full set. Starting from 0.0 would make
    # the first removal always look like an improvement and would report a
    # wrong score when nothing is removed.
    best_score = float(Goodness(df, select_features, y)) if select_features else 0.0
    i = 1  # pass counter used in the progress output

    # Keep going until only one feature is left.
    while len(select_features) > 1:
        scores = []  # Goodness of every subset obtained by dropping one feature
        for feature in select_features:
            temp_features = select_features.copy()
            temp_features.remove(feature)  # drop exactly one feature
            # (remaining subset, removed feature, Goodness of the subset)
            scores.append((temp_features, feature, Goodness(df, temp_features, y)))

        # Stable descending sort: on ties the earliest candidate wins.
        scores.sort(key=lambda x: x[2], reverse=True)
        best_subset, removed_feature, best_new_score = scores[0]
        best_new_score = float(best_new_score)

        # No removal beats the current subset: stop searching.
        if not best_new_score > best_score:
            break

        # Removing this feature improved Goodness; adopt the smaller subset.
        # (It already excludes the removed feature, so no further removal is needed.)
        best_score = best_new_score
        select_features = best_subset
        print(f"Pass{i}: best_feature_subset = {select_features} , Goodness = {best_score}")
        print(f"\nremove feature: {removed_feature}\n")
        i += 1

    print(f"\nFinal select features: {select_features}, Goodness = {best_score}")
    return select_features, best_score

In [30]:
# Run backward elimination starting from all candidate features; the cell
# displays the returned (selected features, Goodness) tuple after the printed progress.
backward_selection(df,X,Y)

Pass1: best_feature_subset = ['age', 'menopause', 'tumor_size', 'inv_nodes', 'node_caps', 'deg_malig', 'breast_quad', 'irradiat'] , Goodness = 0.09719419884125785

remove feature: breast

Pass2: best_feature_subset = ['age', 'tumor_size', 'inv_nodes', 'node_caps', 'deg_malig', 'breast_quad', 'irradiat'] , Goodness = 0.10245371563812918

remove feature: menopause

Pass3: best_feature_subset = ['age', 'tumor_size', 'inv_nodes', 'node_caps', 'deg_malig', 'irradiat'] , Goodness = 0.10821070155745492

remove feature: breast_quad

Pass4: best_feature_subset = ['tumor_size', 'inv_nodes', 'node_caps', 'deg_malig', 'irradiat'] , Goodness = 0.11218633643681936

remove feature: age


Final select features: ['tumor_size', 'inv_nodes', 'node_caps', 'deg_malig', 'irradiat'], Goodness = 0.11218633643681936


(['tumor_size', 'inv_nodes', 'node_caps', 'deg_malig', 'irradiat'],
 0.11218633643681936)