# NYCU Machine Learning 2024: HW5 — Sequential Forward Selection and Fisher's Criterion

In [1]:
import numpy as np
import pandas as pd
import json
from dataclasses import dataclass
from pathlib import Path
from sklearn import datasets
from sklearn.utils import  Bunch
from sklearn.metrics import confusion_matrix
from dataclasses import dataclass, field
from lib import Lda
from rich import print
from tqdm import tqdm
from typing import Any, Callable

## Load data

In [2]:
# Load scikit-learn's built-in breast-cancer dataset (returned as a dict-like Bunch).
data = datasets.load_breast_cancer()
# Show which fields the loaded object exposes.
list(data.keys())

['data',
 'target',
 'frame',
 'target_names',
 'DESCR',
 'feature_names',
 'filename',
 'data_module']

In [3]:
# print(data["DESCR"])

In [4]:
# Preview of the raw data: one row per sample, every feature as a column,
# plus a human-readable `target` column (0 -> 'malignant', 1 -> 'benign').
df = pd.DataFrame(data.data, columns=data.feature_names)
df = df.assign(
    target=pd.Series(data.target, index=df.index).map({0: 'malignant', 1: 'benign'})
)
df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,malignant
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,malignant
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,malignant
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,malignant
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,malignant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,malignant
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,malignant
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,malignant
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,malignant


In [5]:
# Class labels forwarded to the LDA model configuration (positive / negative class).
POSITIVE_LABEL, NEGATIVE_LABEL = "malignant", "benign"


In [6]:
# Folder (relative to the working directory) that receives the JSON result files.
output_folder = Path("output")
# exist_ok=True keeps the cell idempotent, so re-running it never raises.
output_folder.mkdir(parents=True, exist_ok=True)

In [7]:
@dataclass
class FeatureSelectionDataset:
    """Two halves of a labelled DataFrame, used for 2-fold evaluation.

    The halves are stored under the keys ``"before"`` (leading rows) and
    ``"after"`` (trailing rows). Training on one key means testing on the
    other; ``TEST_INV_2_FOLD`` encodes that pairing.
    """

    # fold name -> DataFrame holding the selected feature columns plus the target column
    data : dict[str, pd.DataFrame]
    # Deliberately left un-annotated so the dataclass machinery treats it as a
    # shared class constant rather than an instance field.
    TEST_INV_2_FOLD = {"before": "after", "after": "before"}

    def train_data(self, key:str) -> pd.DataFrame:
        """Return the half stored under ``key`` (the training half for that fold)."""
        return self.data[key]

    def test_data(self, key:str) -> pd.DataFrame:
        """Return the half opposite to ``key`` (the test half for that fold)."""
        return self.data[self.TEST_INV_2_FOLD[key]]

    @staticmethod
    def _preprocess_data(data_in:pd.DataFrame, column_name:list[str], target_label:str) -> tuple[np.ndarray, np.ndarray]:
        """Split ``data_in`` into a feature matrix and a target vector (NumPy arrays)."""
        input_data = data_in[column_name].to_numpy()
        target = data_in[target_label].to_numpy()

        return input_data, target

    @classmethod
    def build_dataset_for_2_fold(cls, df_in: pd.DataFrame, column_name:list[str], target_label:str = "target") :
        """Build the two folds from ``df_in`` by cutting it in the middle.

        Args:
            df_in: Source frame containing the feature columns and the target column.
            column_name: Feature columns to keep. Any sequence of column labels is
                accepted (list, tuple, NumPy array, ...).
            target_label: Name of the target column, kept alongside the features.

        Returns:
            A dataset whose ``"before"`` fold holds the leading rows and whose
            ``"after"`` fold holds the remaining rows. For an odd row count the
            extra row goes to ``"before"``. Rows are taken in their existing
            order; no shuffling is applied.
        """
        # Materialise as a list: `sequence + [target_label]` raises for tuples and
        # is not list concatenation at all for NumPy arrays.
        columns = [*column_name, target_label]

        n = len(df_in)
        n_test = int(n * 0.5)
        n_train = n - n_test

        # .iloc makes the positional (row-count based) split explicit.
        before_data = df_in.iloc[:n_train][columns]
        after_data = df_in.iloc[n_train:][columns]

        return cls(
            data = {
                "before": before_data,
                "after": after_data,
            }
        )
        
        

In [8]:
@dataclass
class FeatureSelectionResult:
    selection_method_name:str
    best_result: dict
    run_log: list[dict] = field(repr=False)
    other : dict | None = None
    
    @classmethod
    def build_result(cls,name:str, run_result:list[tuple], key:Callable = lambda x: x[2], other:Any = None) :
        format_like = ["length", "features", "balanced_cr", "Avg acc"]
        
        run_log = [
            dict(zip(format_like,item)) 
            for item in run_result
        ]
        best_result = max(run_result, key=key)
        
        best_result = dict(zip(format_like, best_result))
        
        other = other
        
        return cls(name, best_result, run_log, other)
    
    def save_to_json(self, folder:Path) -> None:
        folder = folder / f"{self.selection_method_name}.json"
        with open(folder, 'w') as f:
            json.dump(self.__dict__, f, indent=4)

In [9]:
class FeatureSelection:
    """Feature-selection experiments scored by a 2-fold evaluation of ``Lda``.

    Two strategies are offered: greedy Sequential Forward Selection
    (``sequential_forward_selection``) and ranking by Fisher's criterion
    (``fisher_criterion``). Both return a ``FeatureSelectionResult``.
    """

    def __init__(self, data:"Bunch", model_config:dict, target_name:str="target"):
        """Store the dataset, build the working DataFrame and keep the model config.

        Args:
            data: Dataset object exposing ``data``, ``target`` and ``feature_names``.
            model_config: Keyword arguments forwarded to ``Lda(...)`` for every fit.
            target_name: Name of the label column added to the working DataFrame.
        """
        self.data = data
        self.data_feature_names = data.feature_names
        self.target_name = target_name

        self.df = FeatureSelection.build_df_from_data(self.data, self.target_name)
        self.model_config = model_config

        # Factory: column subset -> 2-fold dataset over this instance's DataFrame.
        self.build_dataset = lambda column_name: FeatureSelectionDataset.build_dataset_for_2_fold(self.df, column_name, self.target_name)

    @staticmethod
    def build_df_from_data(data_in: "Bunch", target_name:str):
        """Return a DataFrame of all features plus a string label column.

        Integer targets are mapped as 0 -> 'malignant', 1 -> 'benign'.
        """
        df = pd.DataFrame(data_in.data, columns=data_in.feature_names)
        df[target_name] = data_in.target
        df[target_name] = df[target_name].map({0: 'malignant', 1: 'benign'})
        return df

    @staticmethod
    def balance_cr(y_true:np.ndarray, y_pred:np.ndarray)-> float:
        """Balanced classification rate: the mean of the two per-class recalls.

        The value is symmetric in the two classes, so it does not depend on
        which label the confusion matrix treats as "positive".

        Raises:
            ValueError: if the labels seen in ``y_true``/``y_pred`` do not form
                exactly two classes (the confusion matrix is then not 2x2).
        """
        cm = confusion_matrix(y_true, y_pred)
        if cm.shape != (2, 2):
            # Without this guard the unpacking below fails with an opaque
            # "not enough/too many values to unpack" message.
            raise ValueError(
                f"balance_cr needs exactly two classes, got a confusion matrix of shape {cm.shape}"
            )
        tn, fp, fn, tp = cm.ravel()

        balance_cr = (tp / (tp + fn) +  tn / (tn + fp)) / 2
        return balance_cr

    def balance_with_2_fold(self, dataset:"FeatureSelectionDataset")-> tuple[float, float]:
        """Train/test on both fold orientations and average the scores.

        Returns:
            ``(mean balanced classification rate, mean accuracy)`` over the two folds.
        """
        balance, acc = [], []

        for state in ("before", "after"):
            # A fresh model per fold guarantees nothing fitted on one half can
            # carry over into the other fold (whether Lda.fit resets its own
            # state is not visible from this file).
            model = Lda(**self.model_config)

            train = dataset.train_data(state)
            test = dataset.test_data(state)
            # 1-D labels: what the metric expects, instead of a one-column frame.
            y_test = test[self.target_name]

            model.fit(data_in=train, column_name=self.target_name)
            y_pred = model.predict_with_df(test, column_name=self.target_name)

            balance.append(FeatureSelection.balance_cr(y_test, y_pred))
            acc.append(model.acc(test, self.target_name))

        # Plain floats honour the annotation and print/serialise cleanly.
        return float(np.mean(balance)), float(np.mean(acc))

    def sequential_forward_selection_term(self,features_pocket:set[str], using_features:list[str]):
        """Evaluate adding each remaining feature once and return the best addition.

        Args:
            features_pocket: Candidate features not selected yet.
            using_features: Features already selected, in selection order.

        Returns:
            Dict with ``new_feature``, ``max_features`` (selected + new),
            ``max_score`` (balanced rate) and ``avg_acc``.

        Raises:
            ValueError: if ``features_pocket`` is empty.
        """
        if not features_pocket:
            raise ValueError("features_pocket is empty: no candidate feature left to evaluate")

        feature_with_score = []

        # Set iteration order of strings changes between interpreter runs (hash
        # randomisation), and max() keeps the first of several tied candidates.
        # Sorting makes tie-breaking, and therefore the whole run, reproducible.
        for feature_name in sorted(features_pocket):
            build_data_column = using_features + [feature_name]

            dataset = self.build_dataset(build_data_column)
            balance, acc = self.balance_with_2_fold(dataset)

            feature_with_score.append((build_data_column, balance, acc))

        max_features, max_score, avg_acc = max(feature_with_score, key=lambda x:x[1])

        return {"new_feature":max_features[-1], "max_features":max_features, "max_score":max_score, "avg_acc":avg_acc}

    def sequential_forward_selection(self) -> "FeatureSelectionResult":
        """
        Sequential Forward Selection (SFS)

        Starting from no features, repeatedly add the single feature that gives
        the highest balanced classification rate, until every feature is used.
        One record per subset size is logged; the best one is reported.
        """
        results = []

        features_pocket = set(map(str, self.data_feature_names))
        using_features = []
        for i in tqdm(range(len(self.data_feature_names)), desc="SFS", unit="feature"):

            term_result = self.sequential_forward_selection_term(features_pocket, using_features)

            # add result
            results.append((i+1, term_result["max_features"], term_result["max_score"], term_result["avg_acc"]))

            # update
            features_pocket.remove(term_result["new_feature"])
            using_features.append(term_result["new_feature"])

        return FeatureSelectionResult.build_result(
            name=FeatureSelection.sequential_forward_selection.__name__,
            run_result=results,
            key=lambda x:x[2],
        )

    def fisher_criterion_score(self, feature_name:str, target_label:str)-> float:
        """Fisher score of one feature, computed on this instance's DataFrame.

        score = sum_k n_k * (mean_k - mean)^2  /  sum_k (n_k / N) * var_k

        The numerator is weighted by class size and the denominator by class
        proportion, so the value is N times the usual size-weighted ratio; the
        ranking of features is unaffected by that constant factor.

        Returns:
            The score as a float, or ``0.0`` when the within-class variance is zero.
        """
        # Use the instance's own frame, not a notebook-level variable, so the
        # method works on a fresh kernel and for any dataset passed to __init__.
        values = self.df[feature_name]
        labels = self.df[target_label]

        overall_mean = values.mean()
        n_total = len(self.df)
        sb , sw = 0.0, 0.0

        for class_ in labels.unique():
            data_class = values[labels == class_]
            data_class_mean = data_class.mean()
            data_class_std = data_class.std(ddof=0)

            sb += len(data_class) * (data_class_mean - overall_mean) ** 2
            sw += (len(data_class) / n_total) *  data_class_std ** 2

        if sw == 0:
            # Zero within-class variance: the ratio is undefined, score it as 0.
            return 0.0

        return float(sb / sw)

    def fisher_criterion(self) -> "FeatureSelectionResult":
        """Rank features by Fisher score, then evaluate the top-k subsets for every k.

        The per-feature scores (highest first) are attached to the result under
        ``other["fisher_scores"]``.
        """
        # find the f score of each feature
        fisher_scores = []
        for feature_name in tqdm(self.data_feature_names, desc="Fisher Score", unit="feature"):
            fisher_score = self.fisher_criterion_score(feature_name, self.target_name)
            fisher_scores.append((str(feature_name), fisher_score))

        # sort the feature by f score
        fisher_scores = sorted(fisher_scores, key=lambda x:x[1], reverse=True)
        fisher_scores_dict = dict(fisher_scores)

        ordered_features = list(fisher_scores_dict.keys())

        results = []
        # select the top k features
        for i in tqdm(range(len(self.data_feature_names)), desc="Fisher Score top k", unit="features"):

            # build dataset with the top k features
            feature_names = ordered_features[:i+1]
            dataset = self.build_dataset(feature_names)
            balance, acc = self.balance_with_2_fold(dataset)
            results.append((i+1, feature_names, balance, acc))

        return FeatureSelectionResult.build_result(
            name=FeatureSelection.fisher_criterion.__name__,
            run_result=results,
            key=lambda x:x[2],
            other={"fisher_scores": fisher_scores_dict}
        )
       
    

In [10]:
# Bind the loaded dataset to the selector; the model_config entries are passed
# as keyword arguments to Lda for every fit.
# NOTE(review): c1/c2 are presumably per-class weights/costs of lib.Lda — confirm.
selector = FeatureSelection(
    data=data,
    model_config=dict(
        positive_class=POSITIVE_LABEL,
        negative_class=NEGATIVE_LABEL,
        c1=1,
        c2=1,
    ),
)

In [11]:
# Greedy Sequential Forward Selection: one 2-fold evaluation per candidate feature per step.
result = selector.sequential_forward_selection()

SFS: 100%|██████████| 30/30 [00:05<00:00,  5.95feature/s]


In [12]:
# `print` is rich.print (see the imports); run_log is hidden from the repr, so only the summary is shown.
print(result)

In [13]:
# Persist the SFS result as output/sequential_forward_selection.json.
result.save_to_json(output_folder)

In [14]:
# Fisher's criterion: rank features by score, then evaluate the top-k subsets.
# NOTE: this rebinds `result`; the SFS result is only available from its JSON file afterwards.
result = selector.fisher_criterion()

Fisher Score: 100%|██████████| 30/30 [00:00<00:00, 391.80feature/s]
Fisher Score top k:   0%|          | 0/30 [00:00<?, ?features/s]

Fisher Score top k: 100%|██████████| 30/30 [00:00<00:00, 93.49features/s]


In [15]:
# Summary of the Fisher run (best subset plus the per-feature scores stored in `other`).
print(result)

In [16]:
# Persist the Fisher result as output/fisher_criterion.json.
result.save_to_json(output_folder)