# NYCU Machine Learning 2024 : HW4 SVM


In [25]:
from itertools import combinations
from dataclasses import dataclass, field

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

from lib import SupportVectorMachine, IrisDataset
from rich import print

In [26]:
# Print NumPy arrays with 4 digits of floating-point precision.
np.set_printoptions(precision=4)

In [27]:
# Load the Iris data as a DataFrame; the rendered output below shows four
# feature columns plus a string "Label" column.
# NOTE(review): with_name=True presumably selects class names (e.g. "Setosa")
# rather than numeric labels -- confirm in lib.IrisDataset.
df = IrisDataset.load_iris_file(with_name=True)
df

Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width,Label
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [28]:
@dataclass
class SvmModelModule:
    """Binary SVM for one pair of class labels.

    Wraps a ``SupportVectorMachine`` and converts between the string labels
    found in the ``"Label"`` column and the +1 / -1 targets handed to the
    model (``positive_class`` <-> 1, ``negative_class`` <-> -1).
    """

    pair: tuple[str, str]
    model: SupportVectorMachine
    positive_class: str
    negative_class: str

    # Vectorized lookups: label -> +1/-1 and +1/-1 -> label.
    label_to_index_np: np.vectorize = field(repr=False)
    index_to_label_np: np.vectorize = field(repr=False)

    @classmethod
    def build(cls, pair: tuple[str, str], C: int, kernel_name: str, kernel_arg: dict, threshold: float) -> "SvmModelModule":
        """Create the underlying model and the label mappings for ``pair``.

        Args:
            pair: ``(positive_class, negative_class)`` label names.
            C: forwarded to ``SupportVectorMachine``.
            kernel_name: forwarded to ``SupportVectorMachine``.
            kernel_arg: forwarded to ``SupportVectorMachine``.
            threshold: forwarded to ``SupportVectorMachine``.

        Returns:
            A new ``SvmModelModule`` for the pair.
        """
        positive_class, negative_class = pair
        model = SupportVectorMachine(
            C=C,
            kernel_name=kernel_name,
            kernel_arg=kernel_arg,
            threshold=threshold,
        )

        label_to_index = {positive_class: 1, negative_class: -1}
        index_to_label = {1: positive_class, -1: negative_class}

        # Pin ``otypes`` explicitly: without it np.vectorize infers the output
        # dtype from the first element only and raises ValueError on size-0
        # input. With it, empty input yields an empty array of the right dtype.
        label_to_index_np = np.vectorize(label_to_index.get, otypes=[int])
        index_to_label_np = np.vectorize(index_to_label.get, otypes=[str])

        return cls(
            pair,
            model,
            positive_class,
            negative_class,
            label_to_index_np,
            index_to_label_np,
        )

    def build_train_test_dataset(self, df_in: pd.DataFrame, train_size: int = IrisDataset.TRAIN_DATA_SIZE):
        """Split the rows of this pair's two classes into two mirrored folds.

        For each class, the first ``train_size`` rows (in ``df_in`` order) form
        one half and the remaining rows the other half.

        Args:
            df_in: frame with a ``"Label"`` column.
            train_size: number of leading rows per class placed in the first half.

        Returns:
            ``{"before": {"train", "test"}, "after": {"train", "test"}}`` where
            ``"after"`` swaps the train and test halves of ``"before"``.
        """
        positive_data = df_in[df_in["Label"] == self.positive_class]
        negative_data = df_in[df_in["Label"] == self.negative_class]

        first_half = pd.concat([positive_data[:train_size], negative_data[:train_size]])
        second_half = pd.concat([positive_data[train_size:], negative_data[train_size:]])

        return {
            "before": {"train": first_half, "test": second_half},
            "after": {"train": second_half, "test": first_half},
        }

    def build_for_model_input(self, df_in: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
        """Convert a labelled frame into ``(features, targets)`` arrays.

        Features are every column except ``"Label"``; targets are the labels
        mapped to +1 / -1. A label outside this pair maps to ``None``, which
        cannot be stored in the integer target array.
        """
        in_x = df_in.drop(columns=["Label"]).to_numpy()
        in_y = self.label_to_index_np(df_in["Label"].to_numpy())
        return in_x, in_y

    def train(self, df_in: pd.DataFrame) -> None:
        """Fit the wrapped model on the rows of ``df_in``.

        ``df_in`` should already be restricted to this pair's two classes.
        """
        x, y = self.build_for_model_input(df_in=df_in)
        self.model.train(x, y)

    def acc(self, df_in: pd.DataFrame) -> float:
        """Return the wrapped model's ``acc`` on the rows of ``df_in``."""
        x, y = self.build_for_model_input(df_in=df_in)
        return self.model.acc(x, y)

    def predict(self, x: np.ndarray) -> np.ndarray:
        """Predict label names for the feature rows ``x``.

        Calls the model with ``with_sign=True`` and maps its output back to
        this pair's label names.
        NOTE(review): assumes the model returns +1 / -1 per row -- confirm in lib.
        """
        signs = self.model(x, with_sign=True)
        return self.index_to_label_np(signs)
    
    

In [67]:

class MultiSupportVectorMachine:
    """Multi-class classifier built from one binary SVM per pair of classes.

    Every pair from ``combinations(class_names, 2)`` gets its own
    ``SvmModelModule``; prediction is a majority vote over all pairwise models.
    """

    # Names of the two folds used for two-fold validation.
    STATES = ["before", "after"]

    def __init__(self, class_names: list[str], C: int, kernel_name: str = "rbf", kernel_arg: "dict | None" = None, threshold: float = 1e-20):
        """Build one pairwise model per class combination.

        Args:
            class_names: all class labels.
            C: forwarded to every pairwise model.
            kernel_name: forwarded to every pairwise model.
            kernel_arg: forwarded to every pairwise model; ``None`` means an
                empty dict created fresh for this instance.
            threshold: forwarded to every pairwise model.
        """
        self._C = C
        self._kernel_name = kernel_name
        # A ``dict()`` default would be a single object shared by every
        # instance (and every pairwise model); create a fresh one instead.
        self._kernel_arg = {} if kernel_arg is None else kernel_arg
        self._threshold = threshold
        self._class_names = class_names

        # build model
        self._models = {
            pair: SvmModelModule.build(
                pair=pair,
                C=self._C,
                kernel_name=self._kernel_name,
                kernel_arg=self._kernel_arg,
                threshold=self._threshold,
            )
            for pair in combinations(class_names, 2)
        }

    def __repr__(self):
        return f"class name: {self._class_names}, C: {self._C}, kernel: {self._kernel_name} ({self._kernel_arg})"

    def train(self, data_in: dict[tuple[str, str], pd.DataFrame]) -> dict:
        """Train each pairwise model on its own frame.

        Args:
            data_in: maps a class pair to the training frame for that pair.

        Returns:
            Maps each pair to that model's accuracy on its training frame.
        """
        acc_dict = {}

        for pair, training_data in data_in.items():
            pair_model = self._models[pair]
            pair_model.train(training_data)
            acc_dict[pair] = pair_model.acc(training_data)

        return acc_dict

    def _get_most_freq_by_row(self, row: np.ndarray):
        """Return the most frequent value in ``row``.

        ``np.unique`` returns sorted values and ``argmax`` picks the first
        maximum, so ties go to the smallest value in sort order.
        """
        unique, counts = np.unique(row, return_counts=True)
        return unique[np.argmax(counts)]

    def predict(self, x: np.ndarray) -> np.ndarray:
        """Predict a label per row of ``x`` by majority vote of all pairwise models."""
        votes = [model.predict(x) for model in self._models.values()]
        # Transpose so each row holds every model's vote for one sample.
        votes_by_sample = np.array(votes).T

        return np.array([self._get_most_freq_by_row(row) for row in votes_by_sample])

    def acc(self, df_in: pd.DataFrame) -> tuple[float, np.ndarray]:
        """Return ``(accuracy, predictions)`` for a frame with a ``"Label"`` column."""
        x = df_in.drop(columns=["Label"]).to_numpy()
        y = df_in["Label"].to_numpy()
        res = self.predict(x)

        return np.mean(res == y), res

    def build_dataset(self, df_in: pd.DataFrame) -> dict:
        """Assemble per-fold training frames and a merged test frame.

        Returns:
            ``{state: {"train": {pair: frame}, "test": frame}}`` for each state
            in ``STATES``. The test frame is the concatenation of every pair's
            test split, so a class that belongs to several pairs has its test
            rows repeated once per pair.
        """
        dataset = {
            state: {"train": {}, "test": []}
            for state in MultiSupportVectorMachine.STATES
        }

        # build dataset
        for pair, pair_model in self._models.items():
            part_of_dataset = pair_model.build_train_test_dataset(df_in)

            for state in MultiSupportVectorMachine.STATES:
                dataset[state]["train"][pair] = part_of_dataset[state]["train"]
                dataset[state]["test"].append(part_of_dataset[state]["test"])

        # merge to same dataset "test"
        for state in MultiSupportVectorMachine.STATES:
            dataset[state]["test"] = pd.concat(dataset[state]["test"])

        return dataset

In [68]:
@dataclass
class TestResult:
    """Summary of one two-fold validation run.

    Annotations that name ``MultiSupportVectorMachine`` are quoted so this
    class can be created without that name being defined yet.
    """

    # Full per-fold log; hidden from repr because it holds prediction arrays.
    content: dict = field(repr=False)
    model: "MultiSupportVectorMachine"
    C: int
    # ``None`` when the model's kernel arguments contain no "sigma" entry.
    sigma: "float | None"
    before_acc: float
    after_acc: float
    avg_acc: float

    @classmethod
    def build(cls, model: "MultiSupportVectorMachine", logs: dict):
        """Create a result from a model and its validation logs.

        Args:
            model: the evaluated model; its ``_C`` and ``_kernel_arg`` are read.
            logs: must contain ``logs["before"]["acc"]``,
                ``logs["after"]["acc"]`` and ``logs["avg_acc"]``.

        Returns:
            A populated ``TestResult``.
        """
        return cls(
            logs,
            model,
            model._C,
            # Kernels without a sigma (e.g. an empty kernel_arg) must not
            # make result construction fail with KeyError.
            model._kernel_arg.get("sigma"),
            logs["before"]["acc"],
            logs["after"]["acc"],
            logs["avg_acc"],
        )

In [69]:
def two_fold_val(model:MultiSupportVectorMachine, df_in:pd.DataFrame):
    """Run two-fold validation of ``model`` on ``df_in``.

    For each fold in ``MultiSupportVectorMachine.STATES`` the model is trained
    on that fold's training data and then scored on its test data. The same
    model object is retrained per fold, so it ends up fitted on the last fold.

    Returns:
        A ``TestResult`` built from the model and the per-fold log, which also
        carries the mean test accuracy under ``"avg_acc"``.
    """
    folds = model.build_dataset(df_in)
    logs = {}

    for fold_name in MultiSupportVectorMachine.STATES:
        fold = folds[fold_name]
        train_frames = fold["train"]
        test_frame = fold["test"]

        # Fit first, then evaluate on the held-out half.
        training_acc = model.train(train_frames)
        fold_acc, predictions = model.acc(test_frame)

        logs[fold_name] = {
            "each_model_training_acc": training_acc,
            "acc": fold_acc,
            "predict": predictions,
        }

    logs["avg_acc"] = np.mean(
        [logs[fold_name]["acc"] for fold_name in MultiSupportVectorMachine.STATES]
    )

    return TestResult.build(model, logs)

In [73]:
# Pairwise (one model per class pair) SVM over all Iris labels,
# using the "rbf" kernel with C=10 and sigma=1.
model = MultiSupportVectorMachine(
    class_names=IrisDataset.LABEL,
    C=10,
    kernel_name="rbf",
    kernel_arg={"sigma":1},
)

In [74]:
# Train and evaluate on both folds; `res` is a TestResult with per-fold and mean accuracy.
res = two_fold_val(model, df)

In [75]:
# `print` here is rich.print (imported above), which pretty-prints the dataclass.
print(res)