# NYCU Machine Learning HW2: Linear Discriminant Analysis
## Written By 313511068 練鈞揚

In [1]:
import numpy as np
import pandas as pd

from itertools import combinations
from rich import print

In [2]:
# Notebook-wide NumPy display settings: print floats with two decimals and
# never switch to scientific notation for small values.
np.set_printoptions(suppress=True, precision=2)

In [3]:
# Species names; the data file's numeric label k maps to LABEL[k - 1].
LABEL = [
    "Setosa",
    "Versicolor",
    "Virginica",
]

# Column names assigned to the loaded table, in file order.
COLUMN_NAME = [
    "Sepal length",
    "Sepal width",
    "Petal length",
    "Petal width",
    "Label",
]

# Number of leading rows per class that form the first fold.
TRAIN_DATA_SIZE = 25

In [4]:
def load_iris_file(with_name:bool=False, path:str="./iris.txt")->pd.DataFrame:
    """Load the fixed-width iris data file into a DataFrame.

    The file carries no header row, so it is read with ``header=None`` and the
    columns are named from ``COLUMN_NAME``. This keeps the first sample as a
    regular data row instead of letting pandas consume it as column names
    (which would break if pandas had to rename duplicated values in that row).

    Args:
        with_name: When True, replace the numeric ``Label`` column with the
            species name taken from ``LABEL`` (label ``k`` -> ``LABEL[k - 1]``).
        path: Location of the fixed-width data file.

    Returns:
        A DataFrame with the columns of ``COLUMN_NAME`` and a 0-based integer
        index. All columns are float unless ``with_name`` is True, in which
        case ``Label`` holds species-name strings.
    """
    # Cast to float so the numeric Label column has the same dtype as the
    # feature columns.
    df = pd.read_fwf(path, header=None, names=COLUMN_NAME).astype(float)

    if not with_name:
        return df

    df_with_name = df.copy()
    df_with_name["Label"] = df_with_name["Label"].apply(lambda x: LABEL[int(x) - 1])

    return df_with_name

In [5]:
# Load the full table with the numeric labels replaced by species names,
# then display it as the cell output.
df = load_iris_file(with_name=True)
df

Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width,Label
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [6]:
# The two species separated by the binary classifier in Part 2.
POSITIVE_CLASS = "Versicolor"
NEGATIVE_CLASS = "Virginica"

## Part 2

In [7]:
def build_dataset(df_in:pd.DataFrame)-> dict:
    """Build the two-fold train/test split for the binary problem.

    Only the petal features and the label are kept. For each of
    ``POSITIVE_CLASS`` and ``NEGATIVE_CLASS`` the first ``TRAIN_DATA_SIZE``
    rows form one half and the remaining rows form the other half.

    Args:
        df_in: Table containing the sepal and petal columns plus a ``Label``
            column of species names.

    Returns:
        ``{"before": {"train", "test"}, "after": {"train", "test"}}`` where
        "after" is "before" with the train and test halves swapped.
    """
    petal_only = df_in.drop(columns=["Sepal length",  "Sepal width"])

    def _stack(frames:list[pd.DataFrame]):
        # Concatenate the per-class slices and renumber the rows from 0.
        stacked = pd.concat(frames).reset_index()
        return stacked.drop(columns=["index"])

    first_halves = []
    second_halves = []
    for class_name in (POSITIVE_CLASS, NEGATIVE_CLASS):
        class_rows = petal_only[petal_only["Label"]==class_name].copy()
        first_halves.append(class_rows[:TRAIN_DATA_SIZE])
        second_halves.append(class_rows[TRAIN_DATA_SIZE:])

    return {
        "before": {
            "train": _stack(first_halves),
            "test": _stack(second_halves),
        },
        "after": {
            "train": _stack(second_halves),
            "test": _stack(first_halves),
        },
    }

In [8]:
# Build the two-fold split for the binary problem and display it.
dataset = build_dataset(df)
dataset

{'before': {'train':     Petal length  Petal width       Label
  0            4.7          1.4  Versicolor
  1            4.5          1.5  Versicolor
  2            4.9          1.5  Versicolor
  3            4.0          1.3  Versicolor
  4            4.6          1.5  Versicolor
  5            4.5          1.3  Versicolor
  6            4.7          1.6  Versicolor
  7            3.3          1.0  Versicolor
  8            4.6          1.3  Versicolor
  9            3.9          1.4  Versicolor
  10           3.5          1.0  Versicolor
  11           4.2          1.5  Versicolor
  12           4.0          1.0  Versicolor
  13           4.7          1.4  Versicolor
  14           3.6          1.3  Versicolor
  15           4.4          1.4  Versicolor
  16           4.5          1.5  Versicolor
  17           4.1          1.0  Versicolor
  18           4.5          1.5  Versicolor
  19           3.9          1.1  Versicolor
  20           4.8          1.8  Versicolor
  21         

In [9]:
# Check the data: swapping the folds must exchange train and test exactly.
# DataFrame.equals compares whole frames. Note that all(df_a == df_b) would
# only iterate over the column labels (non-empty strings) and always pass.
assert dataset["before"]["train"].equals(dataset["after"]["test"])
assert dataset["before"]["test"].equals(dataset["after"]["train"])

In [76]:
class Lda:
    """Two-class linear discriminant with a shared (pooled) covariance matrix.

    After ``fit`` the model scores a sample ``x`` as ``w.T @ x + b``; a score
    greater than zero is labelled with the positive class, anything else with
    the negative class.
    """

    def __init__(self , positive_class:str , negative_class:str):
        """Store the two class names; the model parameters stay unset until ``fit``."""
        # Learned parameters, None until fit() has been called.
        self._w :np.ndarray = None
        self._cov_matrix :np.ndarray= None
        self._b = None

        self._positive_class :str = positive_class
        self._negative_class :str = negative_class
        # Indexed by int(score > 0): 0 -> negative class, 1 -> positive class.
        self._display_class = [self._negative_class ,self._positive_class]

    @property
    def w(self)->np.ndarray:
        """Weight vector, or None before ``fit``."""
        return self._w

    @property
    def b(self):
        """Bias term, or None before ``fit``."""
        return self._b

    @property
    def cov_matrix(self)->np.ndarray:
        """Pooled covariance matrix, or None before ``fit``."""
        return self._cov_matrix

    def _is_fitted(self)->bool:
        """Return True once all learned parameters have been set."""
        return not (self._w is None or self._b is None or self._cov_matrix is None)

    def _require_fitted(self)->None:
        """Raise a clear error instead of failing later on a None parameter."""
        if not self._is_fitted():
            raise RuntimeError("Model is not fitted")

    @staticmethod
    def _build_mean_and_cov(data_in: np.ndarray):
        """Return the per-column mean and the covariance of a (samples, features) array."""
        mean  = np.mean(data_in , axis=0)
        # np.cov expects variables in rows, hence the transpose.
        cov = np.cov(data_in.T)
        return mean , cov

    def fit(self , data_in:pd.DataFrame , c1:float=1, c2:float=1)->None:
        """Estimate ``w``, ``b`` and the pooled covariance from labelled data.

        Rows whose ``Label`` is neither the positive nor the negative class
        are ignored; every non-``Label`` column is used as a feature.

        Args:
            data_in: Table with feature columns and a ``Label`` column.
            c1: Positive weight applied to the negative-class prior in the
                bias term; increasing it lowers ``b``.
            c2: Positive weight applied to the positive-class prior in the
                bias term; increasing it raises ``b``.
                NOTE(review): c1/c2 look like misclassification costs -
                confirm against the assignment statement.

        Raises:
            ValueError: If either class has fewer than two rows (the sample
                covariance would be undefined) or if ``c1``/``c2`` is not
                strictly positive (the log term would be undefined).
        """
        if c1 <= 0 or c2 <= 0:
            raise ValueError("c1 and c2 must be strictly positive")

        positive_data = data_in[data_in["Label"] == self._positive_class]
        negative_data = data_in[data_in["Label"] == self._negative_class]

        positive_len , negative_len = len(positive_data) , len(negative_data)
        if positive_len < 2 or negative_len < 2:
            raise ValueError(
                "Need at least two samples per class, got "
                f"{positive_len} for {self._positive_class!r} and "
                f"{negative_len} for {self._negative_class!r}"
            )

        positive_data = positive_data.drop(columns=["Label"]).to_numpy()
        negative_data = negative_data.drop(columns=["Label"]).to_numpy()

        positive_mean , positive_cov = Lda._build_mean_and_cov(positive_data)
        negative_mean , negative_cov = Lda._build_mean_and_cov(negative_data)

        # Class priors estimated from the class frequencies in the training data.
        total = positive_len + negative_len
        p1 , p2 = positive_len / total, negative_len / total

        # Pooled covariance: prior-weighted sum of the per-class covariances.
        self._cov_matrix = p1 * positive_cov + p2 * negative_cov

        # Pseudo-inverse so a singular covariance does not raise.
        inv_cov = np.linalg.pinv(self._cov_matrix)

        mean_diff = positive_mean - negative_mean

        # weight
        self._w = mean_diff.T @ inv_cov

        # b
        self._b = -(1/2) * self._w @ (positive_mean + negative_mean) - np.log((c1*p2)/ (c2*p1))

    def predict(self, x:np.ndarray):
        """Return the raw discriminant score(s) ``w.T @ x + b``.

        Args:
            x: Feature vector, or a matrix with one sample per column.

        Raises:
            RuntimeError: If the model has not been fitted.
        """
        self._require_fitted()
        return self._w.T @ x + self._b

    def predict_with_df(self, x_df:pd.DataFrame)->np.ndarray:
        """Predict a class name for every row of ``x_df``.

        ``x_df`` must contain a ``Label`` column; it is dropped before scoring.
        """
        # Transpose so each column is one sample, as predict() expects.
        features = x_df.drop(columns=["Label"]).to_numpy().T
        scores = self.predict(features)
        # Vectorised lookup: index 0 -> negative class, index 1 -> positive class.
        return np.array(self._display_class)[(scores > 0).astype(int)]

    def acc(self , x_df: pd.DataFrame)-> float:
        """Return the fraction of rows in ``x_df`` whose predicted class equals ``Label``.

        Raises:
            ValueError: If ``x_df`` has no rows.
        """
        true_label = x_df["Label"].to_numpy()
        if len(true_label) == 0:
            raise ValueError("Cannot compute accuracy on an empty dataset")

        predict_out = self.predict_with_df(x_df)
        return float(np.mean(predict_out == true_label))

    def __call__(self,x:np.ndarray) -> np.ndarray:
        """Alias for ``predict``."""
        return self.predict(x)

    def __repr__(self) -> str:
        if not self._is_fitted():
            return "Model is not fitted"

        return f"Pos:{self._positive_class}, Neg: {self._negative_class}, W : {self._w} B : {self._b:.2f} Cov:{self._cov_matrix}"

    def __str__(self) -> str:
        if not self._is_fitted():
            return "Model is not fitted"

        return f"Pos:{self._positive_class}, Neg: {self._negative_class}\nW : {self._w}\nB : {self._b:.2f}\nCov:\n{self._cov_matrix}"
        

In [95]:
def test_model(model_in , dataset_in:dict):
    """Fit and score a model on both folds and return a text report.

    For each of the "before" and "after" splits the model is fitted on the
    split's "train" table and scored on its "test" table. Note that the model
    is re-fitted in place, so afterwards it holds the "after" parameters.

    Args:
        model_in: Object providing ``fit(df)``, ``acc(df)`` and ``__str__``.
        dataset_in: Mapping with "before" and "after" entries, each holding
            "train" and "test" tables.

    Returns:
        A string with the model summary and accuracy for each split, followed
        by the mean accuracy over the two splits.
    """
    sections = []
    scores = []

    for split_name in ("before", "after"):
        split = dataset_in[split_name]

        model_in.fit(split["train"])
        # Capture the summary right after fitting so it reflects this split.
        summary = str(model_in)
        score = model_in.acc(split["test"])

        sections.append(f'{split_name}:\n{summary}\n' + f"Acc :{score*100:.2f}%\n\n")
        scores.append(score)

    before_score, after_score = scores
    mean_score = (before_score + after_score) / 2

    sections.append(f"Avg acc :{mean_score*100:.2f}%")

    return "".join(sections)

In [96]:
# Binary classifier separating POSITIVE_CLASS from NEGATIVE_CLASS.
lda_model = Lda(positive_class=POSITIVE_CLASS , negative_class=NEGATIVE_CLASS)

In [97]:
# Two-fold evaluation of the binary model; `print` here is rich.print,
# imported at the top of the notebook.
test_model_detail = test_model(model_in=lda_model,dataset_in=dataset)
print(test_model_detail)

## Part 3

In [98]:
def build_multi_dataset(df_in:pd.DataFrame , class_list:list[str])-> dict:
    """Build the two-fold train/test split over an arbitrary list of classes.

    Only the petal features and the label are kept. For every class in
    ``class_list`` the first ``TRAIN_DATA_SIZE`` rows go to one half and the
    remaining rows to the other half.

    Args:
        df_in: Table containing the sepal and petal columns plus a ``Label``
            column of species names.
        class_list: Class names to include, in the order they are stacked.

    Returns:
        ``{"before": {"train", "test"}, "after": {"train", "test"}}`` where
        "after" reuses the same two tables as "before" with the roles swapped.
    """
    petal_only = df_in.drop(columns=["Sepal length",  "Sepal width"])

    def _stack(frames:list[pd.DataFrame]):
        # Concatenate the per-class slices and renumber the rows from 0.
        stacked = pd.concat(frames).reset_index()
        return stacked.drop(columns=["index"])

    head_parts = []
    tail_parts = []
    for class_name in class_list:
        class_rows = petal_only[petal_only["Label"]==class_name].copy()
        head_parts.append(class_rows[:TRAIN_DATA_SIZE])
        tail_parts.append(class_rows[TRAIN_DATA_SIZE:])

    first_half = _stack(head_parts)
    second_half = _stack(tail_parts)

    return {
        "before": {"train": first_half, "test": second_half},
        "after": {"train": second_half, "test": first_half},
    }

In [99]:
# Build the two-fold split over all three species and display it.
multi_dataset = build_multi_dataset(df , class_list=LABEL)
multi_dataset

{'before': {'train':     Petal length  Petal width      Label
  0            1.4          0.2     Setosa
  1            1.4          0.2     Setosa
  2            1.3          0.2     Setosa
  3            1.5          0.2     Setosa
  4            1.4          0.2     Setosa
  ..           ...          ...        ...
  70           5.7          2.3  Virginica
  71           4.9          2.0  Virginica
  72           6.7          2.0  Virginica
  73           4.9          1.8  Virginica
  74           5.7          2.1  Virginica
  
  [75 rows x 3 columns],
  'test':     Petal length  Petal width      Label
  0            1.6          0.2     Setosa
  1            1.6          0.4     Setosa
  2            1.5          0.2     Setosa
  3            1.4          0.2     Setosa
  4            1.6          0.2     Setosa
  ..           ...          ...        ...
  70           5.2          2.3  Virginica
  71           5.0          1.9  Virginica
  72           5.2          2.0  Virginica

In [100]:
# Check the data: swapping the folds must exchange train and test exactly.
# DataFrame.equals compares whole frames. Note that all(df_a == df_b) would
# only iterate over the column labels (non-empty strings) and always pass.
assert multi_dataset["before"]["train"].equals(multi_dataset["after"]["test"])
assert multi_dataset["before"]["test"].equals(multi_dataset["after"]["train"])

In [101]:
class LdaMultiple:
    """Multi-class classifier built from one binary ``Lda`` per pair of classes.

    Every pairwise model votes for one class per sample; the class with the
    most votes wins. On a tie the winner is the first of the tied names in
    the sorted order returned by ``np.unique``.
    """

    def __init__(self , class_list:list[str]):
        """Create one unfitted ``Lda`` for each unordered pair in ``class_list``."""
        self._class_list = class_list
        self._combination_list = list(combinations(class_list, 2))
        # In each pair the first name is the positive class, the second the negative.
        self._models = [Lda(*pair) for pair in self._combination_list]

    @property
    def models(self):
        """The pairwise models, in ``combinations`` order."""
        return self._models

    def fit(self, x_df:pd.DataFrame):
        """Fit every pairwise model on ``x_df``."""
        for pair_model in self._models:
            pair_model.fit(x_df)

    def predict(self , x_df:pd.DataFrame):
        """Return the majority-vote class name for every row of ``x_df``."""
        per_model = [pair_model.predict_with_df(x_df) for pair_model in self._models]

        # Transpose so each row holds all models' votes for one sample.
        votes = np.array(per_model).T

        winners = []
        for sample_votes in votes:
            names , tallies = np.unique(sample_votes , return_counts=True)
            winners.append(names[np.argmax(tallies)])

        return np.array(winners)

    def acc(self, x_df: pd.DataFrame)-> float:
        """Return the fraction of rows whose voted class equals ``Label``."""
        expected = x_df["Label"].to_numpy()
        voted = self.predict(x_df)

        hits = np.sum(voted == expected)
        return float(hits / len(expected))

    def __str__(self):
        header = f"Class list: {', '.join(self._class_list)}"
        body = "\n".join(str(pair_model) for pair_model in self._models)
        return header + "\n" + body

In [102]:
# One-vs-one classifier over all three species.
multi_model = LdaMultiple(LABEL)

In [103]:
# Two-fold evaluation of the multi-class model; `print` here is rich.print,
# imported at the top of the notebook.
test_model_detail = test_model(multi_model , multi_dataset)
print(test_model_detail)