# NYCU Machine Learning HW2: Linear Discriminant Analysis
## Written By 313511068 練鈞揚

In [85]:
import numpy as np
import pandas as pd

In [86]:
# Class names; position i corresponds to the numeric label i + 1 in the data file.
LABEL = ["Setosa", "Versicolor", "Virginica"]

# Column names assigned to the loaded Iris table, in file order.
COLUMN_NAME = [
    "Sepal length",
    "Sepal width",
    "Petal length",
    "Petal width",
    "Label",
]

In [87]:
def load_iris_file(with_name:bool=False)->pd.DataFrame:
    """Load ``./iris.txt`` (fixed-width, no header row) into a DataFrame.

    Args:
        with_name: When True, the numeric ``Label`` column (1-based) is
            replaced by the matching class name from ``LABEL``.

    Returns:
        A DataFrame whose columns are ``COLUMN_NAME``. Without names every
        column is float; with names ``Label`` holds strings.
    """
    # The file has no header line, so tell pandas not to consume the first
    # data row as column names (which also breaks when that row contains
    # duplicate values, because pandas renames duplicated headers).
    df = pd.read_fwf("./iris.txt", header=None, names=COLUMN_NAME).astype(float)

    if not with_name:
        return df

    df_with_name = df.copy()
    # Labels are 1-based in the file; LABEL is 0-based.
    df_with_name["Label"] = df_with_name["Label"].apply(lambda x: LABEL[int(x) - 1])
    return df_with_name

In [88]:
# Load the Iris table with class names in the Label column, then show it
# as the cell output.
df = load_iris_file(with_name=True)
df

Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width,Label
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [89]:
# The two classes the binary classifier separates.
POSITIVE_CLASS = "Versicolor"
NEGATIVE_CLASS = "Virginica"

## Part 2

In [90]:
def build_dataset(df_in:pd.DataFrame, split_at:int=25)-> dict:
    """Build the two-fold train/test arrangement for the two target classes.

    The sepal columns are dropped, rows of ``POSITIVE_CLASS`` and
    ``NEGATIVE_CLASS`` are selected, and each class is cut at ``split_at``
    rows. The first parts of both classes form one fold and the remaining
    rows form the other.

    Args:
        df_in: Frame with the sepal columns and a ``Label`` column of class
            names.
        split_at: Number of leading rows per class placed in the first fold.

    Returns:
        ``{"before": {"train", "test"}, "after": {"train", "test"}}`` where
        "after" is "before" with train and test swapped. Every frame has a
        fresh 0..n-1 index.
    """
    features = df_in.drop(columns=["Sepal length", "Sepal width"])
    positive_rows = features[features["Label"] == POSITIVE_CLASS]
    negative_rows = features[features["Label"] == NEGATIVE_CLASS]

    first_fold = [positive_rows[:split_at], negative_rows[:split_at]]
    second_fold = [positive_rows[split_at:], negative_rows[split_at:]]

    def merge_dataset(df_list:list[pd.DataFrame]) -> pd.DataFrame:
        # ignore_index renumbers the rows without materialising the old index
        # as a column, so an existing "index" column in the data is untouched.
        return pd.concat(df_list, ignore_index=True)

    return {
        "before": {
            "train": merge_dataset(first_fold),
            "test": merge_dataset(second_fold),
        },
        "after": {
            "train": merge_dataset(second_fold),
            "test": merge_dataset(first_fold),
        },
    }

In [91]:
# Build both train/test arrangements from the loaded frame, then show the
# resulting dictionary as the cell output.
dataset = build_dataset(df)
dataset

{'before': {'train':     Petal length  Petal width       Label
  0            4.7          1.4  Versicolor
  1            4.5          1.5  Versicolor
  2            4.9          1.5  Versicolor
  3            4.0          1.3  Versicolor
  4            4.6          1.5  Versicolor
  5            4.5          1.3  Versicolor
  6            4.7          1.6  Versicolor
  7            3.3          1.0  Versicolor
  8            4.6          1.3  Versicolor
  9            3.9          1.4  Versicolor
  10           3.5          1.0  Versicolor
  11           4.2          1.5  Versicolor
  12           4.0          1.0  Versicolor
  13           4.7          1.4  Versicolor
  14           3.6          1.3  Versicolor
  15           4.4          1.4  Versicolor
  16           4.5          1.5  Versicolor
  17           4.1          1.0  Versicolor
  18           4.5          1.5  Versicolor
  19           3.9          1.1  Versicolor
  20           4.8          1.8  Versicolor
  21         

In [92]:
# Check that the "after" arrangement is the "before" arrangement with train
# and test swapped. DataFrame.equals compares the actual contents; wrapping
# `frame_a == frame_b` in all() would only iterate the column labels and
# therefore always pass.
assert dataset["before"]["train"].equals(dataset["after"]["test"])
assert dataset["before"]["test"].equals(dataset["after"]["train"])

In [93]:
# Show the training frame of the "before" arrangement as the cell output.
dataset["before"]["train"]

Unnamed: 0,Petal length,Petal width,Label
0,4.7,1.4,Versicolor
1,4.5,1.5,Versicolor
2,4.9,1.5,Versicolor
3,4.0,1.3,Versicolor
4,4.6,1.5,Versicolor
5,4.5,1.3,Versicolor
6,4.7,1.6,Versicolor
7,3.3,1.0,Versicolor
8,4.6,1.3,Versicolor
9,3.9,1.4,Versicolor


In [121]:
class Lda:
    """Two-class linear discriminant with a shared (pooled) covariance.

    After ``fit`` the decision score of a sample ``x`` is ``w @ x + b``.
    A negative score maps to ``NEGATIVE_CLASS``; any other score maps to
    ``POSITIVE_CLASS``.
    """

    # Index 0 is the positive class, index 1 the negative class.
    DISPLAY_CLASS = [POSITIVE_CLASS , NEGATIVE_CLASS]

    def __init__(self):
        # All parameters stay None until fit() is called.
        self._w :np.ndarray = None
        self._cov_matrix :np.ndarray= None
        self._b = None

    @property
    def w(self)->np.ndarray:
        """Weight vector of the discriminant (None before fitting)."""
        return self._w

    @property
    def b(self):
        """Bias term of the discriminant (None before fitting)."""
        return self._b

    @property
    def cov_matrix(self)->np.ndarray:
        """Prior-weighted covariance of the two classes (None before fitting)."""
        return self._cov_matrix

    @staticmethod
    def _build_mean_and_cov(data_in: np.ndarray):
        """Return the per-feature mean and the covariance matrix of the rows."""
        mean  = np.mean(data_in , axis=0)
        # np.cov expects variables in rows, hence the transpose.
        cov = np.cov(data_in.T)
        return mean , cov

    def _check_fitted(self) -> None:
        """Raise if the model parameters have not been estimated yet."""
        if self._w is None or self._b is None:
            raise RuntimeError("Model is not fitted")

    def fit(self , data_in:pd.DataFrame , c1:int=1, c2:int=1)->None:
        """Estimate ``w``, ``b`` and the shared covariance from labelled rows.

        Args:
            data_in: Feature columns plus a ``Label`` column holding
                ``POSITIVE_CLASS`` / ``NEGATIVE_CLASS``; other labels are
                ignored.
            c1, c2: Factors in the threshold term ``log((c1*p2)/(c2*p1))``.
                Presumably misclassification costs - confirm against the
                assignment's definition.

        Raises:
            ValueError: If either class has fewer than two samples, since a
                covariance cannot be estimated from them.
        """
        labels = data_in["Label"]
        features = data_in.drop(columns=["Label"])
        positive_data = features[labels == POSITIVE_CLASS].to_numpy()
        negative_data = features[labels == NEGATIVE_CLASS].to_numpy()

        positive_len , negative_len = len(positive_data) , len(negative_data)
        if positive_len < 2 or negative_len < 2:
            raise ValueError(
                "fit needs at least two samples of each class, got "
                f"{positive_len} positive and {negative_len} negative"
            )

        positive_mean , positive_cov = Lda._build_mean_and_cov(positive_data)
        negative_mean , negative_cov = Lda._build_mean_and_cov(negative_data)

        # Empirical class priors.
        total = positive_len + negative_len
        p1 , p2 = positive_len / total, negative_len / total

        # Shared covariance: class covariances weighted by their priors.
        self._cov_matrix = p1 * positive_cov + p2 * negative_cov

        # The pseudo-inverse keeps this defined for a singular covariance.
        inv_cov = np.linalg.pinv(self._cov_matrix)

        mean_diff = positive_mean - negative_mean
        self._w = mean_diff @ inv_cov

        self._b = (
            -0.5 * (self._w @ (positive_mean + negative_mean))
            - np.log((c1 * p2) / (c2 * p1))
        )

    def predict(self, x:np.ndarray):
        """Return the decision score(s) ``w @ x + b``.

        ``x`` is either one sample (1-D) or a matrix with one sample per
        column, in which case one score per column is returned.
        """
        self._check_fitted()
        return self._w.T @ x + self._b

    def predict_with_class(self, x:np.ndarray):
        """Return the class name for a single sample.

        Uses the same rule as ``predict_with_df``: a negative score is the
        negative class, anything else is the positive class.
        """
        score = float(self.predict(x))
        return Lda.DISPLAY_CLASS[int(score < 0)]

    def predict_with_df(self, x_df:pd.DataFrame)->np.ndarray:
        """Return the predicted class name for every row of ``x_df``.

        ``x_df`` must contain a ``Label`` column, which is dropped before
        scoring.
        """
        # predict() expects one sample per column.
        samples = x_df.drop(columns=["Label"]).to_numpy().T
        scores = self.predict(samples)
        return np.where(scores < 0, Lda.DISPLAY_CLASS[1], Lda.DISPLAY_CLASS[0])

    def acc(self , x_df: pd.DataFrame)-> float:
        """Return the fraction of rows whose prediction equals ``Label``."""
        predict_out = self.predict_with_df(x_df)
        true_label = x_df["Label"]
        hits = int((predict_out == true_label.to_numpy()).sum())
        return float(hits / len(true_label))

    def __call__(self,x:np.ndarray) -> np.ndarray:
        """Alias for ``predict``."""
        return self.predict(x)

    def __str__(self):
        if self._w is None or self._b is None or self._cov_matrix is None:
            return "Model is not fitted"

        return f"W : {self._w}\nB : {self._b:.2f}\nCov:{self._cov_matrix}"
        

In [122]:
# Create a new, unfitted model.
lda_model = Lda()

In [123]:
# Fit on the training frame of the "before" arrangement.
lda_model.fit(dataset["before"]["train"])

In [124]:
# Print the fitted weight, bias and covariance via Lda.__str__.
print(lda_model)

W : [ -2.0860987  -10.46103593]
B : 28.10
Cov:[[0.30721667 0.06568333]
 [0.06568333 0.05381667]]


In [127]:
# Accuracy on the test frame of the "before" arrangement, shown as the
# cell output.
before_acc = lda_model.acc(dataset["before"]["test"])
before_acc

0.94

In [128]:
# Replace the previous model with a new, unfitted one for the swapped fold.
lda_model = Lda()

In [129]:
# Fit on the training frame of the "after" arrangement.
lda_model.fit(dataset["after"]["train"])

In [130]:
# Print the fitted weight, bias and covariance via Lda.__str__.
print(lda_model)

W : [-3.73417218 -7.85309846]
B : 31.08
Cov:[[0.21825    0.05615833]
 [0.05615833 0.06243333]]


In [134]:
# Accuracy on the test frame of the "after" arrangement, shown as the
# cell output.
after_acc = lda_model.acc(dataset["after"]["test"])
after_acc

0.94

In [136]:
# Mean of the two fold accuracies, printed as a percentage value.
avg_acc = (before_acc + after_acc) / 2
# NOTE(review): the " .2f" spec uses the space sign flag, which is why the
# printed value is preceded by two spaces.
print(f"Avg acc : {avg_acc * 100 : .2f}")

Avg acc :  94.00
