# NYCU Machine Learning 2024
## Written By 313511068 練鈞揚

In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path

In [2]:
# Species names, ordered so that the numeric class code n maps to LABEL[n - 1].
LABEL = [
    "Setosa",
    "Versicolor",
    "Virginica",
]

# Column headers applied to the data frame: four features, then the class label.
COLUMN_NAME = [
    "Sepal length",
    "Sepal width",
    "Petal length",
    "Petal width",
    "Label",
]

In [3]:
# Read the fixed-width Iris table. No header option is passed, so the first
# data row is consumed as the column names (visible in the output below);
# a later cell rebuilds that row and assigns the real column names.
df = pd.read_fwf("./iris.txt")
df

Unnamed: 0,5.1000000e+000,3.5000000e+000,1.4000000e+000,2.0000000e-001,1
0,4.9,3.0,1.4,0.2,1
1,4.7,3.2,1.3,0.2,1
2,4.6,3.1,1.5,0.2,1
3,5.0,3.6,1.4,0.2,1
4,5.4,3.9,1.7,0.4,1
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,3
145,6.3,2.5,5.0,1.9,3
146,6.5,3.0,5.2,2.0,3
147,6.2,3.4,5.4,2.3,3


In [4]:
df.columns

Index(['5.1000000e+000', '3.5000000e+000', '1.4000000e+000', '2.0000000e-001',
       '1'],
      dtype='object')

In [5]:
# The current column names are really the first sample: parse them as floats
# into a one-row frame keyed by the proper column names.
df_new = pd.DataFrame(dict(zip(COLUMN_NAME, ([raw] for raw in df.columns))), dtype=float)
# Give the remaining rows the proper column names as well.
df.columns = COLUMN_NAME
# Put the recovered sample back on top and renumber the rows from 0.
df_new = pd.concat([df_new, df], axis=0, ignore_index=True)
df_new

Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width,Label
0,5.1,3.5,1.4,0.2,1.0
1,4.9,3.0,1.4,0.2,1.0
2,4.7,3.2,1.3,0.2,1.0
3,4.6,3.1,1.5,0.2,1.0
4,5.0,3.6,1.4,0.2,1.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,3.0
146,6.3,2.5,5.0,1.9,3.0
147,6.5,3.0,5.2,2.0,3.0
148,6.2,3.4,5.4,2.3,3.0


In [6]:
# Keep an independent copy with the numeric class codes; the next cell replaces
# the "Label" column of df_new with species names, while the k-NN evaluation
# further down is run on df_ori.
df_ori = df_new.copy()

In [7]:
# Translate each numeric class code (1-based) into its species name.
df_new["Label"] = df_new["Label"].map(lambda code: LABEL[int(code) - 1])
df_new

Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width,Label
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [8]:
# Feature columns only: every column name except the class label.
df_head_list = [name for name in df.columns if name != "Label"]
df_head_list

['Sepal length', 'Sepal width', 'Petal length', 'Petal width']

In [9]:
# Enumerate every feature subset, grouped by subset size (1 up to 4 columns):
# plot_axis[0] holds the single features, plot_axis[1] the pairs, and so on.
plot_axis = [[*combinations(df_head_list, size)] for size in range(1, 5)]
plot_axis

[[('Sepal length',), ('Sepal width',), ('Petal length',), ('Petal width',)],
 [('Sepal length', 'Sepal width'),
  ('Sepal length', 'Petal length'),
  ('Sepal length', 'Petal width'),
  ('Sepal width', 'Petal length'),
  ('Sepal width', 'Petal width'),
  ('Petal length', 'Petal width')],
 [('Sepal length', 'Sepal width', 'Petal length'),
  ('Sepal length', 'Sepal width', 'Petal width'),
  ('Sepal length', 'Petal length', 'Petal width'),
  ('Sepal width', 'Petal length', 'Petal width')],
 [('Sepal length', 'Sepal width', 'Petal length', 'Petal width')]]

In [10]:
# Total number of feature combinations across all subset sizes.
sum(map(len, plot_axis))

15

In [11]:
def build_nxm_combination(df_in:pd.DataFrame, xyz_s:list[list[str]] , row_in:int, col_in:int) -> go.Figure:
    """Draw one scatter subplot per feature combination, coloured by label.

    Args:
        df_in: Data frame holding the feature columns named in ``xyz_s`` and a
            "Label" column that defines the colour groups.
        xyz_s: Feature combinations to plot. Every combination must name the
            same number of columns (1, 2 or 3); they map to the x, y and z
            coordinates in that order.
        row_in: Number of subplot rows.
        col_in: Number of subplot columns. Subplots are filled row by row.

    Returns:
        A figure with ``row_in * col_in`` subplot slots, titled
        "All <dim>-D Plot".

    Raises:
        ValueError: If ``xyz_s`` is empty, the combinations are not 1-, 2- or
            3-dimensional, or the grid has fewer slots than combinations.
    """
    if not xyz_s:
        raise ValueError("xyz_s must contain at least one feature combination")

    dim = len(xyz_s[0])
    if not 1 <= dim <= 3:
        raise ValueError(f"only 1-D, 2-D and 3-D plots are supported, got {dim}-D")
    if len(xyz_s) > row_in * col_in:
        raise ValueError(
            f"{len(xyz_s)} plots do not fit into a {row_in} x {col_in} grid"
        )

    axis_keys = ['x', 'y', 'z'][:dim]
    is_3d = dim == 3
    trace_cls = go.Scatter3d if is_3d else go.Scatter

    # One fixed colour per label group; the palette repeats if there are more
    # groups than colours instead of failing on the lookup.
    groups = df_in['Label'].unique()
    palette = ['blue', 'red', 'green']
    color_map = {group: palette[idx % len(palette)] for idx, group in enumerate(groups)}

    # 3-D traces need 'scene' subplots; 1-D/2-D use the default xy type.
    subplot_kwargs = dict()
    if is_3d:
        subplot_kwargs = dict(specs=[[{'type': 'scene'} for _ in range(col_in)] for _ in range(row_in)])

    fig = make_subplots(
        rows=row_in, cols=col_in,
        subplot_titles=[" x ".join(item) for item in xyz_s],
        **subplot_kwargs,
    )

    for plot_idx, features in enumerate(xyz_s):
        row, col = divmod(plot_idx, col_in)

        for group in groups:
            group_df = df_in[df_in['Label'] == group]
            coords = {key: group_df[feature] for key, feature in zip(axis_keys, features)}

            fig.add_trace(
                trace_cls(
                    mode='markers',
                    name=group,
                    # Tie the same group together across subplots and list it
                    # in the legend only once (from the first subplot).
                    legendgroup=str(group),
                    showlegend=plot_idx == 0,
                    marker=dict(size=10, color=color_map[group]),
                    **coords,
                ),
                row=row+1,
                col=col+1,
            )

        if is_3d:
            # Scene subplots have no x/y axes objects to title here.
            continue

        # Label the axes once per subplot with the plotted feature names.
        for key, feature in zip(axis_keys, features):
            getattr(fig, f"update_{key}axes")(title_text=feature, row=row+1, col=col+1)

    if not is_3d:
        # Let every subplot axis grow its margin to fit the axis titles.
        for key in axis_keys:
            getattr(fig, f"update_{key}axes")(automargin=True)

    # Update title and height
    fig.update_layout(title_text=f"All {dim}-D Plot", height=700)

    return fig

## 1-D scatter plots of each single feature

In [12]:
# Four single-feature plots on a 2 x 2 grid.
res_1d = build_nxm_combination(df_in=df_new, xyz_s=plot_axis[0], row_in=2, col_in=2)

In [13]:
res_1d

## 2-D scatter plots of every pair of features

In [14]:
# Six feature-pair plots on a 3 x 2 grid.
res_2d = build_nxm_combination(df_in=df_new, xyz_s=plot_axis[1], row_in=3, col_in=2)

In [15]:
res_2d

## 3-D scatter plots of every triple of features

In [16]:
# Four feature-triple plots on a 2 x 2 grid.
res_3d = build_nxm_combination(df_in=df_new, xyz_s=plot_axis[2], row_in=2, col_in=2)

In [17]:
res_3d

In [18]:
# Output directory for the exported figures and the accuracy table.
ASSETS_FOLDER = "./assets"
folder = Path(ASSETS_FOLDER)
# Create the directory (and any missing parents); do nothing if it already exists.
folder.mkdir(parents=True, exist_ok=True)

In [19]:
# Export each figure as a 1200-pixel-wide PNG into the assets folder.
res_1d.write_image(folder / "1d-plot.png", width=1200)
res_2d.write_image(folder / "2d-plot.png", width=1200)
res_3d.write_image(folder / "3d-plot.png", width=1200)

In [20]:
class KnnForIris:
    """Brute-force k-nearest-neighbour evaluation helpers for a labelled table.

    Every method is static; the class only serves as a namespace. All data
    frames are expected to carry the class label in a column named "Label".
    """

    # Label values that build_train_test_dataset() splits on.
    LABEL_GROUP = [1,2,3]

    @staticmethod
    def build_train_test_dataset(df_in:pd.DataFrame) -> dict[str, pd.DataFrame]:
        """Split every label group in half, in row order.

        Args:
            df_in: Data frame with a "Label" column holding LABEL_GROUP values.

        Returns:
            ``{"train": ..., "test": ...}`` where "train" holds the first
            ``size // 2`` rows of each label group and "test" the remainder.
        """
        train_parts = []
        test_parts = []
        for label in KnnForIris.LABEL_GROUP:
            group = df_in[df_in["Label"] == label]
            half = len(group) // 2
            train_parts.append(group[:half])
            test_parts.append(group[half:])

        return {
            "train": pd.concat(train_parts),
            "test": pd.concat(test_parts),
        }

    @staticmethod
    def fit(df_in:pd.DataFrame, x_list:pd.DataFrame , k:int) -> pd.DataFrame :
        """Predict a label for every row of ``x_list`` by k-nearest-neighbour vote.

        Args:
            df_in: Reference samples: the feature columns used by ``x_list``
                plus a "Label" column.
            x_list: Samples to classify. Every column except "Label" is used
                as a feature; a "Label" column, if present, is ignored so the
                true label can never leak into the distance.
            k: Number of nearest reference samples that vote.

        Returns:
            A data frame with a single "pred" column, one row per row of
            ``x_list`` in the same order (fresh 0-based index).

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")

        feature_cols = [col for col in x_list.columns if col != "Label"]
        labels = df_in["Label"]

        predictions = []
        for _, row in x_list.iterrows():
            # Euclidean distance from this sample to every reference sample.
            distance = sum(
                [(df_in[col] - row[col]) ** 2 for col in feature_cols]
            ) ** 0.5

            nearest = pd.DataFrame({
                "dis": distance,
                "Label": labels,
            }).sort_values(by="dis").head(k)

            # Majority vote; if several labels tie, the first entry returned
            # by Series.mode() wins.
            predictions.append(nearest["Label"].mode()[0])

        return pd.DataFrame({"pred": predictions})

    @staticmethod
    def _first_column(values) -> np.ndarray:
        """Return the first column of a 2-D input, or the flattened input otherwise."""
        arr = np.asarray(values)
        if arr.ndim == 2 and arr.shape[1] > 0:
            return arr[:, 0]
        return arr.ravel()

    @staticmethod
    def acc(predict:pd.DataFrame , true:pd.DataFrame) -> float :
        """Return the fraction of predictions equal to the true labels.

        Rows are compared by position (indexes are ignored), using the first
        column of each frame.

        Args:
            predict: Single-column frame of predicted labels.
            true: Single-column frame of true labels, same length.

        Returns:
            Accuracy in the range [0, 1].

        Raises:
            ValueError: If the inputs are empty or differ in length.
        """
        pred_values = KnnForIris._first_column(predict)
        true_values = KnnForIris._first_column(true)

        if pred_values.shape != true_values.shape:
            raise ValueError(
                f"predict has {pred_values.size} rows but true has {true_values.size}"
            )
        if true_values.size == 0:
            raise ValueError("cannot compute accuracy of an empty prediction set")

        return float(np.mean(pred_values == true_values))

    @staticmethod
    def get_acc_by_column(df_in:pd.DataFrame, x_list:pd.DataFrame,column:list[str], k:int):
        """Classify ``x_list`` from ``df_in`` using only ``column`` and score it.

        Args:
            df_in: Reference samples including a "Label" column.
            x_list: Samples to classify including their true "Label" column.
            column: Feature columns used for the distance.
            k: Number of neighbours that vote.

        Returns:
            Accuracy of the predictions against ``x_list["Label"]``.
        """
        pred_df = KnnForIris.fit(df_in=df_in[column + ["Label"]], x_list=x_list[column], k=k)
        true_df = x_list[["Label"]]
        return KnnForIris.acc(pred_df, true_df)

    @staticmethod
    def test_all_by_k(plot_axis_in:list , df_in:pd.DataFrame , k:int)->pd.DataFrame:
        """Evaluate every feature combination for one value of ``k``.

        Each combination is scored twice, once classifying the "test" half
        from the "train" half and once the other way round, and the two
        accuracies are averaged.

        Args:
            plot_axis_in: Feature combinations grouped in nested sequences
                (one inner sequence per combination size).
            df_in: Full data set with a "Label" column.
            k: Number of neighbours that vote.

        Returns:
            A data frame with a "Name" column (features joined by " x ") and an
            "Avg acc k=<k>" column holding the accuracy as a percentage string.
        """
        # Flatten the per-size groups into one list of combinations.
        combos = [list(combo) for group in plot_axis_in for combo in group]

        dataset = KnnForIris.build_train_test_dataset(df_in)
        train, test = dataset["train"], dataset["test"]

        acc_key = f"Avg acc k={k}"

        names = []
        scores = []
        for columns in combos:
            train_test_order_acc = KnnForIris.get_acc_by_column(train, test, columns, k)
            test_train_order_acc = KnnForIris.get_acc_by_column(test, train, columns, k)

            avg_acc = np.mean([train_test_order_acc, test_train_order_acc])

            names.append(" x ".join(columns))
            scores.append(f"{avg_acc*100:.2f}%")

        return pd.DataFrame({"Name": names, acc_key: scores})

    @staticmethod
    def test_all(plot_axis_in:list , df_in:pd.DataFrame) -> pd.DataFrame:
        """Evaluate every feature combination for k=1 and k=3, side by side.

        Returns:
            The two per-k tables inner-joined on "Name".
        """
        k_1 = KnnForIris.test_all_by_k(plot_axis_in, df_in, k=1)
        k_3 = KnnForIris.test_all_by_k(plot_axis_in, df_in, k=3)

        return pd.merge(k_1, k_3, on="Name", how="inner")

In [21]:
# Run the k-NN evaluation on the numerically labelled copy of the data.
result = KnnForIris.test_all(plot_axis_in=plot_axis, df_in=df_ori)

# Average accuracy table

In [22]:
result

Unnamed: 0,Name,Avg acc k=1,Avg acc k=3
0,Sepal length,58.00%,62.67%
1,Sepal width,46.00%,46.67%
2,Petal length,93.33%,94.00%
3,Petal width,95.33%,96.00%
4,Sepal length x Sepal width,70.00%,74.67%
5,Sepal length x Petal length,92.67%,92.67%
6,Sepal length x Petal width,87.33%,94.00%
7,Sepal width x Petal length,92.00%,92.00%
8,Sepal width x Petal width,93.33%,95.33%
9,Petal length x Petal width,95.33%,95.33%


In [25]:
# Save the accuracy table next to the figures, without the row index.
result.to_csv(folder / "Avg acc.csv", index=False)

In [24]:
# Render the accuracy table as a Markdown table and print it.
md = result.to_markdown()
print(md)

|    | Name                                                    | Avg acc k=1   | Avg acc k=3   |
|---:|:--------------------------------------------------------|:--------------|:--------------|
|  0 | Sepal length                                            | 58.00%        | 62.67%        |
|  1 | Sepal width                                             | 46.00%        | 46.67%        |
|  2 | Petal length                                            | 93.33%        | 94.00%        |
|  3 | Petal width                                             | 95.33%        | 96.00%        |
|  4 | Sepal length x Sepal width                              | 70.00%        | 74.67%        |
|  5 | Sepal length x Petal length                             | 92.67%        | 92.67%        |
|  6 | Sepal length x Petal width                              | 87.33%        | 94.00%        |
|  7 | Sepal width x Petal length                              | 92.00%        | 92.00%        |
|  8 | Sepal width x Petal wid