In [2]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter


# Read the dataset description file in one call and show it.
with open('data/iris.names', "r") as names_file:
    dialog = names_file.read()
print(dialog)

ModuleNotFoundError: No module named 'numpy'

In [None]:
# The same data is published by UCI at
# https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
# but the local copy is what gets read here.
# header=None: the first line of the file is data, not column names.
iris_df = pd.read_csv(filepath_or_buffer='data/iris.data', header=None)
print(iris_df.head(n=5))

In [None]:
# Report (rows, columns) of the loaded frame.
print(iris_df.shape)

In [None]:
# Extract the ground-truth labels: the fifth column (position 4).
# to_numpy() is used instead of .values so the result is always a NumPy
# ndarray, whatever dtype pandas inferred for the column.
y = iris_df.iloc[:, 4].to_numpy()

# Extract the measurements (sepal length, petal length, ...): the first four columns.
X = iris_df.iloc[:, :4].to_numpy()

In [None]:
# Shapes of the feature matrix and of the label vector.
print(X.shape)
print(y.shape)

In [None]:
# Distinct class names in order of first appearance in y.
# dict.fromkeys de-duplicates while keeping insertion order, so the
# name -> index mapping is identical on every run. Iterating set(y) instead
# can reorder the names between sessions because string hashing is randomized.
label_lst = list(dict.fromkeys(y))
label_dict = {label_name: label_index for label_index, label_name in enumerate(label_lst)}

# Integer-encoded labels, aligned element by element with y.
Y = np.array([label_dict[name] for name in y])
print(label_lst)
print(label_dict)
print(Y)

In [None]:
from collections import defaultdict

class OneHotEncoder(object):
    r"""
    Map class names to one-hot row vectors.

    Column indices are assigned in order of first appearance in ``class_lst``,
    so the encoding is the same on every run.

    Args :
        class_lst : iterable of hashable class names, duplicates are allowed
    Attributes :
        word2idx       : dict, class name -> column index
        one_hot_matrix : np.array(C, C) identity matrix, C is number of distinct classes
    """
    def __init__(self, class_lst:list= None )->None:
        assert class_lst is not None, "class_lst must be provided"
        # A plain dict: looking up an unknown class raises KeyError.
        self.word2idx = {}

        for name in class_lst:
            if name not in self.word2idx:
                # The next free index equals the number of classes seen so far.
                self.word2idx[name] = len(self.word2idx)

        self.make_matrix(len(self.word2idx))

    def make_matrix(self, num):
        r"""
        Store a (num, num) identity matrix; row i is the one-hot vector of index i.
        """
        self.one_hot_matrix = np.eye(num)

    def __call__(self, cls):
        r"""
        Args :
            cls : a class name that was present in class_lst
        Returns:
            np.array(C,) one-hot vector of cls
        Raises :
            KeyError when cls was not present in class_lst
        """
        return self.one_hot_matrix[self.word2idx[cls]]

encoder = OneHotEncoder(y)
# Keep the one-hot matrix under its own name. Y (the integer labels built from
# label_dict) is what the split, KNN and Accuracy cells consume; overwriting it
# here would hand them 2-D one-hot rows, which KNN cannot count or map back to
# a class name.
Y_onehot = np.array([encoder(i) for i in y])
print(Y_onehot)
print(Y_onehot.shape)

In [None]:
# Raw class-name labels, one per sample.
print(y)

In [None]:
%matplotlib inline

rows,cols = X.shape
# Every unordered pair of feature columns; with the four iris features this
# gives six pairs, one per subplot of the 2x3 grid.
index_lst = [[i, j] for i in range(0, cols) for j in range(i+1, cols)]
colors = ["r","g","b"]
# Classes in order of first appearance in y, so each class keeps the same
# color on every run (iterating set(y) can reorder them between sessions).
labels = list(dict.fromkeys(y))
names  = ["Sepal Length", "Sepal Width", "Petal Length", "Petal Width"]
print(index_lst)

fig, ax = plt.subplots(2,3, figsize=(16,8))

# ax.flat walks the grid row by row, which matches the order of index_lst.
for panel, (i, j) in zip(ax.flat, index_lst):
    for idx, label in enumerate(labels):
        # Rows of this class, restricted to the two features of this panel.
        data = X[y == label][:, [i, j]]
        panel.scatter(data[:,0], data[:,1], c=colors[idx], label=label)

    # Axis labels depend only on the feature pair, so they are set once per panel.
    panel.set_xlabel(names[i])
    panel.set_ylabel(names[j])
    panel.legend()
    panel.grid(True)
plt.show()



In [None]:
class randomSplit(object):
    r"""
    Stratified random train/validation split.

    Sample indices are grouped by label, every group is shuffled on its own
    with np.random.shuffle, and the first ``split_rate`` fraction of each group
    goes to the training set. Classes are visited in order of first appearance
    in ``labels``.

    Args :
        datasets   : np.array(N, D), N is number of samples
        labels     : np.array(N,) class labels, or np.array(N, C) with one label row per sample
        split_rate : fraction of every class that is used for training
    Attributes :
        train_lst : np.array of sample indices in the training split
        valid_lst : np.array of sample indices in the validation split
    """
    def __init__(self, datasets:np.array, labels:np.array, split_rate:float=0.8)->None:
        self.datasets = datasets
        self.labels   = labels
        self.split_rate = split_rate
        self.make_index()

    def make_index(self):
        r"""
        Build train_lst and valid_lst from the labels.
        """
        labels = np.asarray(self.labels)
        if labels.size == 0:
            # Nothing to split.
            self.train_lst = np.empty(0, dtype=np.intp)
            self.valid_lst = np.empty(0, dtype=np.intp)
            return

        if labels.ndim > 1:
            # Row-wise labels (e.g. one-hot): samples share a class when their rows match.
            _, first_seen, class_of = np.unique(
                labels.reshape(len(labels), -1), axis=0,
                return_index=True, return_inverse=True)
        else:
            _, first_seen, class_of = np.unique(
                labels, return_index=True, return_inverse=True)
        # Flatten defensively: the inverse must be 1-D for the comparison below.
        class_of = np.reshape(class_of, -1)

        train_parts, valid_parts = [], []
        # argsort(first_seen) lists the class ids by where they first occur.
        for cls in np.argsort(first_seen):
            cls_idx = np.flatnonzero(class_of == cls)
            np.random.shuffle(cls_idx)

            # Compute Split Value
            train_value = int(len(cls_idx) * self.split_rate)
            train_parts.append(cls_idx[:train_value])
            valid_parts.append(cls_idx[train_value:])

        # Make list
        self.train_lst = np.concatenate(train_parts)
        self.valid_lst = np.concatenate(valid_parts)

    def shuffle(self):
        r"""
        Shuffle the order of the indices inside each split, in place.
        """
        np.random.shuffle(self.train_lst)
        np.random.shuffle(self.valid_lst)

    def __call__(self):
        r"""
        Returns:
            ((train data, train labels), (valid data, valid labels)),
            obtained by indexing datasets and labels with train_lst / valid_lst
        """
        return (self.datasets[self.train_lst, :], self.labels[self.train_lst]), \
                (self.datasets[self.valid_lst, :], self.labels[self.valid_lst])


# Fix the global NumPy seed so the split, and every accuracy computed from it,
# is the same on each Restart & Run All.
np.random.seed(42)

dataloader = randomSplit(X, Y)
train, valid = dataloader()
# A bare expression in the middle of a cell displays nothing, so print explicitly.
print(dataloader.train_lst)
print(train)

In [None]:
def euclidean_distance(x:np.array, y:np.array) -> np.array:
    r"""
    Euclidean distance between x and every row of y.

    Args :
        x : np array(4, )
        y : np.array(B, 4), B is Number of Training Dataset;
            a single sample np.array(4, ) is accepted as well
    Returns:
        np.array(B,), or a scalar when y is a single sample
    """
    diff = np.asarray(x) - np.asarray(y)
    # Reduce over the last (feature) axis so that 1-D and 2-D y both work.
    return np.sqrt(np.sum(np.square(diff), axis=-1))




In [None]:
class KNN(object):
    r"""
    Simple K-NN algorithm for iris dataset.
    Just using numpy and Euclidean Distance

    Args :
        k        : number of nearest neighbours that vote
        datasets : np.array(B, 4) training samples, B is Number of Training Dataset
        labels   : np.array(B,) integer class index of every training sample
    """
    def __init__(self, k:int, datasets:np.array, labels:np.array)->None:
        self.k = k 
        self.datasets = datasets
        self.labels   = labels
        self.compute  = euclidean_distance
        # Inverse of the notebook-level label_dict: class index -> class name.
        self.label2name  = {j:i for i, j in label_dict.items()}

    def __call__(self, data:np.array)->np.array:
        r"""
        Classify one sample by majority vote among its k nearest training samples.

        Args :
            data : np.array(4, ) sample to classify
        Returns:
            (class index, class name) of the winning class
        """
        distance  = self.compute(data, self.datasets)
        # Indices of the k smallest distances, nearest first.
        score_idx = np.argsort(distance)[:self.k]
        k_labels  = self.labels[score_idx]
        k_counts  = Counter(k_labels)
        # most_common(1) picks the label with the highest vote count. Equal
        # counts keep first-encountered order, and k_labels is ordered nearest
        # first, so a tie goes to the class of the closest neighbour.
        inf_labels = k_counts.most_common(1)[0][0]
        inf_name  = self.label2name[inf_labels]
        return inf_labels, inf_name


# Sanity check on one sample: the prediction for the first validation row,
# followed by that row's true label.
knn = KNN(k=3, datasets=train[0], labels=train[1])
print(knn(data=valid[0][0]))
print(valid[1][0])

In [None]:

rows,cols = X.shape
# Every unordered pair of feature columns; with the four iris features this
# gives six pairs, one per subplot of the 2x3 grid.
index_lst = [[i, j] for i in range(0, cols) for j in range(i+1, cols)]
# The last color ("k") is reserved for the highlighted validation sample.
colors = ["r","g","b", "k"]
# Classes in order of first appearance in y, so each class keeps the same
# color on every run (iterating set(y) can reorder them between sessions).
labels = list(dict.fromkeys(y))
names  = ["Sepal Length", "Sepal Width", "Petal Length", "Petal Width"]
print(index_lst)

fig, ax = plt.subplots(2,3, figsize=(16,8))

# The validation sample highlighted in every panel: row 0 of the validation features.
valid_sample = valid[0][0]

# ax.flat walks the grid row by row, which matches the order of index_lst.
for panel, (i, j) in zip(ax.flat, index_lst):
    for idx, label in enumerate(labels):
        # Rows of this class, restricted to the two features of this panel.
        data = X[y == label][:, [i, j]]
        panel.scatter(data[:,0], data[:,1], c=colors[idx], label=label)

    # Both coordinates must come from the same sample; taking x from one
    # validation row and y from another would plot a point that is not in the data.
    panel.scatter(valid_sample[i], valid_sample[j], c=colors[-1], label="validation")

    # Axis labels depend only on the feature pair, so they are set once per panel.
    panel.set_xlabel(names[i])
    panel.set_ylabel(names[j])
    panel.legend()
    panel.grid(True)
plt.show()

In [None]:
def Accuracy(x:np.array, y:np.array) -> float:
    r"""
    Accuracy metric: percentage of inferred labels that equal the real labels.
    Args :
        x : np.array(B,) has inference labels, B is number of dataset
        y : np.array(B,) has real labels, B is number of dataset
    Return : 
        float percentage between 0 and 100; 0.0 when there are no samples
    Raises :
        ValueError when x and y do not have the same shape
    """
    # Convert first: comparing two plain lists with == yields a single bool,
    # not an element-wise result.
    x = np.asarray(x)
    y = np.asarray(y)
    if x.shape != y.shape:
        raise ValueError(f"x and y must have the same shape, got {x.shape} and {y.shape}")

    n = x.size
    if n == 0:
        # No samples: avoid dividing by zero.
        return 0.0
    count = np.count_nonzero(x == y)
    return (count/n) * 100

In [None]:
# Show the first (features, label) pair of the validation split.
next(zip(valid[0], valid[1]))

In [None]:
knn = KNN(3, train[0], train[1])

# Predicted class index for every validation sample. knn(...) returns
# (class index, class name) and only the index is scored. The comprehension
# keeps the per-sample label and name out of the notebook namespace, where
# `labels` and `names` are already used by the plotting cells.
inference = np.array([knn(sample)[0] for sample in valid[0]])

print(f"붓꽃 데이터셋의 최종 성능 : {Accuracy(inference, valid[1]):.4f} %")

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Reference model from scikit-learn: 5 neighbours, Minkowski metric with p=2.
# fit() returns the fitted estimator, so construction and fitting are chained.
knn = KNeighborsClassifier(metric="minkowski", p=2, n_neighbors=5).fit(train[0], train[1])
knn.predict(valid[0])