In [19]:
import pandas as pd
import numpy as np

In [20]:
def accuracy(test_preds_label, y_test):
    unique_labels = set(y_test) | set(test_preds_label)
    conf_matrix = {pred: {actual: 0 for actual in unique_labels} for pred in unique_labels}
    for pred, actual in zip(test_preds_label, y_test):
        conf_matrix[pred][actual] += 1
    correct = sum(conf_matrix[label][label] for label in unique_labels)
    total = sum(sum(inner.values()) for inner in conf_matrix.values())
    acc = correct / total
    print("Confusion Matrix:")
    for pred in sorted(unique_labels):
        row = [conf_matrix[pred][actual] for actual in sorted(unique_labels)]
        print(f"Pred {pred}: {row}")
    print("Accuracy :", acc)

In [21]:
def model_predict(x_test, cluster_arr, c1, c2, c3, train):
    cluster_to_label = {}
    for cluster_num in [1, 2, 3]:
        labels_in_cluster = train['species'].values[cluster_arr == cluster_num]
        if len(labels_in_cluster) > 0:
            label_counts = {}
            for label in labels_in_cluster:
                label_counts[label] = label_counts.get(label, 0) + 1
            most_common = max(label_counts, key=label_counts.get)
            cluster_to_label[cluster_num] = most_common

    test_preds_cluster = []
    for point in x_test:
        d1 = np.linalg.norm(point - c1)
        d2 = np.linalg.norm(point - c2)
        d3 = np.linalg.norm(point - c3)
        if d1 <= d2 and d1 <= d3:
            test_preds_cluster.append(1)
        elif d2 <= d1 and d2 <= d3:
            test_preds_cluster.append(2)
        else:
            test_preds_cluster.append(3)

    test_preds_label = [cluster_to_label[c] for c in test_preds_cluster]
    return test_preds_label

In [22]:
def model_fit(train):
    data = train.drop(columns=['species']).values.astype(float)

    c1 = train.drop(columns=['species']).iloc[0].values
    c2 = train.drop(columns=['species']).iloc[1].values
    c3 = train.drop(columns=['species']).iloc[2].values

    cluster_arr = np.full(shape=(data.shape[0],), fill_value=-1)
    for i in range(data.shape[0]):
        point = data[i]
        d1 = np.linalg.norm(point - c1)
        d2 = np.linalg.norm(point - c2)
        d3 = np.linalg.norm(point - c3)
        min_dist = min(d1, d2, d3)
        if min_dist == d1:
            cluster_arr[i] = 1
        elif min_dist == d2:
            cluster_arr[i] = 2
        else:
            cluster_arr[i] = 3

    while True:
        changed = 0
        new_cluster_arr = np.full(shape=(data.shape[0],), fill_value=-1)

        for i in range(data.shape[0]):
            point = data[i]
            d1 = np.linalg.norm(point - c1)
            d2 = np.linalg.norm(point - c2)
            d3 = np.linalg.norm(point - c3)

            if d1 <= d2 and d1 <= d3:
                new_cluster_arr[i] = 1
            elif d2 <= d1 and d2 <= d3:
                new_cluster_arr[i] = 2
            else:
                new_cluster_arr[i] = 3

            if new_cluster_arr[i] != cluster_arr[i]:
                changed += 1

        cluster_arr = new_cluster_arr.copy()

        c1_points = data[cluster_arr == 1]
        c2_points = data[cluster_arr == 2]
        c3_points = data[cluster_arr == 3]

        c1_past, c2_past, c3_past = c1.copy(), c2.copy(), c3.copy()

        if len(c1_points) > 0:
            c1 = np.mean(c1_points, axis=0)
        if len(c2_points) > 0:
            c2 = np.mean(c2_points, axis=0)
        if len(c3_points) > 0:
            c3 = np.mean(c3_points, axis=0)

        min_change_threshold = min(5, int(0.01 * data.shape[0]))
        if (
            np.linalg.norm(c1 - c1_past) < 0.01 and
            np.linalg.norm(c2 - c2_past) < 0.01 and
            np.linalg.norm(c3 - c3_past) < 0.01
        ) or changed <= min_change_threshold:
            break

    return cluster_arr, c1, c2, c3

In [23]:
df = pd.read_csv("IRIS.xls")
unique_species = df['species'].unique()
label_map = {label: idx for idx, label in enumerate(unique_species)}

df['species'] = df['species'].map(label_map)

In [24]:
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
train = df_shuffled.iloc[:120]
test = df_shuffled.iloc[120:]

In [25]:
x_test = test.drop(columns=['species']).values.astype(float)
y_test = test['species'].values

In [26]:
cluster_arr, c1, c2, c3 = model_fit(train)
test_preds_label = model_predict(x_test, cluster_arr, c1, c2, c3, train)

In [27]:
accuracy(test_preds_label, y_test)

Confusion Matrix:
Pred 0: [7, 0, 0]
Pred 1: [0, 11, 5]
Pred 2: [0, 0, 7]
Accuracy : 0.8333333333333334
