# K-means clustering

Използвайте алгоритъма kMeans, за да определите автоматично групи от сходни елементи в наборите от данни normal.txt и unbalance.txt.

Вход:
Име на файл и брой клъстери 

Изход:
Картинка, която показва клъстерите в различни цветове. (Всички примери от приложените бази са описани чрез два атрибута - x и y, които са разположението на точката в евклидовото пространство.)

* В решението приложете Random Restart с оценка колко добри са постигнатите клъстери. Като оценка можете да ползвате "вътрешно клъстерно разстояние", "междуклъстерно разстояние" и комбинация от двете. Опитайте се да сравните резултатите (можете да сравните и с оценка, различна от предложените).

* Като допълнение можете да приложите ***kMeans++*** и да сравните резултатите.
* Като допълнение можете да приложите ***Soft kMeans*** и да сравните резултатите. 


In [2]:
import argparse
import math
from typing import Dict, List, Optional

import matplotlib.pyplot as plt
import numpy as np

filepath: str = "./unbalance.txt"
# filepath: str = "./normal.txt"

K: int = 3  # centroid (cluster) count


def load_points(path: str) -> np.ndarray:
    """Read whitespace-separated numeric rows from a text file.

    Each non-blank line becomes one row of floats. Blank or
    whitespace-only lines (for example a trailing newline at the end of
    the file) are skipped so they cannot produce an empty row.

    Args:
        path: Path of the text file to read.

    Returns:
        Array with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If a token is not a valid float, or if the rows do
            not all have the same number of values.
    """
    with open(path, 'r') as file:
        rows = [list(map(float, line.split()))
                for line in file if line.strip()]
    return np.array(rows)


try:
    points: np.ndarray = load_points(filepath)
except (OSError, ValueError) as e:
    # Best effort: report the problem and continue with an empty data set
    # instead of aborting the whole cell.
    print(f"Reading error: {e}")
    points = np.array([])

print(points.shape)


class KMeans:
    """K-means clustering (Lloyd's algorithm) over a fixed set of points.

    Attributes are plain NumPy arrays:
        points: The data, one row per point.
        centroids: Current cluster centres, one row per cluster.
        assignments: For every point, the row index of its centroid in
            ``centroids``; ``-1`` means "not assigned yet".
    """

    def __init__(self,
                 points: np.ndarray, k: Optional[int] = None) -> None:
        """Store the data and pick ``k`` initial centroids with k-means++.

        Args:
            points: Data to cluster, one row per point. A 1-D array is
                treated as a column of one-dimensional points.
            k: Number of clusters. When omitted, the module-level ``K`` is
                read at call time, so re-running the configuration cell
                takes effect without redefining the class.

        Raises:
            ValueError: If ``k`` is smaller than 1 or larger than the
                number of points (which includes an empty data set).
        """
        if k is None:
            # Inside a method, ``K`` is the module-level constant, not the
            # ``K`` property defined further down in this class.
            k = K
        data = np.asarray(points, dtype=float)
        if data.ndim == 1:
            data = data.reshape(-1, 1)
        self.points: np.ndarray = data
        self.centroids: np.ndarray = np.empty((0, data.shape[1]))
        self.assignments: np.ndarray = np.full(len(data), -1, dtype=int)
        self.kmpp_init(k)

    @property
    def K(self) -> int:
        """Number of centroids currently held."""
        return len(self.centroids)

    def _check_k(self, k: int) -> int:
        """Validate a requested cluster count and return the point count."""
        n = len(self.points)
        if not 1 <= k <= n:
            raise ValueError(
                f"k must be between 1 and the number of points ({n}), got {k}")
        return n

    def _distances_to_centroids(self) -> np.ndarray:
        """Return the (points x centroids) matrix of Euclidean distances."""
        centroids = np.asarray(self.centroids, dtype=float)
        diffs = self.points[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        return np.linalg.norm(diffs, axis=2)

    def rand_init(self, k: int) -> None:
        """Use ``k`` distinct, uniformly chosen data points as centroids.

        Existing assignments are reset because they refer to the old
        centroids.

        Raises:
            ValueError: If ``k`` is not between 1 and the number of points.
        """
        n = self._check_k(k)
        chosen = np.random.choice(n, k, replace=False)
        self.centroids = self.points[chosen]
        self.assignments = np.full(n, -1, dtype=int)

    def assign(self) -> None:
        """Label every point with the index of its nearest centroid.

        Ties go to the centroid with the lowest index.
        """
        self.assignments = np.argmin(self._distances_to_centroids(), axis=1)

    def compute_centroids(self) -> None:
        """Move each centroid to the mean of the points assigned to it.

        A centroid with no assigned points keeps its current position.
        """
        labels = np.asarray(self.assignments)
        updated = np.array(self.centroids, dtype=float)  # work on a copy
        for cluster_id in range(len(updated)):
            members = self.points[labels == cluster_id]
            if len(members):
                updated[cluster_id] = members.mean(axis=0)
        self.centroids = updated

    def run(self, max_iter: int = 10, *, verbose: bool = True) -> None:
        """Alternate assignment and centroid updates.

        Stops after ``max_iter`` rounds, or earlier once a round leaves
        every assignment unchanged (the algorithm has converged).

        Args:
            max_iter: Upper bound on the number of rounds.
            verbose: When true, print the intra-cluster distance at the
                start of every round.
        """
        for _ in range(max_iter):
            if verbose:
                print(self.intra_cluster_distance)
            previous = self.assignments
            self.assign()
            self.compute_centroids()
            if np.array_equal(previous, self.assignments):
                break

    @property
    def intra_cluster_distance(self) -> float:
        """Sum over all points of the distance to the nearest centroid.

        Lower is better. The nearest centroid is used rather than the
        stored assignment so the value is defined before the first call
        to ``assign``.
        """
        return float(self._distances_to_centroids().min(axis=1).sum())

    def kmpp_init(self, k: int) -> None:
        """Choose ``k`` initial centroids with k-means++ seeding.

        The first centroid is a uniformly random data point. Each further
        one is a data point drawn with probability proportional to its
        squared distance from the nearest centroid chosen so far.
        Existing assignments are reset.

        Raises:
            ValueError: If ``k`` is not between 1 and the number of points.
        """
        n = self._check_k(k)
        chosen: List[int] = [int(np.random.choice(n))]
        # Squared distance from every point to its nearest chosen centroid,
        # updated incrementally instead of recomputed for all centroids.
        closest_sq = np.sum((self.points - self.points[chosen[0]]) ** 2,
                            axis=1)

        for _ in range(k - 1):
            total = closest_sq.sum()
            if total > 0:
                # Already chosen points have weight 0 and cannot be drawn.
                next_id = int(np.random.choice(n, p=closest_sq / total))
            else:
                # Every point coincides with a chosen centroid, so the
                # weights are all zero; pick uniformly among the rest.
                remaining = np.setdiff1d(np.arange(n), chosen)
                next_id = int(np.random.choice(remaining))
            chosen.append(next_id)
            next_sq = np.sum((self.points - self.points[next_id]) ** 2,
                             axis=1)
            closest_sq = np.minimum(closest_sq, next_sq)

        self.centroids = self.points[chosen]
        self.assignments = np.full(n, -1, dtype=int)

    def plot_clusters(self) -> None:
        """Scatter-plot the points coloured by cluster, plus the centroids.

        Uses the first two columns of the data. Colours repeat when there
        are more clusters than palette entries.
        """
        labels = np.asarray(self.assignments)
        colors = ['red', 'green', 'blue', 'orange', 'purple', 'brown']

        for i, cluster_id in enumerate(np.unique(labels)):
            cluster_points = self.points[labels == cluster_id]
            plt.scatter(cluster_points[:, 0], cluster_points[:, 1],
                        c=colors[i % len(colors)],
                        label=f'Cluster {cluster_id}')

        plt.scatter(self.centroids[:, 0], self.centroids[:, 1], marker='X', c='black', label='Centroids', s=100)
        plt.title('K-Means Clustering')
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')
        plt.legend()
        plt.show()


# temp = KMeans(points)
# temp.assign()
# # print(temp.assignments)
# temp.run()
# uv, mp = np.unique(temp.assignments, return_inverse=True, axis=0)
# print(uv, len(uv))
# print(temp.assignments)

# colors = ['red', 'green', 'blue']

# temp.plot_clusters()

# print(temp.K)
# def rand_init(n: int, dim: int) -> np.ndarray:
#     return np.random.rand(n, dim)

# points: np.ndarray = rand_init(10, 2)

# x_coordinates = points[:, 0]
# y_coordinates = points[:, 1]

# plt.scatter(x_coordinates, y_coordinates, marker='o', color='blue', label='Points')

# centroids: np.ndarray

# print(points)
# print(np.linalg.norm(points[1] - points[2]))


(6500, 2)
