# Kmean Algorithm

## 1/ Import Library

In [68]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.metrics import accuracy_score

In [30]:
data = datasets.load_iris()

## 2/ Data Preprocessing

In [31]:
X, y = data.data, data.target

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

In [33]:
y_test

array([0, 2, 0, 1, 2, 2, 2, 0, 2, 0, 1, 0, 0, 0, 1, 2, 2, 1, 0, 1, 0, 1,
       2, 1, 0, 2, 1, 1, 0, 0])

# 3/ Build Kmean Model

### Steps

- Pick K points as the initial centroids from the data set, either randomly or the first K.

- Find the Euclidean distance of each point in the data set with the identified K points — cluster centroids.

- Assign each data point to the closest centroid using the distance found in the previous step.

- Find the new centroid by taking the average of the points in each cluster group.

- Repeat 2 to 4 for a fixed number of iteration or till the centroids don’t change.

In [34]:
# def Euclid_distance

In [71]:
class K_Means:
    def __init__(self, k=3, tol=0.001, max_iter=300):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter
        self.y_pred = []

    def fit(self,data):

        self.centroids = {}

        for i in range(self.k):
            self.centroids[i] = data[i]

        for i in range(self.max_iter):
            self.classifications = {}

            for i in range(self.k):
                self.classifications[i] = []

            for featureset in data:
                distances = [np.linalg.norm(featureset-self.centroids[centroid]) for centroid in self.centroids]
                classification = distances.index(min(distances))
                self.classifications[classification].append(featureset)

            prev_centroids = dict(self.centroids)

            for classification in self.classifications:
                self.centroids[classification] = np.average(self.classifications[classification],axis=0)

            optimized = True

            for c in self.centroids:
                original_centroid = prev_centroids[c]
                current_centroid = self.centroids[c]
                if np.sum((current_centroid-original_centroid)/original_centroid*100.0) > self.tol:
                    print(np.sum((current_centroid-original_centroid)/original_centroid*100.0))
                    optimized = False

            if optimized:
                break

    def predict(self,data):
        self.y_pred = []
        
        for item in data:
            distances = [np.linalg.norm(item-self.centroids[centroid]) for centroid in self.centroids]
            self.y_pred.append(distances.index(min(distances)))
        return self.y_pred
    def accuracy_score(self, y):
        return accuracy_score(y, self.y_pred)

In [72]:
model = K_Means()
model.fit(X_train)

39.21808171808176
6.957008508732689
3.0639249991458577
2.0278395332761177
4.177142066387263
1.4965407774334174
2.5416006288936552
2.231513454035605
5.532586259646415
2.315719385041501
5.68966378211523
3.890909526604365
3.081886950018035
1.6091272594724089
5.035746168951553
4.327374296556268
3.258467003008441
3.606502886356999
1.2626449997105142
3.5354787299760266
1.7141658734283467
0.6837222285962974


In [74]:
y_pred = model.predict(X_test)

In [75]:
model.accuracy_score(y_test)

0.9