In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
np.random.seed(42)

In [8]:
full_data = pd.read_csv("Iris.csv")

In [9]:
full_data.head(10)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
5,6,5.4,3.9,1.7,0.4,Iris-setosa
6,7,4.6,3.4,1.4,0.3,Iris-setosa
7,8,5.0,3.4,1.5,0.2,Iris-setosa
8,9,4.4,2.9,1.4,0.2,Iris-setosa
9,10,4.9,3.1,1.5,0.1,Iris-setosa


In [10]:
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2)**2))

In [11]:
class KMeans():
    def __init__(self, K=5, max_iters=100, plot_steps=False):
        self.K = K
        self.max_iters = max_iters
        self.plot_steps = plot_steps

        # list of sample indices for each cluster
        self.clusters = [[] for i in range(self.K)]
        self.centroids = []

    def predict(self, X):
        self.X = X
        self.n_samples, self.n_features = X.shape
        random_sample_idxs = np.random.choice(self.n_samples, self.K, replace=False)
        self.centroids = [self.X[idx] for idx in random_sample_idxs]

        # Optimize clusters
        for i in range(self.max_iters):
            # Assign samples to closest centroids (create clusters)
            self.clusters = self.create_clusters(self.centroids)
            
            if self.plot_steps:
                self.plot()

            # new centroids 
            centroids_old = self.centroids
            self.centroids = self.get_centroids(self.clusters)
            
            # check clusters change
            if self.is_converged(centroids_old, self.centroids):
                break

            if self.plot_steps:
                self.plot()

        # Classify use index 
        return self.get_cluster_labels(self.clusters)


    def get_cluster_labels(self, clusters):
       # cluster  assigned 
        labels = np.empty(self.n_samples)

        for cluster_idx, cluster in enumerate(clusters):
            for sample_index in cluster:
                labels[sample_index] = cluster_idx
        return labels

    def create_clusters(self, centroids):
        # closest centroid and cluster
        clusters = [[] for _ in range(self.K)]
        for idx, sample in enumerate(self.X):
            centroid_idx = self.closest_centroid(sample, centroids)
            clusters[centroid_idx].append(idx)
        return clusters

    def closest_centroid(self, sample, centroids):
        # smallest distance from centroids
        distances = [euclidean_distance(sample, point) for point in centroids]
        closest_index = np.argmin(distances)
        return closest_index

    def get_centroids(self, clusters):
        # mean cluster for centroid
        centroids = np.zeros((self.K, self.n_features))
        for cluster_idx, cluster in enumerate(clusters):
            cluster_mean = np.mean(self.X[cluster], axis=0)
            centroids[cluster_idx] = cluster_mean
        return centroids

    def is_converged(self, centroids_old, centroids):
        # overall change in centroids
        distances = [euclidean_distance(centroids_old[i], centroids[i]) for i in range(self.K)]
        return sum(distances) == 0

    def plot(self):
        fig, ax = plt.subplots(figsize=(12, 8))

        for i, index in enumerate(self.clusters):
            point = self.X[index].T
            ax.scatter(*point)

        for point in self.centroids:
            ax.scatter(*point, marker="x", color='black', linewidth=2)

        plt.show()

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
Xdata=full_data.to_numpy()
Xdata = Xdata[:,1:5]

Ydata=full_data.to_numpy()
Ydata = Ydata[:,5]

In [14]:
print(Xdata[1])
print(Ydata[1])

[4.9 3.0 1.4 0.2]
Iris-setosa


In [15]:
xtrain,ytrain,xtest,ytest = train_test_split(Xdata,Ydata,test_size=0.3)
clusters = len(np.unique(ytrain))

In [16]:
iris_clusters = KMeans(K=3,max_iters=20)

In [17]:
predict = iris_clusters.predict(xtrain)


In [18]:
len(predict)

105

In [19]:
store = predict

In [20]:
predict

array([0., 0., 2., 0., 2., 0., 2., 0., 1., 2., 0., 1., 1., 1., 0., 0., 1.,
       1., 1., 0., 1., 0., 2., 1., 0., 0., 1., 0., 0., 0., 0., 2., 0., 1.,
       0., 2., 1., 1., 0., 2., 1., 0., 1., 1., 0., 0., 2., 0., 2., 2., 0.,
       1., 1., 0., 2., 1., 1., 1., 0., 2., 1., 2., 2., 1., 0., 0., 0., 2.,
       2., 1., 2., 0., 2., 0., 0., 0., 1., 0., 0., 1., 0., 2., 2., 1., 0.,
       2., 2., 1., 2., 1., 2., 2., 2., 0., 2., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 2.])

In [21]:
xtest

array(['Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor',
       'Iris-virginica', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-versicolor', 'Iris-virginica', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor', 'Iris-virginica', 'Iris-setosa',
       'Iris-versicolor', 'Iris-virginica', 'Iris-setosa',
       'Iris-virginica', 'Iris-virginica', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor',
       'Iris-setosa', 'Iris-versicolor', 'Iris-virginica', 'Iris-setosa',
       'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor',
       'Iris-virginica', 'Iris-virginica', 'Iris-versicolor',
      

In [22]:
df = pd.DataFrame()

In [23]:
df["predict"]=store

In [24]:
df["value"]=xtest

In [25]:
df[df["predict"]==0.0]["value"].value_counts()

Iris-versicolor    34
Iris-virginica     11
Name: value, dtype: int64

In [26]:
df[df["predict"]==1.0]["value"].value_counts()

Iris-setosa    31
Name: value, dtype: int64

In [27]:
df[df["predict"]==2.0]["value"].value_counts()

Iris-virginica     26
Iris-versicolor     3
Name: value, dtype: int64

In [28]:
print("accuracy ratio is",(28+34+32)/105)

accuracy ratio is 0.8952380952380953


In [29]:
data= pd.read_csv("Mall_Customers.csv")

In [30]:
data.head(10)

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
5,6,Female,22,17,76
6,7,Female,35,18,6
7,8,Female,23,18,94
8,9,Male,64,19,3
9,10,Female,30,19,72


In [31]:
len(data["Spending Score (1-100)"].unique())

84

In [32]:
X2data=data.to_numpy()
X2data = X2data[:,2:5]

Y2data=data.to_numpy()
Y2data = Y2data[:,4]

In [None]:
len(data)

In [130]:
mall = KMeans(84,100)

In [131]:
predict=mall.predict(X2data)

In [132]:
predict

array([15.,  5., 33.,  5., 28.,  5., 33., 34., 73.,  1., 73., 69., 73.,
        5., 33.,  5., 28.,  1., 16., 69., 27., 70., 79., 70., 79., 34.,
       16., 81., 27., 34., 73., 42., 79., 55., 79., 58., 79.,  7., 48.,
       42.,  4., 47., 31., 29., 31., 29., 11., 17., 63., 63., 11., 29.,
       83., 80., 40., 31., 40.,  4., 83., 40., 64.,  3., 64., 80., 12.,
       22., 30.,  4., 22., 83., 64., 19., 12., 80., 18., 71., 30., 30.,
       71., 19., 18., 30.,  4., 19., 51., 19., 18., 51., 43., 44.,  8.,
       10., 44., 60., 59., 36., 44., 36., 44., 37., 10., 44.,  8., 56.,
       46., 10.,  8., 44., 62., 62.,  8., 65., 59., 37., 37., 37., 62.,
       38., 44., 38., 56., 59., 49., 75., 23., 13., 66., 75.,  6., 82.,
       24., 82., 23., 13., 57., 54., 24., 13., 57., 13., 25., 74., 23.,
       54., 57., 74., 66., 13.,  0., 74.,  0., 39.,  0., 61.,  0., 54.,
       68., 77., 68., 77.,  9., 54., 20., 74., 14., 61., 14., 67., 14.,
       78., 14., 77., 14., 67., 45., 67., 32., 77., 26., 67., 41

In [134]:
Y2data

array([39, 81, 6, 77, 40, 76, 6, 94, 3, 72, 14, 99, 15, 77, 13, 79, 35,
       66, 29, 98, 35, 73, 5, 73, 14, 82, 32, 61, 31, 87, 4, 73, 4, 92,
       14, 81, 17, 73, 26, 75, 35, 92, 36, 61, 28, 65, 55, 47, 42, 42, 52,
       60, 54, 60, 45, 41, 50, 46, 51, 46, 56, 55, 52, 59, 51, 59, 50, 48,
       59, 47, 55, 42, 49, 56, 47, 54, 53, 48, 52, 42, 51, 55, 41, 44, 57,
       46, 58, 55, 60, 46, 55, 41, 49, 40, 42, 52, 47, 50, 42, 49, 41, 48,
       59, 55, 56, 42, 50, 46, 43, 48, 52, 54, 42, 46, 48, 50, 43, 59, 43,
       57, 56, 40, 58, 91, 29, 77, 35, 95, 11, 75, 9, 75, 34, 71, 5, 88,
       7, 73, 10, 72, 5, 93, 40, 87, 12, 97, 36, 74, 22, 90, 17, 88, 20,
       76, 16, 89, 1, 78, 1, 73, 35, 83, 5, 93, 26, 75, 20, 95, 27, 63,
       13, 75, 10, 92, 13, 86, 15, 69, 14, 90, 32, 86, 15, 88, 39, 97, 24,
       68, 17, 85, 23, 69, 8, 91, 16, 79, 28, 74, 18, 83], dtype=object)

In [135]:
df2 = pd.DataFrame()

In [136]:
df2["spending"]=Y2data

In [137]:
df2["cluster"]=predict

In [138]:
df2

Unnamed: 0,spending,cluster
0,39,15.0
1,81,5.0
2,6,33.0
3,77,5.0
4,40,28.0
...,...,...
195,79,2.0
196,28,53.0
197,74,52.0
198,18,53.0


In [140]:
print(df2["cluster"].value_counts())

67.0    7
44.0    7
79.0    5
13.0    5
5.0     5
       ..
3.0     1
39.0    1
49.0    1
65.0    1
35.0    1
Name: cluster, Length: 84, dtype: int64
