In [2]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd

style.use('ggplot')

In [1]:
# ================================================================================================================
# ----------------------------------------------------------------------------------------------------------------
#									CLUSTERING
# ----------------------------------------------------------------------------------------------------------------
# ================================================================================================================

# K-means clustering applied to normalized IPL player data.

class K_Means:
    """Minimal k-means clustering on NumPy data.

    Parameters
    ----------
    k : int
        Number of clusters to form.
    tolerance : float
        Convergence threshold. Fitting stops once, for every centroid, the
        summed absolute percentage change of its coordinates between two
        consecutive iterations is at most this value.
    max_iterations : int
        Upper bound on the number of assign/update rounds.

    Attributes (set by ``fit``)
    ---------------------------
    centroids : dict
        Maps cluster index (0 .. k-1) to its centroid as a float array.
    classes : dict
        Maps cluster index to the list of samples assigned to it during the
        last iteration that was run.
    """

    def __init__(self, k=3, tolerance=0.0001, max_iterations=500):
        self.k = k
        self.tolerance = tolerance
        self.max_iterations = max_iterations

    def fit(self, data):
        """Cluster ``data`` and store the result in ``centroids`` / ``classes``.

        Parameters
        ----------
        data : array-like
            Samples, one per row. Anything ``np.asarray`` accepts.

        Raises
        ------
        IndexError
            If ``data`` has fewer than ``k`` rows (the first ``k`` rows seed
            the centroids).
        """
        data = np.asarray(data)

        self.centroids = {}

        # The first k samples seed the centroids. Copy them as floats so the
        # centroids never alias the caller's data and always share one dtype.
        for i in range(self.k):
            self.centroids[i] = np.array(data[i], dtype=float)

        # Lower bound for the denominator of the relative change, so a
        # centroid coordinate of exactly 0 cannot cause a division by zero.
        eps = np.finfo(float).eps

        for _ in range(self.max_iterations):
            self.classes = {label: [] for label in range(self.k)}

            # Assignment step: each sample joins its nearest centroid.
            for features in data:
                self.classes[self.pred(features)].append(features)

            previous = dict(self.centroids)

            # Update step: move each centroid to the mean of its members.
            # A cluster that received no samples keeps its previous centroid
            # instead of becoming NaN (the mean of an empty list).
            for label, members in self.classes.items():
                if members:
                    self.centroids[label] = np.mean(members, axis=0)

            # Convergence check on the *absolute* percentage change: a signed
            # sum would treat any move toward smaller values as "no change"
            # and let opposite moves cancel each other out.
            converged = True
            for label, current in self.centroids.items():
                before = previous[label]
                scale = np.maximum(np.abs(before), eps)
                change = np.sum(np.abs(current - before) / scale * 100.0)
                if change > self.tolerance:
                    converged = False
                    break

            if converged:
                break

    def pred(self, data):
        """Return the index of the centroid nearest to ``data``.

        ``data`` may be a single sample (array-like) or a scalar for
        one-feature models. Ties resolve to the lowest cluster index.
        """
        distances = [np.linalg.norm(data - self.centroids[centroid]) for centroid in self.centroids]
        return distances.index(min(distances))


def main(csv_path='/home/ifte/Downloads/ipl.csv'):
    """Load the 'one' column of the CSV, cluster it with K_Means(3) and plot it.

    Parameters
    ----------
    csv_path : str
        Location of the CSV file. It must contain a column named 'one'.
        Defaults to the path originally hard-coded here.

    Only one column is loaded, so every sample has a single feature.
    Such points are drawn on the horizontal line y = 0; indexing a second
    coordinate would raise an IndexError.
    """
    df = pd.read_csv(csv_path)
    df = df[['one']]
    print(df.head())

    # Cast once to float; this also fails early on non-numeric data.
    X = df.astype(float).values
    print(X)
    km = K_Means(3)
    km.fit(X)

    # Plotting starts here
    colors = ["r", "g", "c", "b", "k"]

    def _xy(point):
        """Return (x, y) plot coordinates; single-feature points get y = 0."""
        point = np.atleast_1d(point)
        return point[0], (point[1] if point.size > 1 else 0.0)

    for centroid in km.centroids.values():
        cx, cy = _xy(centroid)
        plt.scatter(cx, cy, s=130, marker="x")

    for classification, members in km.classes.items():
        if not members:
            continue  # nothing to draw for an empty cluster
        # Cycle through the palette so any number of clusters gets a color.
        color = colors[classification % len(colors)]
        xs, ys = zip(*(_xy(features) for features in members))
        plt.scatter(xs, ys, color=color, s=30)

    plt.show()


# if __name__ == "__main__":
#     main()

In [4]:
# Read the CSV and keep only the 'one' column as a single-column DataFrame.
df = pd.read_csv('/home/ifte/Downloads/ipl.csv')[['one']]
print(df.head())

# The same values as a nested Python list of floats.
dataset = df.astype(float).values.tolist()

# Fit a 10-cluster model on the underlying NumPy array.
X = df.values
km = K_Means(10)
km.fit(X)


        one
0  0.227680
1  0.979188
2  0.504577
3  0.058132
4  0.775343


In [11]:
# Classify a single value: pred returns the index of the centroid nearest to 0.9.
km.pred(0.9)

1

In [14]:
# Inspect the fitted centroids (cluster index -> centroid array).
km.centroids

{0: array([0.2587777]),
 1: array([0.90913727]),
 2: array([0.50308975]),
 3: array([0.15295676]),
 4: array([0.79406684]),
 5: array([0.56007846]),
 6: array([0.06808672]),
 7: array([0.62449962]),
 8: array([0.70119194]),
 9: array([0.01367519])}

In [6]:
# Show the samples assigned to each cluster in the last iteration run by fit.
km.classes

{0: [array([0.22767982]),
  array([0.29440819]),
  array([0.3078315]),
  array([0.28369001]),
  array([0.21885608]),
  array([0.21326108]),
  array([0.25948656]),
  array([0.26500836])],
 1: [array([0.97918822]),
  array([0.88606507]),
  array([0.92022078]),
  array([0.86314456]),
  array([0.93502506]),
  array([0.87117996])],
 2: [array([0.5045766]),
  array([0.50777585]),
  array([0.46750149]),
  array([0.52603056]),
  array([0.50327806]),
  array([0.51188718]),
  array([0.46927064]),
  array([0.491552]),
  array([0.5231896]),
  array([0.52210466]),
  array([0.50682057])],
 3: [array([0.18676782]),
  array([0.1434226]),
  array([0.15755852]),
  array([0.11119696]),
  array([0.16583791])],
 4: [array([0.77534302]),
  array([0.75898912]),
  array([0.77913749]),
  array([0.758129]),
  array([0.84201324]),
  array([0.81483483]),
  array([0.80922928]),
  array([0.83377219]),
  array([0.77515337])],
 5: [array([0.55042383]),
  array([0.58576994]),
  array([0.54860594]),
  array([0.53311287