In [None]:
from copy import deepcopy
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')

# Importing the dataset
data = pd.read_csv('xclara.csv')
print("Input Data and Shape")
print(data.shape)
data.head()

# Getting the values and plotting it
f1 = data['V1'].values
f2 = data['V2'].values
X = np.array(list(zip(f1, f2)))
plt.scatter(f1, f2, c='black', s=7)

# Euclidean Distance Caculator
def dist(a, b, ax=1):
    return np.linalg.norm(a - b, axis=ax)

# Number of clusters
k = 3
# X coordinates of random centroids
C_x = np.random.randint(0, np.max(X)-20, size=k)
# Y coordinates of random centroids
C_y = np.random.randint(0, np.max(X)-20, size=k)
C = np.array(list(zip(C_x, C_y)), dtype=np.float32)
print("Initial Centroids")
print(C)

# Plotting along with the Centroids
plt.scatter(f1, f2, c='#050505', s=7)
plt.scatter(C_x, C_y, marker='*', s=200, c='g')

# To store the value of centroids when it updates
C_old = np.zeros(C.shape)
# Cluster Lables(0, 1, 2)
clusters = np.zeros(len(X))
# Error func. - Distance between new centroids and old centroids
error = dist(C, C_old, None)
# Loop will run till the error becomes zero
while error != 0:
    # Assigning each value to its closest cluster
    for i in range(len(X)):
        distances = dist(X[i], C)
        cluster = np.argmin(distances)
        clusters[i] = cluster
    # Storing the old centroid values
    C_old = deepcopy(C)
    # Finding the new centroids by taking the average value
    for i in range(k):
        points = [X[j] for j in range(len(X)) if clusters[j] == i]
        C[i] = np.mean(points, axis=0)
    error = dist(C, C_old, None)

colors = ['r', 'g', 'b', 'y', 'c', 'm']
fig, ax = plt.subplots()
for i in range(k):
        points = np.array([X[j] for j in range(len(X)) if clusters[j] == i])
        ax.scatter(points[:, 0], points[:, 1], s=7, c=colors[i])
ax.scatter(C[:, 0], C[:, 1], marker='*', s=200, c='#050505')

In [32]:
# v2

# Euclidean Distance Caculator
def dist(a, b, ax=1):
    return np.linalg.norm(a - b, axis=ax)

def kmeans_2(k, file_name):
    data = pd.read_csv(file_name, header = None)

    x1 = data[0].values
    x2 = data[1].values
    X = np.array(list(zip(x1, x2)))
    
    plt.scatter(x1, x2, c='black', s=7)

    # X coordinates of random centroids
    C_x = np.random.randint(0, np.max(x1), size=k)
    # Y coordinates of random centroids
    C_y = np.random.randint(0, np.max(x2), size=k)
    
    C = np.array(list(zip(C_x, C_y)), dtype=np.float32)
    print("Initial Centroids")
    print(C)
    
    # Plotting along with the Centroids
    plt.scatter(x1, x2, c='#050505', s=7)
    plt.scatter(C_x, C_y, marker='*', s=200, c='g')

    # To store the value of centroids when it updates
    C_old = np.zeros(C.shape)
    # Cluster Lables(0, 1, 2)
    clusters = np.zeros(len(X))
    # Error func. - Distance between new centroids and old centroids
    error = dist(C, C_old, None)
    # Loop will run till the error becomes zero
    while error != 0:
        # Assigning each value to its closest cluster
        for i in range(len(X)):
            distances = dist(X[i], C)
            cluster = np.argmin(distances)
            clusters[i] = cluster
        # Storing the old centroid values
        C_old = deepcopy(C)
        # Finding the new centroids by taking the average value
        for i in range(k):
            points = [X[j] for j in range(len(X)) if clusters[j] == i]
            C[i] = np.mean(points, axis=0)
        error = dist(C, C_old, None)
    colors = ['r', 'g', 'b', 'y', 'c', 'm']
    fig, ax = plt.subplots()
    for i in range(k):
            points = np.array([X[j] for j in range(len(X)) if clusters[j] == i])
            ax.scatter(points[:, 0], points[:, 1], s=7, c=colors[i])
    ax.scatter(C[:, 0], C[:, 1], marker='*', s=200, c='#050505')
    print(X)
    print(clusters)
    Out = np.array(list(zip(X[:, 0], X[:, 1], clusters)))
    df = pd.DataFrame(Out)
    return df

In [34]:
df1 = kmeans(3, 'xclara_test.csv')

           0          1
0   2.072345  -3.241693
1  17.936710  15.784810
2   1.083576   7.319176
3  11.120670  14.406780
4  23.711550   2.557729
5  24.169930  32.024780
6  21.665780   4.892855
7   4.693684  12.342170
Initial Centroids
[[24.16993   24.16993  ]
 [12.084965  12.084965 ]
 [ 8.0566435  8.0566435]]



In [35]:
df1[2].value_counts()

0.0    8
Name: 2, dtype: int64

In [116]:
def distance(a, b, ax=1):
    return np.linalg.norm(a - b, axis=ax)
            
def get_new_labels_iteration(X, centroids):
    new_labels = [np.argmin(distance(X[i], centroids)) for i in range(len(X))]
    return new_labels

def kmeans(k, file_name):
    data = pd.read_csv(file_name, header = None)

    x1 = data[0].values
    x2 = data[1].values
    
    centroid_x = np.random.randint(0, np.max(x1), size=k)
    centroid_y = np.random.randint(0, np.max(x2), size=k)
    
    X = np.array(list(zip(x1, x2)))
    centroids = np.array(list(zip(centroid_x, centroid_y)))
    temp_centroids = np.zeros(centroids.shape)
    cluster_labels = np.zeros(len(X))

    error = distance(centroids, temp_centroids, None)

    while error != 0:
        cluster_labels = get_new_labels_iteration(X, centroids)
        temp_centroids = centroids.copy()
        for i in range(k):
            points = [X[j] for j in range(len(X)) if cluster_labels[j] == i]
            centroids[i] = np.mean(points, axis=0)
        error = distance(centroids, temp_centroids, None)
        
    Out = np.array(list(zip(X[:, 0], X[:, 1], cluster_labels)))
    df = pd.DataFrame(Out)
    return df

In [117]:
df2 = kmeans(3, 'xclara_test.csv')

TypeError: unsupported operand type(s) for -: 'list' and 'list'

In [115]:
df2[2].value_counts()

0.0    4
2.0    3
1.0    1
Name: 2, dtype: int64

In [106]:
df2

Unnamed: 0,0,1,2
0,2.072345,-3.241693,2.0
1,17.93671,15.78481,1.0
2,1.083576,7.319176,2.0
3,11.12067,14.40678,1.0
4,23.71155,2.557729,1.0
5,24.16993,32.02478,0.0
6,21.66578,4.892855,1.0
7,4.693684,12.34217,2.0


In [None]:
def distance(a, b, ax=1):
    return np.linalg.norm(a - b, axis=ax)

def get_new_centroid(X, i, cluster_labels):
    points = [X[j] for j in range(len(X)) if cluster_labels[j] == i]
    return np.mean(points, axis=0)            
            
def get_new_labels_iteration(X, centroids):
    new_labels = [np.argmin(distance(X[i], centroids)) for i in range(len(X))]
    return new_labels

def kmeans(k, file_name):
    data = pd.read_csv(file_name, header = None)

    x1 = data[0].values
    x2 = data[1].values
    
    centroid_x = np.random.randint(0, np.max(x1), size=k)
    centroid_y = np.random.randint(0, np.max(x2), size=k)
    
    X = np.array(list(zip(x1, x2)))
    centroids = np.array(list(zip(centroid_x, centroid_y)))
    temp_centroids = np.zeros(centroids.shape)
    cluster_labels = np.zeros(len(X))

    error = distance(centroids, temp_centroids, None)

    while error != 0:
        cluster_labels = get_new_labels_iteration(X, centroids)
        temp_centroids = centroids.copy()
        centroids = [get_new_centroid(X, i, cluster_labels) for i in range(k)]
        error = distance(centroids, temp_centroids, None)
        
    Out = np.array(list(zip(X[:, 0], X[:, 1], cluster_labels)))
    df = pd.DataFrame(Out)
    return df