In [2]:
from copy import deepcopy
import numpy as np
import pandas as pd
import time
from sklearn.datasets import make_blobs
from joblib import Parallel,delayed
# Importing the dataset
data = pd.read_csv('xclara.csv')
print("Input Data and Shape")
print(data.shape)
data.head()

# Getting the values and plotting it
f1 = data['V1'].values
f2 = data['V2'].values
X = np.array(list(zip(f1, f2)))


# Euclidean Distance Caculator
def dist(a, b, ax=1):
    return np.linalg.norm(a - b, axis=ax)

Input Data and Shape
(3000, 2)


In [16]:
# Number of clusters
k = 3
# X coordinates of random centroids
C_x = np.random.randint(0, np.max(X)-20, size=k)
# Y coordinates of random centroids
C_y = np.random.randint(0, np.max(X)-20, size=k)
C = np.array(list(zip(C_x, C_y)), dtype=np.float32)
print("Initial Centroids")
print(C)

# To store the value of centroids when it updates
C_old = np.zeros(C.shape)
# Cluster Lables(0, 1, 2)
clusters = np.zeros(len(X))
# Error func. - Distance between new centroids and old centroids
error = dist(C, C_old, None)
# Loop will run till the error becomes zero

def assignment(i,x):
    distances = dist(x[i], C)
    cluster = np.argmin(distances)
    clusters[i] = cluster
    return clusters[i]

def centroidcompute(i,x):
    points = [X[j] for j in range(len(X)) if clusters[j] == i]
    return np.mean(points, axis=0)


if __name__ == '__main__':
    start = time.time()
    while error != 0:
        #for i in range(len(X)):
            #distances = dist(X[i], C)
            #cluster = np.argmin(distances)
            #clusters[i] = cluster
            #assignment(i,X)
        clusters = Parallel(n_jobs=1)(delayed(assignment)(i,X) for i in range(len(X)))

        C_old = deepcopy(C)
        #for i in range(k):
            #for i in range(k):
                #points = [X[j] for j in range(len(X)) if clusters[j] == i]
                #C[i] = np.mean(points, axis=0)
        C = np.array(Parallel(n_jobs=-1)(delayed(centroidcompute)(i,X) for i in range(k)))
        error = dist(C, C_old, None)
            


    end = time.time()
    print('cluster time %.5f' %(end-start))
    print("Final Centroid Values")
    print("From scratch done by us:")
    print(C) # From Scratch

Initial Centroids
[[27. 82.]
 [49. 76.]
 [72. 33.]]
cluster time 1.50955
Final Centroid Values
From scratch done by us:
[[  9.4780459   10.686052  ]
 [ 40.68362784  59.71589274]
 [ 69.92418447 -10.11964119]]


In [3]:
'''
==========================================================
scikit-learn
==========================================================
'''

from sklearn.cluster import KMeans

start = time.time()
# Number of clusters
kmeans = KMeans(n_clusters=3)
# Fitting the input data
kmeans = kmeans.fit(X)
# Getting the cluster labels
labels = kmeans.predict(X)

# Centroid values
centroids = kmeans.cluster_centers_
end = time.time()
print('cluster time %.5f' %(end-start))
# Comparing with scikit-learn centroids

print("From scikit-learn package:")
print(centroids) # From sci-kit learn

cluster time 0.02091
From scikit-learn package:
[[ 40.68362784  59.71589274]
 [ 69.92418447 -10.11964119]
 [  9.4780459   10.686052  ]]


In [13]:
from matplotlib.path import Path
import time
import sys

# Check if one line segment contains another. 
def check_paths(path):
    res='no cross'
    for other_path in a:
        chck = Path(other_path)
        if chck.contains_path(path)==1:
            res= 'cross'
            break
    return res

if __name__ == '__main__':
    ## Create pairs of points for line segments
    a = zip(np.random.rand(5000,2),np.random.rand(5000,2))
    b = zip(np.random.rand(300,2),np.random.rand(300,2))
    now = time.time()
    for points in b:
        check_paths(Path(points))
    #res = Parallel(n_jobs=-1,backend='threading') (delayed(check_paths) (Path(points)) for points in b)
   
    print("Finished in", time.time()-now , "sec")

Finished in 0.08580350875854492 sec


In [34]:
from math import sqrt
now = time.time()
com = [sqrt(i ** 2) for i in range(10000000)]
print("for loop Finished in", time.time()-now , "sec")
now = time.time()
Parallel(n_jobs=3)(delayed(sqrt)(i ** 2) for i in range(1000000))
print("parallel Finished in", time.time()-now , "sec")

for loop Finished in 5.753977060317993 sec
parallel Finished in 24.810579776763916 sec
