Homework 3

In [1]:
# Import Packages
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors
import random
import time

In [2]:
def readVectorsSeq(filename):
    file = open(filename, 'r')
    vector_list = []
    for row in file.readlines():
        vector_list.append(Vectors.dense([float(num_str) for num_str in row.split()]))
    return vector_list

In [3]:
vector_list = readVectorsSeq('test-datasets/prova.txt')

# Print Info of the Dataset
print('INFO OF THE DATASET:')
print('- Number of points:', len(vector_list))
print('- Dimensionality of a Point:', len(vector_list[0]))

INFO OF THE DATASET:
- Number of points: 11
- Dimensionality of a Point: 50


Implementation of *kcenter* using the Farthest-First Traversal

In [52]:
def findFarthest(a,P):
    dist = 0
    index = 0
    for idx,el in enumerate(P):
        temp_dist = a.squared_distance(el)
        if temp_dist > dist:
            dist = temp_dist
            index = idx
    return P[index], index

def kcenter(P,k):
    C = []
    C_idx = []
    C.append(P[0])
    C_idx.append(0)
    for i in range(k-1):
        new_centroid, new_idx = findFarthest(C[-1],[x for idx,x in enumerate(P) if idx != C_idx])
        C.append(new_centroid)
        C_idx.append(new_idx)
    return C    

In [53]:
findFarthest(vector_list[0],[x for idx,x in enumerate(vector_list) if idx != 0])

(DenseVector([-0.5222, 0.4912, 0.4791, -0.8636, -2.5391, -2.9868, 0.58, -0.9677, -0.6864, 0.5162, -0.8788, -1.0081, 0.8865, 0.2876, 0.0573, 1.4542, 0.3276, -0.6853, 0.6394, 1.0876, -0.1458, -0.2659, 1.0826, -0.5432, -0.0359, 1.0447, -2.1232, 0.8778, -0.8194, 0.1961, 0.1237, -0.0257, -0.2843, 2.062, -0.1463, -0.3462, 0.8561, -0.3275, 0.5219, 2.0741, -1.4375, 1.4922, -0.4392, -0.1354, 0.3518, -1.8033, 0.5764, -1.4669, -0.2385, 0.6649]),
 41314)

# Elapsed time evaluation

In [57]:
import time

vector_list = readVectorsSeq('test-datasets/vecs-50-10000.txt')
k = 10 

t0 = time.time()
C = kcenter(vector_list,k)
t1 = time.time()

print('Number of points:', len(vector_list))
print('Elapsed Time first set :', t1-t0, 's')

vector_list = readVectorsSeq('test-datasets/vecs-50-50000.txt')

t0 = time.time()
C = kcenter(vector_list,k)
t1 = time.time()

print('Number of points:', len(vector_list))
print('Elapsed Time first set :', t1-t0, 's')

vector_list = readVectorsSeq('test-datasets/vecs-50-100000.txt')

t0 = time.time()
C = kcenter(vector_list,k)
t1 = time.time()

print('Number of points:', len(vector_list))
print('Elapsed Time first set :', t1-t0, 's')

vector_list = readVectorsSeq('test-datasets/vecs-50-500000.txt')

t0 = time.time()
C = kcenter(vector_list,k)
t1 = time.time()

print('Number of points:', len(vector_list))
print('Elapsed Time first set :', t1-t0, 's')

Number of points: 9960
Elapsed Time first set : 0.45621395111083984 s
Number of points: 50047
Elapsed Time first set : 2.209425449371338 s
Number of points: 99670
Elapsed Time first set : 4.655384063720703 s
Number of points: 499950
Elapsed Time first set : 20.208259105682373 s


In [58]:
def kcenter_GC(P, k):
    P_minus_S = [p for p in P]
    idx_rnd = random.randint(0, len(P)-1)
    S = [P[idx_rnd]]
    P_minus_S.pop(idx_rnd)
    related_center_idx = [idx_rnd for i in range(len(P))]
    dist_near_center = [Vectors.squared_distance(P[i], S[0]) for i in range(len(P))]

    for i in range(k-1):    
        new_center_idx = max(enumerate(dist_near_center), key=lambda x: x[1])[0] # argmax operation

        S.append(P[new_center_idx])
        P_minus_S.remove(P[new_center_idx])

        for j in range(len(P)):
            if j != new_center_idx:
                dist = Vectors.squared_distance(P[j], S[-1])
                if dist < dist_near_center[j]:
                    dist_near_center[j] = dist
                    related_center_idx[j] = new_center_idx
            else:
                dist_near_center[j] = 0
                related_center_idx[j] = new_center_idx
    return S

In [59]:
vector_list = readVectorsSeq('test-datasets/vecs-50-10000.txt')
k = 10 

t0 = time.time()
C = kcenter_GC(vector_list,k)
t1 = time.time()

print('Number of points:', len(vector_list))
print('Elapsed Time first set :', t1-t0, 's')

vector_list = readVectorsSeq('test-datasets/vecs-50-50000.txt')

t0 = time.time()
C = kcenter_GC(vector_list,k)
t1 = time.time()

print('Number of points:', len(vector_list))
print('Elapsed Time first set :', t1-t0, 's')

vector_list = readVectorsSeq('test-datasets/vecs-50-100000.txt')

t0 = time.time()
C = kcenter_GC(vector_list,k)
t1 = time.time()

print('Number of points:', len(vector_list))
print('Elapsed Time first set :', t1-t0, 's')

vector_list = readVectorsSeq('test-datasets/vecs-50-500000.txt')

t0 = time.time()
C = kcenter_GC(vector_list,k)
t1 = time.time()

print('Number of points:', len(vector_list))
print('Elapsed Time first set :', t1-t0, 's')

Number of points: 9960
Elapsed Time first set : 0.7605226039886475 s
Number of points: 50047
Elapsed Time first set : 3.2561631202697754 s
Number of points: 99670
Elapsed Time first set : 6.447360515594482 s
Number of points: 499950
Elapsed Time first set : 34.19487452507019 s
