# Homework 4 - RL #

### Import packages used throughout the program ###

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors
import numpy as np
import random
import time
import itertools

### Homework 3 functions ###

In [2]:
def f(string):
    """Parse one text line of whitespace-separated numbers into a dense vector.

    Input:  string - a text line holding the coordinates of one point.
    Output: a dense vector with one float per coordinate.
    Raises: ValueError if the line is blank or a token is not a valid float.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing blanks; split(' ') would turn those into '' tokens
    # and make float('') fail on otherwise valid lines.
    tokens = string.split()
    if not tokens:
        raise ValueError("cannot parse a point from a blank line")
    return Vectors.dense([float(coord) for coord in tokens])

In [3]:
def farthest_first_traversal(P, k):
    """Select k centers from P with the farthest-first traversal heuristic.

    The first center is drawn uniformly at random from P; every following
    center is the point whose squared distance to its nearest already-chosen
    center is the largest.

    Input:  P - a list of vectors, representing points.
            k - a scalar, number of target centroids.
    Output: S - a list of vectors, representing the centers. Empty when P is
                empty or k < 1.
    """
    # Nothing to select: avoids random.randint(0, -1) on an empty list and
    # returning one center when zero were requested.
    if len(P) == 0 or k < 1:
        return []

    S = [P[random.randint(0, len(P) - 1)]]
    # dist_near_center[j] = squared distance from P[j] to its closest center.
    dist_near_center = [Vectors.squared_distance(p, S[0]) for p in P]

    for _ in range(k - 1):
        new_center = P[np.argmax(dist_near_center)]
        S.append(new_center)
        # Only the newly added center can shrink a point's nearest-center
        # distance; the new center itself drops to distance 0.
        dist_near_center = [
            min(old, Vectors.squared_distance(p, new_center))
            for p, old in zip(P, dist_near_center)
        ]
    return S

### Homework 4 functions ###

In [4]:
# points is a list of Vectors , k an integer
def runSequential(points, k):
    """Greedily select k points of `points` that are far from each other.

    Repeatedly picks the pair of not-yet-selected points with the largest
    squared distance and adds both to the result. When k is odd, one more
    not-yet-selected point is added, starting the search from a random index.

    Input:  points - a list of Vectors.
            k      - an integer, number of points to select.
    Output: a list of k points; `points` itself is returned when k >= len(points).
    """
    n = len(points)
    if k >= n:
        return points

    result = list()
    candidates = np.full(n, True)

    for _ in range(k // 2):
        # Start below any real squared distance so a pair is always chosen,
        # even when all remaining candidates coincide (distance 0).
        maxDist = -1.0
        maxI = -1
        maxJ = -1
        for i in range(n):
            if not candidates[i]:
                continue
            # The distance is symmetric, so only j > i needs to be examined;
            # both endpoints must still be unselected.
            for j in range(i + 1, n):
                if not candidates[j]:
                    continue
                d = Vectors.squared_distance(points[i], points[j])
                if d > maxDist:
                    maxDist = d
                    maxI = i
                    maxJ = j
        result.append(points[maxI])
        result.append(points[maxJ])
        candidates[maxI] = False
        candidates[maxJ] = False

    if k % 2 != 0:
        # Odd k: add the first still-unselected point found when scanning
        # cyclically from a random offset.
        s = np.random.randint(n)
        for i in range(n):
            j = (i + s) % n
            if candidates[j]:
                result.append(points[j])
                break

    return result

# Smoke test for runSequential on a small random input.
input_length = 100
k = 11
d = 3
m = 10

# Build input_length random points with d integer coordinates each.
test = []
for i in range(input_length):
    coords = [np.random.randint(m) for _ in range(d)]
    test.append(Vectors.dense(coords))

# The number of selected points is expected to equal k.
print("Input of size ", str(len(test)), " with k = ", str(k))
print("Selected: ", str(len(runSequential(test, k))))

Input of size  100  with k =  11
Selected:  11


In [17]:
def runMapReduce(pointsrdd, k, numBlocks):
    """Solve diversity maximization with a coreset built per partition.

    Each of the numBlocks partitions contributes up to k centers chosen by
    farthest_first_traversal; the centers are gathered on the driver and
    runSequential extracts the final k points from that coreset.

    Input:  pointsrdd - an RDD of vectors, representing points.
            k         - an integer, number of points to select.
            numBlocks - an integer, number of partitions to work on.
    Output: a list of points selected from the coreset by runSequential.
    """
    # Make sure the data is really split into numBlocks partitions, so no
    # partition is ignored and none is assumed to exist when it does not.
    if pointsrdd.getNumPartitions() != numBlocks:
        pointsrdd = pointsrdd.repartition(numBlocks)

    def block_centers(block_iter):
        # Runs on the executors: only the selected centers travel back to
        # the driver, not the whole dataset.
        block = list(block_iter)
        if not block:
            return []
        # A block smaller than k cannot provide k distinct centers.
        return farthest_first_traversal(block, min(k, len(block)))

    coreset = pointsrdd.mapPartitions(block_centers).collect()

    return runSequential(coreset, k)

In [24]:
def measure(pointslist):
    """Return the average Euclidean distance over all unordered pairs of points.

    Input:  pointslist - a list of vectors, representing points.
    Output: the sum of pairwise distances divided by the number of pairs,
            N * (N - 1) / 2; 0.0 when there are fewer than two points.
    """
    N = len(pointslist)
    # With fewer than two points there is no pair; avoid dividing by zero.
    if N < 2:
        return 0.0

    total = 0.
    for i in range(N):
        for j in range(i):
            total += np.sqrt(Vectors.squared_distance(pointslist[i], pointslist[j]))
    return 2 * total / (N * (N - 1))

### main program ###

In [6]:
# spark setup
conf = SparkConf().setAppName('HW4').setMaster('local')
# getOrCreate reuses a context that is already running, so re-executing this
# cell in a live kernel does not fail; an existing context keeps its own
# configuration and `conf` is only used when a new one is created.
sc = SparkContext.getOrCreate(conf=conf)

In [25]:
# Input file and problem parameters.
datafile = "test-datasets/vecs-50-10000.txt"
numBlocks = 4
k = 2

# Read the file into an RDD of vectors split into numBlocks partitions,
# cached because it is traversed more than once.
inputrdd = (
    sc.textFile(datafile)
      .map(f)
      .repartition(numBlocks)
      .cache()
)
print("Number of vectors in RDD: ", inputrdd.count())

# Solve the diversity maximization problem.
centers = runMapReduce(inputrdd, k, numBlocks)

Number of vectors in RDD:  9960


In [26]:
# Show the points returned by runMapReduce.
print(centers)

[DenseVector([1.5739, 0.0218, -0.9746, 0.4599, 0.8992, 1.0606, -0.5973, -1.367, 0.5991, -1.0749, 0.614, 0.9212, 1.3128, -0.2731, 0.7109, -0.0931, 0.3146, 0.5643, -1.2732, -0.536, -1.8741, -1.9, 1.062, 0.4516, 0.2742, -0.1369, -0.2419, 0.478, -0.0766, 0.3725, 1.4715, -1.4603, 0.0066, 0.0069, 0.0864, -0.4083, -0.5563, -0.3979, -0.7668, -0.4354, -0.4895, -1.0124, 0.4554, 0.5133, 0.9186, 0.3739, -0.0437, -0.7758, 0.0282, -0.2551]), DenseVector([-0.1114, -1.4935, -0.8168, -0.0934, -0.3292, -0.195, 1.0956, -0.2271, 0.4648, -0.1134, -0.6902, -1.4922, 1.1978, -1.6225, -0.528, 0.2493, 0.6901, 1.2245, 0.4057, 0.5572, 0.3419, -0.7044, 0.0581, 1.1519, 0.2402, 0.0152, -0.3314, 0.3877, -0.7477, -0.772, -0.7829, -0.4076, -0.0792, 0.1604, 0.8697, 0.0295, -0.107, 1.0291, -0.5838, 0.6497, -0.2394, 0.7285, -0.3843, -0.3338, -0.7964, 0.5792, 0.1143, -0.0757, 0.1913, -0.2007])]


In [27]:
# Average pairwise distance among the selected points.
result = measure(centers)

In [28]:
# Report the value computed by measure for the selected points.
print(result)

7.577738326355545
