# Homework 4 - RL #

### Import packages used throughout the program ###

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors
import numpy as np
import random
import time
import itertools

### Homework 3 functions ###

In [2]:
def f(string):
    """Parse a line of whitespace-separated numbers into a dense vector.

    Input:  string - a line of text holding the coordinates of one point.
    Output: a DenseVector with one float per coordinate.
    Raises: ValueError if the line holds no coordinates or a token is not
            a valid float.
    """
    # split() without arguments tolerates repeated, leading and trailing
    # whitespace (including tabs), which split(' ') turns into empty tokens.
    coords = [float(token) for token in string.split()]
    if not coords:
        raise ValueError("cannot parse a vector from an empty line")
    return Vectors.dense(coords)

In [3]:
def farthest_first_traversal(P, k):
    """Return a list of k centers chosen by farthest-first traversal.

    The first center is drawn uniformly at random from P; every following
    center is the point whose squared distance from its closest already
    chosen center is largest.

    Input:  P - a list of vectors, representing points.
            k - a scalar, number of target centroids.
    Output: S - a list of vectors, representing the centers. It is empty
                when P is empty or k <= 0. When k exceeds the number of
                distinct points the result contains repeated points.
    """
    if k <= 0 or len(P) == 0:
        return []

    S = [P[random.randint(0, len(P) - 1)]]
    # Squared distance of every point from its closest center so far.
    dist_near_center = [Vectors.squared_distance(p, S[0]) for p in P]

    for _ in range(k - 1):
        new_center = P[int(np.argmax(dist_near_center))]
        S.append(new_center)
        # The new center is at distance 0 from itself, so it can never be
        # picked again while some point is still away from every center.
        dist_near_center = [
            min(best, Vectors.squared_distance(p, new_center))
            for p, best in zip(P, dist_near_center)
        ]
    return S

### Homework 4 functions ###

In [4]:
# points is a list of Vectors, k an integer
def runSequential(points, k):
    """Greedily select k points that are far apart from each other.

    k // 2 times, the pair of not-yet-selected points with the largest
    squared distance is moved into the result. When k is odd, one more
    not-yet-selected point is added, scanning from a random start index.

    Input:  points - a list of vectors, representing points.
            k - an integer, number of points to select.
    Output: a list with the selected points. `points` itself is returned
            when k >= len(points); an empty list when k <= 0.
    """
    n = len(points)
    if k >= n:
        return points
    if k <= 0:
        return []

    result = []
    candidates = np.full(n, True)

    for _ in range(k // 2):
        # Start below zero so that a pair is found even when all the
        # remaining candidates coincide (distance 0).
        maxDist = -1.0
        maxI = 0
        maxJ = 0
        for i in range(n):
            if not candidates[i]:
                continue
            # Distances are symmetric, so only pairs with j > i are needed;
            # points selected in a previous round must not be picked again.
            for j in range(i + 1, n):
                if not candidates[j]:
                    continue
                d = Vectors.squared_distance(points[i], points[j])
                if d > maxDist:
                    maxDist = d
                    maxI = i
                    maxJ = j
        result.append(points[maxI])
        result.append(points[maxJ])
        candidates[maxI] = False
        candidates[maxJ] = False

    if k % 2 != 0:
        s = np.random.randint(n)
        for i in range(n):
            j = (i + s) % n
            if candidates[j]:
                # j (not i) is the index that was checked to be still free.
                result.append(points[j])
                break

    return result

# Smoke test: run runSequential on random points with integer coordinates.
input_length = 100  # number of random points
k = 11              # number of points to select
d = 3               # coordinates per point
m = 10              # coordinates are drawn from [0, m)

test = [
    Vectors.dense([np.random.randint(m) for _ in range(d)])
    for _ in range(input_length)
]

print("Input of size ", str(len(test)), " with k = ", str(k))
print("Selected: ", str(len(runSequential(test, k))))

Input of size  100  with k =  11
Selected:  11


In [17]:
def runMapReduce(pointsrdd, k, numBlocks):
    """Solve diversity maximization with a coreset built partition by partition.

    Each partition extracts k centers with farthest_first_traversal; the
    union of those centers (the coreset) is collected on the driver and
    handed to runSequential.

    Input:  pointsrdd - an RDD of vectors, representing points.
            k - an integer, number of points to select.
            numBlocks - an integer, number of partitions to work on.
    Output: the list of points returned by runSequential on the coreset.
    """
    # Honor numBlocks even when the caller did not partition the RDD that
    # way, instead of indexing partitions that may not exist.
    if pointsrdd.getNumPartitions() != numBlocks:
        pointsrdd = pointsrdd.repartition(numBlocks)

    def extract_centers(partition):
        block = list(partition)
        # An empty partition contributes nothing to the coreset.
        return farthest_first_traversal(block, k) if block else []

    # Only the extracted centers travel to the driver, not the whole dataset.
    coreset = pointsrdd.mapPartitions(extract_centers).collect()

    return runSequential(coreset, k)

In [20]:
def measure(pointslist):
    """Return the average squared distance over all pairs of points.

    Input:  pointslist - a list of vectors, representing points.
    Output: the sum of the squared distances of all unordered pairs divided
            by the number of pairs, N * (N - 1) / 2. It is 0.0 when there
            are fewer than two points, since there is no pair to average.
    """
    N = len(pointslist)
    if N < 2:
        return 0.0

    total = 0.
    for i, point in enumerate(pointslist):
        # Each unordered pair is visited once: point i with every earlier one.
        for earlier in pointslist[:i]:
            total += Vectors.squared_distance(point, earlier)
    return 2 * total / (N * (N - 1))

### main program ###

In [6]:
# spark setup
# getOrCreate() reuses the SparkContext that is already running (if any), so
# executing this cell again does not fail because a context already exists;
# a new context is built from conf only when none is running yet.
conf = SparkConf().setAppName('HW4').setMaster('local')
sc = SparkContext.getOrCreate(conf=conf)

In [18]:
# Input dataset and problem parameters.
datafile = "test-datasets/vecs-50-10000.txt"
numBlocks = 4
k = 2

# Read the file into an RDD: parse every line with f, spread the vectors over
# numBlocks partitions and cache the RDD.
inputrdd = (
    sc.textFile(datafile)
    .map(f)
    .repartition(numBlocks)
    .cache()
)
print("Number of vectors in RDD: ", inputrdd.count())

# Solve the diversity maximization problem.
centers = runMapReduce(inputrdd, k, numBlocks)

Number of vectors in RDD:  9960


In [19]:
# Show the points selected by runMapReduce.
print(centers)

[DenseVector([1.7341, -0.1707, -0.8984, 0.146, 0.964, 0.9647, -0.6091, -0.5369, 0.0491, -0.8059, 0.8495, 1.018, 1.3986, 0.1924, 0.7402, 0.1298, 0.1661, 0.1978, -1.1801, 0.2801, -1.7753, -1.9853, 1.0203, 0.7061, 0.2972, -0.3251, -0.7344, -0.0868, -0.362, 0.6731, 1.355, -2.0711, 0.3869, 0.008, 0.4489, -0.5733, -0.4192, -0.2218, -0.6991, -0.3281, -0.8162, -1.0147, 0.3826, 1.1174, 0.0809, 0.3916, -0.3399, -0.9178, -0.2112, -0.6982]), DenseVector([-0.1434, -0.0787, 0.197, 0.798, 0.1972, 1.2935, 0.3461, -0.3515, -1.1286, -0.1181, 0.72, 0.3255, -0.4646, 0.4922, 0.3492, 0.0196, 0.0946, 1.2648, -0.4874, 0.455, 0.3234, -0.0297, -0.8326, -0.9728, 1.3394, 0.5714, 1.7682, -0.1891, -0.5561, -0.7951, -0.76, -0.4335, 0.1538, 0.0521, -0.0229, -0.0343, -0.0234, 0.0391, 0.0093, -0.407, -0.9199, -1.0007, 0.4498, -1.5818, 1.7922, -0.0502, 0.9155, 0.998, 0.7148, 0.1417])]


In [21]:
# Average pairwise squared distance among the selected points.
result = measure(centers)

In [23]:
# Show the value computed by measure() for the selected points.
print(result)

65.0677326240699
