In [1]:
# Make necessary imports
from pyspark import SparkConf
from pyspark.context import SparkContext

In [11]:
# Create the notebook-wide SparkContext, or reuse the one that already exists,
# with the master set to "local[*]".
sc = SparkContext.getOrCreate(
    conf=SparkConf().setMaster("local[*]"),
)

In [49]:
class MRAlgorithm:
    """Spark RDD pipeline that clusters users and prepares the RM2 inputs.

    Every line of the input file is parsed into an ``(item, user, rating)``
    tuple of ints. The methods are meant to be called in this order, each one
    storing its results as attributes on the instance:

        compute_H() -> assign_clusters() -> RM2_distribution()

    The functions handed to Spark only close over local variables, never over
    ``self``: the instance holds the SparkContext, which must not be shipped
    to the workers.
    """

    def __init__(self, datapath):
        """Attach to a SparkContext and lazily load the ratings file.

        Args:
            datapath: Path of a text file holding one comma-separated record
                per line whose first three fields are integers
                (item, user, rating).
        """
        self.sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

        def parse(line):
            # Split once per line instead of once per extracted field.
            fields = line.split(',')
            return (int(fields[0]), int(fields[1]), int(fields[2]))

        self.input = self.sc.textFile(datapath).map(parse)

    def compute_W(self):
        """Placeholder for the computation of W; currently it only prints."""
        print("Compute W")

    def compute_H(self):
        """Placeholder for the computation of H.

        Prints a marker and stores a hard-coded dummy ``self.H``: an RDD of
        ``(user, [probability per cluster])`` pairs, so that the following
        steps can be exercised.
        """
        print("Compute H")

        # Define dummy H to continue with next operations
        self.H = self.sc.parallelize([
            (1, [0.2, 0.3, 0.5]),
            (2, [0.1, 0.7, 0.2]),
            (3, [0.4, 0.3, 0.3]),
            (4, [0.5, 0.1, 0.4]),
        ])

    def assign_clusters(self):
        """Assign each user to its most probable cluster and size the clusters.

        Requires ``self.H`` (set by ``compute_H``). Sets:
            clusters: RDD of ``(user, cluster_index)``.
            clustersizes: RDD of ``(cluster_index, number_of_users)``.
        """
        # M10: Map user to cluster with highest probability (first index wins ties)
        self.clusters = self.H.map(lambda x: (x[0], x[1].index(max(x[1]))))

        # M11: Emit a 1 for each user that is in a cluster
        # R11: Count the number of users per cluster
        self.clustersizes = (self.clusters
                             .map(lambda x: (x[1], 1))
                             .reduceByKey(lambda x, y: x + y))

    def RM2_distribution(self):
        """Build the per-user, per-item and per-cluster inputs of the RM2 step.

        Requires ``self.clusters`` and ``self.clustersizes`` (set by
        ``assign_clusters``). Sets:
            ratings: RDD of ``(user, sum of the user's ratings)``.
            items: RDD of ``(item, item rating sum / sum of all ratings)``.
            input2cluster: RDD of ``(cluster, (user, item, rating))``.
            ratings2cluster: RDD of ``(cluster, (user, rating sum))``.
            fullClusters: join of the two RDDs above on the cluster key.

        The cluster lookup uses plain dict indexing, so evaluating
        ``input2cluster`` / ``ratings2cluster`` raises ``KeyError`` for any
        user id that is missing from ``self.clusters``.
        """
        #M12: Emit each rating that a user gave
        #R12: Sum ratings given by user
        self.ratings = (self.input
                        .map(lambda x: (x[1], x[2]))
                        .reduceByKey(lambda x, y: x + y))

        #Sum of all rating values, broadcast to be used in the next reduce
        total = self.sc.broadcast(self.ratings.values().sum())

        #M13: Emit each rating that an item received
        #R13: Sum the ratings per item, then divide by the overall sum to
        #     normalize them into probabilities of the item in the collection
        self.items = (self.input
                      .map(lambda x: (x[0], x[2]))
                      .reduceByKey(lambda x, y: x + y)
                      .map(lambda x: (x[0], x[1] / total.value)))

        #Broadcast all users and which cluster they are in
        clusters = self.sc.broadcast(self.clusters.collectAsMap())

        # A Broadcast object is not subscriptable; the dict lives in `.value`.
        #M14-a: Map each rating from input to the cluster assigned to user
        self.input2cluster = self.input.map(
            lambda x: (clusters.value[x[1]], (x[1], x[0], x[2])))

        #M14-b: Map sum of ratings of a user to their assigned cluster
        self.ratings2cluster = self.ratings.map(
            lambda x: (clusters.value[x[0]], (x[0], x[1])))

        #Repartition join these two maps by cluster
        self.fullClusters = self.input2cluster.join(self.ratings2cluster)

        #Broadcast sizes of the clusters and probabilities of each item.
        #Neither is consumed yet: the final reduce that combines all
        #information from above still needs W and H.
        clustersizes = self.sc.broadcast(self.clustersizes.collectAsMap())
        items = self.sc.broadcast(self.items.collectAsMap())

In [50]:
# Run the pipeline on the small sample dataset. The call order matters:
# compute_H sets H, assign_clusters derives clusters/clustersizes from H,
# and RM2_distribution reads both of those.
Algorithm = MRAlgorithm('../data/small_dataset.txt')
Algorithm.compute_H()
Algorithm.assign_clusters()
Algorithm.RM2_distribution()

Compute H


In [51]:
# Peek at the first 10 parsed (item, user, rating) records.
Algorithm.input.take(10)

[(1, 1488844, 3),
 (1, 822109, 5),
 (1, 885013, 4),
 (1, 30878, 4),
 (1, 823519, 3),
 (1, 893988, 3),
 (1, 124105, 4),
 (1, 1248029, 3),
 (1, 1842128, 4),
 (1, 2238063, 3)]

In [52]:
# User -> assigned cluster index, collected to the driver as a dict.
Algorithm.clusters.collectAsMap()

{1: 2, 2: 1, 3: 0, 4: 0}

In [53]:
# (cluster index, number of users in that cluster) pairs.
Algorithm.clustersizes.collect()

[(0, 2), (1, 1), (2, 1)]

In [54]:
# First 5 (user, sum of that user's ratings) pairs.
Algorithm.ratings.take(5)

[(1488844, 9), (30878, 8), (893988, 3), (1842128, 4), (2207774, 5)]

In [55]:
# First 5 (item, item rating sum divided by the sum of all ratings) pairs.
Algorithm.items.take(5)

[(2, 0.0031263442977540004),
 (4, 0.002356875836872686),
 (6, 0.019042829705117875),
 (8, 0.2881568503898842),
 (10, 0.004798574968645675)]