In [1]:
import os
from pyspark import SparkConf, SparkContext

In [2]:
import math
from itertools import islice

def readData(lines):
    """Parse raw ratings CSV text into movie-keyed records.

    Each comma-separated row is read as ``userId,movieId,rating[,...]``; any
    row whose first field is the literal header name ``userId`` is skipped.

    Output: [(movie_id, (user_id, rating)), ...]
    """
    parsed = []
    for row in lines.splitlines():
        fields = row.split(',')
        if fields[0] != 'userId':
            movieId = int(fields[1])
            userId = int(fields[0])
            rating = float(fields[2])
            parsed.append((movieId, (userId, rating)))
    return parsed


def calConsineBottom(data):
    """Compute one movie's term of the cosine-similarity denominator.

    ``data`` is ``(movie_id, [(user_id, rating), ...])``; the result is the
    Euclidean norm of the ratings (square root of the sum of their squares).

    Output: (movie_id, consineBottom)
    """
    movieId = data[0]
    sumOfSquares = 0
    # Accumulate left to right in a plain loop so float rounding stays stable.
    for userRatingPair in data[1]:
        sumOfSquares += pow(userRatingPair[1], 2)
    return (movieId, math.sqrt(sumOfSquares))
        
def convertedToUserKey(data):
    """Re-key one movie's rating list by user.

    ``data`` is ``(movie_id, [(user_id, rating), ...])``. Every rating becomes
    its own record holding a one-element list, so that records of the same
    user can later be merged by list concatenation.

    Output: [(user_id, [(movie_id, rating)]), ...]
    """
    movieId = data[0]
    return [
        (userRatingPair[0], [(movieId, userRatingPair[1])])
        for userRatingPair in data[1]
    ]


# Local-mode Spark context. getOrCreate() hands back an already-running
# context if there is one.
conf = (
    SparkConf()
    .setMaster("local")
    .setAppName("recommendation")
    .set("spark.default.parallelism", 4)
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '10G')
)
sc = SparkContext.getOrCreate(conf=conf)

# Output: (movie_id, (user_id, rating))
originalData = sc.textFile("./ratings-small.csv").flatMap(readData)

# Sum (count, rating) per movie, then divide to get the mean.
# Output: (movie_id, movie_id_avg_rating)
movieAvgs = (
    originalData
    .map(lambda record: (record[0], (1, record[1][1])))
    .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
    .map(lambda totals: (totals[0], totals[1][1] / totals[1][0]))
)

# Join each rating with its movie's mean, replace the rating by
# "rating - avg_rating", then concatenate the one-element lists per movie.
# Output: (movie_id, [(user_id1, modified_rating1), (user_id2, modified_rating2), ...])
joinedData = (
    originalData
    .join(movieAvgs)
    .map(lambda joined: (joined[0], [(joined[1][0][0], (joined[1][0][1] - joined[1][1]))]))
    .reduceByKey(lambda left, right: left + right)
)

# Output: (movie_id, consineBottom)
consineBottom = joinedData.map(calConsineBottom)

# Output: (user_id, [(movie_id1, user_rating1), (movie_id2, user_rating2)])
convertedToUserKeyData = (
    joinedData
    .flatMap(convertedToUserKey)
    .reduceByKey(lambda left, right: left + right)
)

convertedToUserKeyData.collect()


[(8, [(4, 0.6000000000000001), (6, -0.6000000000000001)]),
 (4, [(5, -0.3333333333333335), (2, 0.8333333333333335), (3, -2.0)]),
 (12, [(5, 1.6666666666666665), (2, -0.16666666666666652)]),
 (5, [(4, 1.6), (5, 0.6666666666666665), (6, 0.3999999999999999), (3, -1.0)]),
 (1, [(1, -2.6), (6, -1.6), (3, -1.0)]),
 (9, [(1, 1.4), (3, 1.0)]),
 (2, [(4, -1.4), (3, 1.0)]),
 (6, [(1, 1.4), (5, -1.3333333333333335)]),
 (10, [(2, -1.1666666666666665), (3, 0.0)]),
 (3,
  [(4, 0.6000000000000001),
   (1, -0.6000000000000001),
   (5, 0.6666666666666665),
   (6, 0.3999999999999999),
   (2, 1.8333333333333335)]),
 (11,
  [(4, -1.4),
   (1, 0.3999999999999999),
   (5, -1.3333333333333335),
   (6, 1.4),
   (2, -2.1666666666666665),
   (3, 2.0)]),
 (7, [(2, 0.8333333333333335), (3, 0.0)])]

In [4]:
def moviePairMapping(data):
    """Emit the rating product for every movie pair rated by one user.

    ``data`` is ``(user_id, [(movie_id, rating), ...])``. A pair is emitted
    only when the first movie id is strictly smaller than the second, so each
    unordered pair appears once and a movie is never paired with itself.

    Output: [((movie_id1, movie_id2), rating1 * rating2), ...]
    """
    movieRatings = data[1]
    return [
        ((first[0], second[0]), first[1] * second[1])
        for first in movieRatings
        for second in movieRatings
        if first[0] < second[0]
    ]

# Sum the per-user rating products for every movie pair.
# Output: ((movie_id1, movie_id2), summed_product)
consineTop = (
    convertedToUserKeyData
    .flatMap(moviePairMapping)
    .reduceByKey(lambda left, right: left + right)
)
consineTop.collect()

[((4, 6), -1.44),
 ((1, 5), -2.8),
 ((2, 4), 4.133333333333333),
 ((2, 3), -6.0),
 ((4, 5), 3.333333333333333),
 ((5, 6), -1.3333333333333335),
 ((3, 4), -5.8),
 ((1, 6), 4.48),
 ((1, 2), -1.9666666666666668),
 ((3, 5), -2.6666666666666665),
 ((1, 3), 4.8),
 ((2, 6), -2.2999999999999994),
 ((2, 5), 3.5555555555555554),
 ((3, 6), 4.0),
 ((1, 4), -0.9199999999999999)]

In [5]:
# sim_pair: ((sim_movie_id1, sim_movie_id2), consineTop_value)

# Drop pairs whose summed rating product is exactly zero.
consineTop = consineTop.filter(lambda simPair: simPair[1] != 0)

# Emit each pair once per member movie, then join on that movie's consineBottom.
# Output: (sim_movie1, (sim_pair, sim_movie1_consineBottom)), (sim_movie2, (sim_pair, sim_movie2_consineBottom))
coleasedConsine = (
    consineTop
    .flatMap(lambda simPair: [(simPair[0][0], simPair), (simPair[0][1], simPair)])
    .join(consineBottom)
)

# Re-key by sim_pair and multiply its two consineBottom values together, then
# divide the pair's consineTop value by that product.
# Output: ((sim_movie_id1, sim_movie_id2), consineTop/consineBottom)
finalResult = (
    coleasedConsine
    .map(lambda joined: (joined[1][0], joined[1][1]))
    .reduceByKey(lambda left, right: left * right)
    .map(lambda scored: (scored[0][0], scored[0][1] / scored[1]))
)

finalResultList = finalResult.collect()


In [6]:
# Write every similarity as "(movie_id1, movie_id2), similarity", one per line.
# The with-block closes the file even if a write raises. The loop variable is
# deliberately not called `finalResult`, so the RDD of that name stays intact.
with open("basic_output_small.txt", "w") as file1:
    for simPair in finalResultList:
        file1.write("(%d, %d), %.6f\n" % (simPair[0][0], simPair[0][1], simPair[1]))

# Preview goes last so the notebook actually displays it.
finalResultList[:10]

In [7]:
def bonusRead(lines):
    """Parse similarity lines of the form ``(id1, id2), sim``.

    The line is split on commas; the opening parenthesis, the closing
    parenthesis and the single leading space of each later field are removed
    by slicing. Pairs whose similarity is zero or negative are dropped.

    Output: [((movie_id1, movie_id2), sim), ...]
    """
    positivePairs = []
    for row in lines.splitlines():
        fields = row.split(',')
        similarity = float(fields[2][1:])
        if similarity <= 0:
            continue
        firstMovieId = int(fields[0][1:])
        secondMovieId = int(fields[1][1:-1])
        positivePairs.append(((firstMovieId, secondMovieId), similarity))
    return positivePairs

def rankingSim(data):
    """Order one movie's neighbours from most to least similar.

    ``data`` is ``(movie_id, [(sim_movie_id, sim), ...])``. The sort is
    stable, so neighbours with equal similarity keep their incoming order.

    Output: (movie_id, [(sim_movie_id, sim), ...]) sorted by sim, descending
    """
    movieId = data[0]
    rankedNeighbours = list(data[1])
    rankedNeighbours.sort(key=lambda neighbour: neighbour[1], reverse=True)
    return (movieId, rankedNeighbours)
    

# Output: ((movie_id1, movie_id2), sim), positive similarities only
bonusData = sc.textFile("./basic_output_small.txt").flatMap(bonusRead)

# List every pair under both of its movies, merge the lists per movie and
# rank each list by similarity.
# Output: (movie_id, [(sim_movie_id1, sim1), (sim_movie_id2, sim2), ...])
movieSims = (
    bonusData
    .flatMap(lambda simPair: [
        (simPair[0][0], [(simPair[0][1], simPair[1])]),
        (simPair[0][1], [(simPair[0][0], simPair[1])]),
    ])
    .reduceByKey(lambda left, right: left + right)
    .map(rankingSim)
)

# Output: (user_id, [(rated_movie_id1, rating1), (rated_movie_id2, rating2), ...])
userMoviesRating = (
    originalData
    .map(lambda record: (record[1][0], [(record[0], record[1][1])]))
    .reduceByKey(lambda left, right: left + right)
)

# Plain Python lists on the driver; the prediction step passes them into its function.
allMovieIdList = originalData.map(lambda record: record[0]).distinct().collect()
movieSimsList = movieSims.collect()

userMoviesRating.collect()

[(4, [(2, 4.0), (3, 1.0), (5, 3.0)]),
 (8, [(4, 4.0), (6, 2.0)]),
 (12, [(2, 3.0), (5, 5.0)]),
 (1, [(1, 1.0), (3, 2.0), (6, 1.0)]),
 (5, [(3, 2.0), (4, 5.0), (5, 4.0), (6, 3.0)]),
 (9, [(1, 5.0), (3, 4.0)]),
 (2, [(3, 4.0), (4, 2.0)]),
 (6, [(1, 5.0), (5, 2.0)]),
 (10, [(2, 2.0), (3, 3.0)]),
 (3, [(1, 3.0), (2, 5.0), (4, 4.0), (5, 4.0), (6, 3.0)]),
 (7, [(2, 4.0), (3, 3.0)]),
 (11, [(1, 4.0), (2, 1.0), (3, 5.0), (4, 2.0), (5, 2.0), (6, 4.0)])]

In [8]:
def predictMovieRating(data, allMovieIdList, movieSimsList, topK=10):
    """Predict ratings for the movies one user has not rated yet.

    A prediction is the similarity-weighted mean of the user's own ratings
    over at most ``topK`` neighbours of the target movie that the user has
    rated. Neighbours are visited in the order their list provides.

    Args:
        data: ``(user_id, [(movie_id, rating), ...])`` for a single user.
        allMovieIdList: iterable of every candidate movie id.
        movieSimsList: ``[(movie_id, [(sim_movie_id, sim), ...]), ...]``, or
            the equivalent dict (used as-is, so it can be built once).
        topK: maximum number of rated neighbours used per prediction.

    Returns:
        ``[((user_id, movie_id), predicted_rating), ...]``. A movie is left
        out when it has no similarity list, when none of its neighbours was
        rated by the user, or when the matched similarities sum to zero.
    """
    userId = data[0]
    ratedMovieDict = dict(data[1])  # { movie_id: rating }
    # { movie_id: [(sim_movie_id, sim), ...] }
    if isinstance(movieSimsList, dict):
        movieSimsDict = movieSimsList
    else:
        movieSimsDict = dict(movieSimsList)

    finalResult = []
    for missingMovieId in allMovieIdList:
        # Dict membership replaces the former linear scan of a rated-id list.
        if missingMovieId in ratedMovieDict:
            continue
        # A movie with no similarity list has nothing to predict from.
        if missingMovieId not in movieSimsDict:
            continue

        accumulateRating = 0  # numerator: sum(rating * sim)
        accumulateSim = 0     # denominator: sum(sim)
        count = 0
        for highSimMovie in movieSimsDict[missingMovieId]:
            if count >= topK:
                break
            highSimMovieId = highSimMovie[0]
            if highSimMovieId not in ratedMovieDict:
                continue
            highSimMovieSimilarity = highSimMovie[1]
            accumulateRating = accumulateRating + ratedMovieDict[highSimMovieId] * highSimMovieSimilarity
            accumulateSim = accumulateSim + highSimMovieSimilarity
            count = count + 1

        # Guard on the divisor, not the numerator: skip when nothing matched
        # or when the matched similarities cancel out to zero.
        if count == 0 or accumulateSim == 0:
            continue
        finalResult.append(((userId, missingMovieId), accumulateRating / accumulateSim))
    return finalResult

# Run the predictor once per user; the movie id list and similarity list are
# captured by the lambda.
# Output: ((user_id, movie_id), predicted_rating)
bonusResult = userMoviesRating.flatMap(
    lambda userRatings: predictMovieRating(userRatings, allMovieIdList, movieSimsList)
)

bonusFinalResult = bonusResult.collect()

bonusFinalResult

[((4, 4), 3.5050030051481373),
 ((4, 1), 1.0),
 ((4, 6), 1.0),
 ((8, 1), 2.0),
 ((8, 5), 4.0),
 ((8, 2), 4.0),
 ((8, 3), 2.0),
 ((12, 4), 3.989993989703726),
 ((5, 1), 2.5864072665593825),
 ((5, 2), 4.539852050768296),
 ((9, 6), 4.5368891815512935),
 ((2, 1), 4.0),
 ((2, 5), 2.0),
 ((2, 6), 4.0),
 ((2, 2), 2.0),
 ((6, 4), 2.0),
 ((6, 6), 5.0),
 ((6, 2), 2.0),
 ((6, 3), 5.0),
 ((10, 4), 2.0),
 ((10, 1), 3.0),
 ((10, 5), 2.0),
 ((10, 6), 3.0),
 ((3, 3), 2.9999999999999996),
 ((7, 4), 4.0),
 ((7, 1), 3.0),
 ((7, 5), 4.0),
 ((7, 6), 3.0)]

In [9]:
# Write every prediction as "(user_id, movie_id), rating", one per line.
# The with-block closes the file even if a write raises. The loop variable is
# deliberately not called `finalResult`, so the RDD of that name stays intact.
with open("bonus_output_small.txt", "w") as file1:
    for prediction in bonusFinalResult:
        file1.write("(%d, %d), %.2f\n" % (prediction[0][0], prediction[0][1], prediction[1]))