In [1]:
import os
from pyspark import SparkConf, SparkContext

In [2]:
import math
from itertools import islice

# Output : (movie_id, (user_id, rating))
def readData(lines):
    output = []
    for perLine in lines.splitlines():
        splitedItems = perLine.split(',')
        if (splitedItems[0] == 'userId'): continue
        output.append((int(splitedItems[1]), (int(splitedItems[0]), float(splitedItems[2]))))

    return output


# Output: (movie_id, consineBottom)
def calConsineBottom(data):
    ratingSum = 0
    for userRatingTriple in data[1]:
        userRating = userRatingTriple[1]
        ratingSum = ratingSum + pow(userRating, 2)
    squaredRatingSum = math.sqrt(ratingSum)
    return (data[0], squaredRatingSum)
        
# Output: (user_id, [(movie_id, user_rating)])
def convertedToUserKey(data):
    expandedData = []
    movieId = data[0]
    for userRatingTriple in data[1]:
        userId = userRatingTriple[0]
        userRating = userRatingTriple[1]
        expandedData.append((userId, [(movieId, userRating)]))
    return expandedData


conf = SparkConf().setMaster("local").setAppName("recommendation").set("spark.default.parallelism", 4).set('spark.driver.memory', '45G').set('spark.driver.maxResultSize', '10G')
sc = SparkContext.getOrCreate(conf=conf)
originalData = sc.textFile("./ratings.csv").flatMap(readData)

# Output: (movie_id, movie_id_avg_rating)
movieAvgs = originalData.map(lambda data: (data[0], (1, data[1][1]))).reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])).map(lambda data: (data[0], data[1][1]/data[1][0]))

# After join, reducer do "some_rating - avg_rating"
# Output: (movie_id, [(user_id, modified_rating)])
joinedData = originalData.join(movieAvgs).map(lambda data: (data[0], [(data[1][0][0], (data[1][0][1] - data[1][1]))]))

# Output: (movie_id, [(user_id1, modified_rating1), (user_id2, modified_rating2), ...])
joinedData = joinedData.reduceByKey(lambda x, y: x+y)

consineBottom = joinedData.map(calConsineBottom)
# Output: (user_id, [(movie_id1, user_rating1), (movie_id2, user_rating2)])
convertedToUserKeyData = joinedData.flatMap(convertedToUserKey).reduceByKey(lambda x, y: x + y)

convertedToUserKeyData.collect()


[(40,
  [(216, -2.326530612244898),
   (296, 0.8029315960912049),
   (316, -0.375),
   (592, 0.5714285714285716),
   (588, 0.2076502732240435),
   (300, 1.4814814814814814),
   (344, 0.9596273291925468),
   (364, 1.058139534883721),
   (380, -0.4971910112359552),
   (60, 0.7647058823529411),
   (104, -2.4393939393939394),
   (168, -1.0833333333333335),
   (208, -0.9130434782608696),
   (224, 0.3875000000000002),
   (236, 0.5222222222222221),
   (252, 0.5465116279069768),
   (256, -0.8285714285714287),
   (292, -0.4257425742574257),
   (312, 1.5555555555555556),
   (432, 0.3545454545454545),
   (440, 0.45394736842105265),
   (616, -0.411111111111111),
   (44, 0.4565217391304346),
   (376, -0.25),
   (172, 1.3207547169811322),
   (48, -1.1470588235294117),
   (272, 1.2419354838709675),
   (308, 0.9749999999999996),
   (228, 0.6666666666666665),
   (1, 1.0790697674418603),
   (457, -0.9921052631578946),
   (553, 1.1846153846153844),
   (21, 0.50561797752809),
   (265, 1.0606060606060606),

In [3]:
# Output: ((movie_id1, movie_id2), some_rating)
def moviePairMapping(data):
    exportedData = []
    for OuterMovieRatingPair in data[1]:
        for innerMovieRatingPair in data[1]:
            firstMovieId = OuterMovieRatingPair[0]
            secondMovieId = innerMovieRatingPair[0]            
            if (firstMovieId >= secondMovieId): continue

            firstMovieUserRating = OuterMovieRatingPair[1]
            secondMovieUserRating = innerMovieRatingPair[1]
            productRating = firstMovieUserRating*secondMovieUserRating

            exportedData.append(((firstMovieId, secondMovieId), productRating))

    return exportedData

consineTop = convertedToUserKeyData.flatMap(moviePairMapping).reduceByKey(lambda x, y: x + y)
consineTop.collect()

[((60, 230), 4.221813725490197),
 ((104, 514), -0.21969696969697017),
 ((172, 330), 0.5),
 ((172, 514), -0.43317610062892997),
 ((205, 421), -0.10416666666666685),
 ((273, 317), 2.00297619047619),
 ((281, 685), 1.2142857142857144),
 ((317, 609), 1.4497354497354489),
 ((126, 344), 0.9192546583850936),
 ((34, 376), 0.5654296875),
 ((146, 300), -0.49382716049382736),
 ((230, 272), 2.122983870967741),
 ((514, 616), -1.0046296296296298),
 ((223, 371), 4.583791208791209),
 ((19, 31), -3.1136363636363638),
 ((327, 339), 1.7797118847539015),
 ((339, 491), -1.1302521008403354),
 ((107, 351), 3.877828054298643),
 ((107, 159), 0.36401098901098916),
 ((107, 339), 0.9874411302982731),
 ((107, 371), 0.5659340659340659),
 ((216, 2470), 3.041353383458646),
 ((216, 7458), 0.31324404761904767),
 ((216, 442), 2.1309523809523805),
 ((216, 2746), 2.6496598639455793),
 ((216, 4270), 1.9151927437641723),
 ((216, 41566), -0.7514812376563541),
 ((216, 30810), 3.125206839492553),
 ((260, 1258), 2.33110676559815

In [4]:
# sim_pair: ((sim_movie_id1, sim_movie_id2), similarity)

consineTop = consineTop.filter(lambda data: data[1] != 0)

# Output: (sim_movie1, ((sim_pair), sim_movie1_consineBottom)), (sim_movie2, ((sim_pair), sim_movie2_consineBottom))
coleasedConsine = consineTop.flatMap(lambda data: [(data[0][0], data), (data[0][1], data)]).join(consineBottom)

# Output: ((sim_pair), sim_movie1_consineBottom), ((sim_pair), sim_movie2_consineBottom) =>  ((sim_pair), sim_movie1_consineBottom*sim_movie2_consineBottom)
finalResult = coleasedConsine.map(lambda data: (data[1][0], data[1][1])).reduceByKey(lambda x, y: x*y)

# Output: ((sim_movie_id1, sim_movie_id2), consineTop/consineBottom)
finalResult = finalResult.map(lambda data: (data[0][0], data[0][1]/data[1]))

finalResultList = finalResult.collect()


In [7]:
finalResultList[:10]
file1 = open("basic_output.txt","w") 
for finalResult in finalResultList:
    file1.write("(%d, %d), %.6f\n"%(finalResult[0][0], finalResult[0][1], finalResult[1])) 
file1.close() #to change file access modes

In [8]:
# Output: ((movie_id1, movie_id2), sim)
def bonusRead(lines):
    output = []
    for perLine in lines.splitlines():
        splitedItems = perLine.split(',')
        sim = float(splitedItems[2][1:])
        if sim <= 0: continue
        output.append(((int(splitedItems[0][1:]), int(splitedItems[1][1:-1])), sim))
    return output

def rankingSim(data):
    simMovieList = data[1]
    sortedSimMovieList = sorted(simMovieList, key = lambda data: data[1], reverse=True)
    return (data[0], sortedSimMovieList)
    

bonusData = sc.textFile("./basic_output.txt").flatMap(bonusRead)

# Output: (movie_id, [(sim_movie_id1, sim1), (sim_movie_id2, sim2), ...])
movieSims = bonusData.flatMap(lambda data: [(data[0][0], [(data[0][1], data[1])]), (data[0][1], [(data[0][0], data[1])])]).reduceByKey(lambda x, y: x+y).map(rankingSim)

# Output: (user_id, [(rated_movie_id1, rating1), (rated_movie_id2, rating2), ...])
userMoviesRating = originalData.map(lambda data: (data[1][0], [(data[0], data[1][1])])).reduceByKey(lambda x, y: x+y)

allMovieIdList = originalData.map(lambda data: data[0]).distinct().collect()
movieSimsList = movieSims.collect()

userMoviesRating.collect()

[(4,
  [(21, 3.0),
   (32, 2.0),
   (45, 3.0),
   (47, 2.0),
   (52, 3.0),
   (58, 3.0),
   (106, 4.0),
   (125, 5.0),
   (126, 1.0),
   (162, 5.0),
   (171, 3.0),
   (176, 5.0),
   (190, 2.0),
   (215, 5.0),
   (222, 1.0),
   (232, 5.0),
   (235, 2.0),
   (247, 3.0),
   (260, 5.0),
   (265, 5.0),
   (296, 1.0),
   (319, 5.0),
   (342, 5.0),
   (345, 4.0),
   (348, 4.0),
   (351, 3.0),
   (357, 3.0),
   (368, 4.0),
   (417, 2.0),
   (441, 1.0),
   (450, 2.0),
   (457, 5.0),
   (475, 5.0),
   (492, 5.0),
   (509, 1.0),
   (538, 5.0),
   (539, 1.0),
   (553, 2.0),
   (588, 4.0),
   (593, 5.0),
   (595, 3.0),
   (599, 2.0),
   (608, 5.0),
   (648, 3.0),
   (708, 4.0),
   (759, 3.0),
   (800, 4.0),
   (892, 4.0),
   (898, 5.0),
   (899, 4.0),
   (902, 4.0),
   (904, 4.0),
   (908, 5.0),
   (910, 5.0),
   (912, 5.0),
   (914, 5.0),
   (919, 5.0),
   (920, 5.0),
   (930, 5.0),
   (937, 3.0),
   (1025, 4.0),
   (1046, 5.0),
   (1057, 3.0),
   (1060, 2.0),
   (1073, 4.0),
   (1077, 5.0),
   (1

In [9]:
def predictMovieRating(data, allMovieIdList, movieSimsList):
    ratedMovieIdList = [ pair[0] for pair in data[1] ] # Pure ids
    missingRatedMovieIds = [item for item in allMovieIdList if item not in ratedMovieIdList] # Pure ids
    movieSimsDict = dict(movieSimsList) # { movie_id: [(sim_movie_id, sim), ...]}
    ratedMovieDict = dict(data[1]) # { movie_id: rating }
    
    finalResult = []

    for missingMovieId in missingRatedMovieIds:
        if (missingMovieId not in movieSimsDict): continue # If some movie has no any relationship with others (such as negative relationship we preprocess before), we don't need to calculate for them
        highSimMovies = movieSimsDict[missingMovieId]
        accumulateSim = 0 # Lower
        accumulateRating = 0 # Upper
        count = 0
        for highSimMovie in highSimMovies:
            highSimMovieId = highSimMovie[0]
            highSimMovieSimilarity = highSimMovie[1]
            if (highSimMovieId in ratedMovieDict):
                # RATED BEFORE
                accumulateRating = accumulateRating + ratedMovieDict[highSimMovieId]*highSimMovieSimilarity
                accumulateSim = accumulateSim + highSimMovieSimilarity
                count = count + 1
            if (count >= 10): break
        # If all the ratings the user predicted don't match any highly similiar movies' rating, we don't predict the rating for him
        if (accumulateRating != 0): finalResult.append(((data[0], missingMovieId), (accumulateRating/accumulateSim)))
    return finalResult

bonusResult = userMoviesRating.flatMap(lambda x: predictMovieRating(x, allMovieIdList, movieSimsList))

bonusFinalResult = bonusResult.collect()

bonusFinalResult

[((200, 216), 3.466506570298972),
 ((200, 316), 3.623181243487894),
 ((200, 552), 4.225043904316109),
 ((200, 592), 3.7276614020579792),
 ((200, 596), 3.478039579116705),
 ((200, 608), 4.0715460137901385),
 ((200, 804), 3.4951595376353066),
 ((200, 940), 3.928306153478798),
 ((200, 1024), 3.5430633036741748),
 ((200, 1032), 3.25235049207246),
 ((200, 1060), 3.6037041205436093),
 ((200, 1080), 4.153129586388532),
 ((200, 1092), 3.751171803151688),
 ((200, 1136), 3.8374278421099555),
 ((200, 1208), 3.282766225358388),
 ((200, 1224), 4.095719648483605),
 ((200, 1240), 3.844832077552504),
 ((200, 1256), 4.1443384750004),
 ((200, 1348), 3.657893945615323),
 ((200, 1396), 3.8291433710843137),
 ((200, 1408), 3.2759290093443765),
 ((200, 1552), 3.3546622308384553),
 ((200, 1620), 3.7373616038017),
 ((200, 1644), 3.919714856235803),
 ((200, 1732), 3.3491369355595255),
 ((200, 1804), 3.8063063334084823),
 ((200, 1920), 3.8316773330874927),
 ((200, 2000), 3.8858969558288154),
 ((200, 2028), 4.193

In [10]:
file1 = open("bonus_output.txt","w") 
for finalResult in bonusFinalResult:
    file1.write("(%d, %d), %.2f\n"%(finalResult[0][0], finalResult[0][1], finalResult[1])) 
file1.close() #to change file access modes