In [20]:
import os
import csv
import sys
import re

In [2]:
from surprise import Dataset
from surprise import Reader

In [3]:
from collections import defaultdict
import numpy as np

In [14]:
class MovieLens:

    movieID_to_name = {}
    name_to_movieID = {}
    
    ratingsPath = 'C:/Users/Laci/anaconda_envs/RecSys/ml-latest-small/ratings.csv'
    moviesPath = 'C:/Users/Laci/anaconda_envs/RecSys/ml-latest-small/movies.csv'
    
    def loadMovieLensLatestSmall(self):
        ratingsDataset = 0
        self.movieID_to_name = {}
        self.name_to_movieID = {}
        
        # userId movieId rating timestamp
        reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)

        ratingsDataset = Dataset.load_from_file(self.ratingsPath, reader=reader)

        with open(self.moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
                movieReader = csv.reader(csvfile)
                next(movieReader)  #Skip header line
                
                for row in movieReader:
                    movieID = int(row[0])
                    movieName = row[1]
                    self.movieID_to_name[movieID] = movieName
                    self.name_to_movieID[movieName] = movieID

        return ratingsDataset
    

    def getUserRatings(self, user):
        userRatings = []
        hitUser = False
        
        with open(self.ratingsPath, newline='') as csvfile:
            ratingReader = csv.reader(csvfile)
            next(ratingReader)
            
            for row in ratingReader:
                userID = int(row[0])
                if (user == userID):
                    movieID = int(row[1])
                    rating = float(row[2])
                    userRatings.append((movieID, rating))
                    hitUser = True
                if (hitUser and (user != userID)):
                    break

        return userRatings

    
    def getPopularityRanks(self):
        ratings = defaultdict(int)
        rankings = defaultdict(int)
        
        with open(self.ratingsPath, newline='') as csvfile:
            ratingReader = csv.reader(csvfile)
            next(ratingReader)
            
            for row in ratingReader:
                movieID = int(row[1])
                ratings[movieID] += 1
                
        rank = 1
        for movieID, ratingCount in sorted(ratings.items(), key=lambda x: x[1], reverse=True):
            rankings[movieID] = rank
            rank += 1
            
        return rankings
  

    def getGenres(self):
        genres = defaultdict(list)
        genreIDs = {}
        maxGenreID = 0
        
        with open(self.moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
            movieReader = csv.reader(csvfile)
            next(movieReader)  #Skip header line
            
            for row in movieReader:
                movieID = int(row[0])
                genreList = row[2].split('|')
                genreIDList = []
                for genre in genreList:
                    if genre in genreIDs:
                        genreID = genreIDs[genre]
                    else:
                        genreID = maxGenreID
                        genreIDs[genre] = genreID
                        maxGenreID += 1
                        
                    genreIDList.append(genreID)
                    
                genres[movieID] = genreIDList
                
        # Convert integer-encoded genre lists to bitfields that we can treat as vectors
        for (movieID, genreIDList) in genres.items():
            bitfield = [0] * maxGenreID
            
            for genreID in genreIDList:
                bitfield[genreID] = 1
                
            genres[movieID] = bitfield            
        
        return genres
    
    
    def getYears(self):
        p = re.compile(r"(?:\((\d{4})\))?\s*$")
        years = defaultdict(int)
        
        with open(self.moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
            movieReader = csv.reader(csvfile)
            next(movieReader)
            
            for row in movieReader:
                movieID = int(row[0])
                title = row[1]
                m = p.search(title)
                year = m.group(1)
                if year:
                    years[movieID] = int(year)
                    
        return years
    
    
    def getMovieName(self, movieID):
        if movieID in self.movieID_to_name:
            return self.movieID_to_name[movieID]
        else:
            return ""
        
    def getMovieID(self, movieName):
        if movieName in self.name_to_movieID:
            return self.name_to_movieID[movieName]
        else:
            return 0

In [15]:
ml = MovieLens()

In [16]:
data = ml.loadMovieLensLatestSmall()

In [29]:
ml.movieID_to_name

{1: 'Toy Story (1995)',
 2: 'Jumanji (1995)',
 3: 'Grumpier Old Men (1995)',
 4: 'Waiting to Exhale (1995)',
 5: 'Father of the Bride Part II (1995)',
 6: 'Heat (1995)',
 7: 'Sabrina (1995)',
 8: 'Tom and Huck (1995)',
 9: 'Sudden Death (1995)',
 10: 'GoldenEye (1995)',
 11: 'American President, The (1995)',
 12: 'Dracula: Dead and Loving It (1995)',
 13: 'Balto (1995)',
 14: 'Nixon (1995)',
 15: 'Cutthroat Island (1995)',
 16: 'Casino (1995)',
 17: 'Sense and Sensibility (1995)',
 18: 'Four Rooms (1995)',
 19: 'Ace Ventura: When Nature Calls (1995)',
 20: 'Money Train (1995)',
 21: 'Get Shorty (1995)',
 22: 'Copycat (1995)',
 23: 'Assassins (1995)',
 24: 'Powder (1995)',
 25: 'Leaving Las Vegas (1995)',
 26: 'Othello (1995)',
 27: 'Now and Then (1995)',
 28: 'Persuasion (1995)',
 29: 'City of Lost Children, The (CitÃ© des enfants perdus, La) (1995)',
 30: 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 31: 'Dangerous Minds (1995)',
 32: 'Twelve Monkeys (a.k.a. 12 Monkeys) (19

In [28]:
ml.name_to_movieID

{'Toy Story (1995)': 1,
 'Jumanji (1995)': 2,
 'Grumpier Old Men (1995)': 3,
 'Waiting to Exhale (1995)': 4,
 'Father of the Bride Part II (1995)': 5,
 'Heat (1995)': 6,
 'Sabrina (1995)': 7,
 'Tom and Huck (1995)': 8,
 'Sudden Death (1995)': 9,
 'GoldenEye (1995)': 10,
 'American President, The (1995)': 11,
 'Dracula: Dead and Loving It (1995)': 12,
 'Balto (1995)': 13,
 'Nixon (1995)': 14,
 'Cutthroat Island (1995)': 15,
 'Casino (1995)': 16,
 'Sense and Sensibility (1995)': 17,
 'Four Rooms (1995)': 18,
 'Ace Ventura: When Nature Calls (1995)': 19,
 'Money Train (1995)': 20,
 'Get Shorty (1995)': 21,
 'Copycat (1995)': 22,
 'Assassins (1995)': 23,
 'Powder (1995)': 24,
 'Leaving Las Vegas (1995)': 25,
 'Othello (1995)': 26,
 'Now and Then (1995)': 27,
 'Persuasion (1995)': 28,
 'City of Lost Children, The (CitÃ© des enfants perdus, La) (1995)': 29,
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)': 30,
 'Dangerous Minds (1995)': 31,
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)'

In [24]:
ml.getYears()

defaultdict(int,
            {1: 1995,
             2: 1995,
             3: 1995,
             4: 1995,
             5: 1995,
             6: 1995,
             7: 1995,
             8: 1995,
             9: 1995,
             10: 1995,
             11: 1995,
             12: 1995,
             13: 1995,
             14: 1995,
             15: 1995,
             16: 1995,
             17: 1995,
             18: 1995,
             19: 1995,
             20: 1995,
             21: 1995,
             22: 1995,
             23: 1995,
             24: 1995,
             25: 1995,
             26: 1995,
             27: 1995,
             28: 1995,
             29: 1995,
             30: 1995,
             31: 1995,
             32: 1995,
             34: 1995,
             35: 1995,
             36: 1995,
             37: 1995,
             38: 1995,
             39: 1995,
             40: 1995,
             41: 1995,
             42: 1995,
             43: 1995,
             44: 1995,
   

In [25]:
ml.getGenres()

defaultdict(list,
            {1: [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             2: [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             3: [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             4: [0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             5: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             6: [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             7: [0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             8: [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             9: [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             10: [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             11: [0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             12: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             13: [1, 1, 1, 0, 0, 0, 0, 0, 0

In [31]:
ml.getPopularityRanks()

defaultdict(int,
            {356: 1,
             296: 2,
             318: 3,
             593: 4,
             260: 5,
             480: 6,
             2571: 7,
             1: 8,
             527: 9,
             589: 10,
             1196: 11,
             110: 12,
             1270: 13,
             608: 14,
             2858: 15,
             1198: 16,
             780: 17,
             1210: 18,
             588: 19,
             457: 20,
             590: 21,
             2959: 22,
             47: 23,
             50: 24,
             150: 25,
             364: 26,
             858: 27,
             4993: 28,
             380: 29,
             592: 30,
             32: 31,
             2762: 32,
             2028: 33,
             1580: 34,
             5952: 35,
             377: 36,
             595: 37,
             7153: 38,
             344: 39,
             4306: 40,
             648: 41,
             1265: 42,
             1721: 43,
             1197: 44,
            

In [41]:
# Test User Id
testSubject = 85

userRatings = ml.getUserRatings(testSubject)

In [42]:
userRatings

[(2, 5.0),
 (3, 2.0),
 (5, 3.0),
 (10, 5.0),
 (19, 3.0),
 (21, 4.0),
 (23, 3.0),
 (44, 2.0),
 (58, 1.0),
 (110, 5.0),
 (153, 4.0),
 (158, 1.0),
 (160, 3.0),
 (161, 4.0),
 (165, 4.0),
 (170, 3.0),
 (172, 3.0),
 (173, 4.0),
 (177, 2.0),
 (181, 1.0),
 (185, 3.0),
 (186, 3.0),
 (188, 1.0),
 (196, 3.0),
 (203, 3.0),
 (208, 3.0),
 (216, 3.0),
 (227, 4.0),
 (230, 2.0),
 (231, 4.0),
 (234, 4.0),
 (247, 1.0),
 (253, 3.0),
 (255, 5.0),
 (256, 4.0),
 (261, 1.0),
 (275, 3.0),
 (277, 2.0),
 (282, 1.0),
 (288, 4.0),
 (291, 1.0),
 (292, 4.0),
 (293, 5.0),
 (296, 5.0),
 (315, 4.0),
 (316, 5.0),
 (317, 4.0),
 (318, 5.0),
 (327, 1.0),
 (329, 5.0),
 (333, 4.0),
 (339, 1.0),
 (344, 4.0),
 (349, 5.0),
 (350, 3.0),
 (355, 3.0),
 (356, 4.0),
 (357, 3.0),
 (364, 3.0),
 (366, 1.0),
 (367, 4.0),
 (368, 4.0),
 (370, 2.0),
 (374, 1.0),
 (377, 5.0),
 (380, 5.0),
 (405, 3.0),
 (410, 4.0),
 (415, 3.0),
 (420, 1.0),
 (432, 4.0),
 (434, 3.0),
 (437, 3.0),
 (442, 4.0),
 (454, 3.0),
 (457, 5.0),
 (466, 3.0),
 (471, 3.0)

In [43]:
len(userRatings)

107

In [44]:
from surprise import SVD

In [45]:
def BuildAntiTestSetForUser(testSubject, trainset):    
    fill = trainset.global_mean

    anti_testset = []
    
    u = trainset.to_inner_uid(str(testSubject))
    
    user_items = set([j for (j, _) in trainset.ur[u]])
    anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
                             i in trainset.all_items() if
                             i not in user_items]
    return anti_testset

In [46]:
userRatings = ml.getUserRatings(testSubject)
loved = []
hated = []

for ratings in userRatings:
    if (float(ratings[1]) > 4.0):
        loved.append(ratings)
    if (float(ratings[1]) < 3.0):
        hated.append(ratings)

In [47]:
print("\nUser ", testSubject, " loved these movies:")
for ratings in loved:
    print(ml.getMovieName(ratings[0]))
    
print("\n...and didn't like these movies:")
for ratings in hated:
    print(ml.getMovieName(ratings[0]))


User  85  loved these movies:
Jumanji (1995)
GoldenEye (1995)
Braveheart (1995)
Jerky Boys, The (1995)
LÃ©on: The Professional (a.k.a. The Professional) (LÃ©on) (1994)
Pulp Fiction (1994)
Stargate (1994)
Shawshank Redemption, The (1994)
Star Trek: Generations (1994)
Clear and Present Danger (1994)
Speed (1994)
True Lies (1994)
Fugitive, The (1993)
Jurassic Park (1993)
Terminator 2: Judgment Day (1991)
Mission: Impossible (1996)
Rock, The (1996)

...and didn't like these movies:
Grumpier Old Men (1995)
Mortal Kombat (1995)
Postman, The (Postino, Il) (1994)
Casper (1995)
Lord of Illusions (1995)
Mighty Morphin Power Rangers: The Movie (1995)
Prophecy, The (1995)
Dolores Claiborne (1995)
Heavenly Creatures (1994)
Little Women (1994)
Miracle on 34th Street (1994)
Nell (1994)
Poison Ivy II (1996)
Tank Girl (1995)
While You Were Sleeping (1995)
Wes Craven's New Nightmare (Nightmare on Elm Street Part 7: Freddy's Finale, A) (1994)
Naked Gun 33 1/3: The Final Insult (1994)
Richie Rich (1994)


In [48]:
# For recommendation model
trainSet = data.build_full_trainset()

In [49]:
algo = SVD()
algo.fit(trainSet)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x202759c2640>

In [50]:
testSet = BuildAntiTestSetForUser(testSubject, trainSet)
predictions = algo.test(testSet)

In [51]:
recommendations = []

print ("\nWe recommend:")
for userID, movieID, actualRating, estimatedRating, _ in predictions:
    intMovieID = int(movieID)
    recommendations.append((intMovieID, estimatedRating))

recommendations.sort(key=lambda x: x[1], reverse=True)

for ratings in recommendations[:10]:
    print(ml.getMovieName(ratings[0]))


We recommend:
All the President's Men (1976)
American Beauty (1999)
Usual Suspects, The (1995)
Swingers (1996)
Graduate, The (1967)
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
In the Name of the Father (1993)
Chinatown (1974)
Professional, The (Le professionnel) (1981)
Wallace & Gromit: A Close Shave (1995)
