In [1]:
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
 'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 
 'The Night Listener': 3.0},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 
 'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0, 
 'You, Me and Dupree': 3.5}, 
'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
 'Superman Returns': 3.5, 'The Night Listener': 4.0},
'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
 'The Night Listener': 4.5, 'Superman Returns': 4.0, 
 'You, Me and Dupree': 2.5},
'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0, 
 'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
 'You, Me and Dupree': 2.0}, 
'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
 'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}

In [2]:
# Eucilidean Distance Score

from math import sqrt

def sim_distance(prefs, person1, person2):
    si = {}
    for item in prefs[person1]:
        if item in prefs[person2]:
            si[item] = 1
            
    if len(si)==0: return 0
    
    sum_of_squares = sum(
        [pow(prefs[person1][item]-prefs[person2][item], 2)
         for item in si]
    )
    
    return 1/(1+sqrt(sum_of_squares))


In [3]:
sim_distance(critics, 'Lisa Rose', 'Gene Seymour')

0.29429805508554946

In [4]:
# Pearson Correlation Score
# https://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient

def sim_pearson(prefs, p1, p2):
    si = {}
    for item in prefs[p1]:
        if item in prefs[p2]:
            si[item] = 1
    
    n = len(si)
    
    if n==0: return 1
    
    sum1 = sum([prefs[p1][it] for it in si])
    sum2 = sum([prefs[p2][it] for it in si])
    
    sum1Sq = sum([pow(prefs[p1][it],2) for it in si])
    sum2Sq = sum([pow(prefs[p2][it],2) for it in si])
    
    pSum = sum([prefs[p1][it]*prefs[p2][it] for it in si])
    
    num = pSum - (sum1 * sum2 / n)
    den = sqrt((sum1Sq - pow(sum1,2)/n) *\
               (sum2Sq - pow(sum2,2)/n))
    if den==0: return 0
    
    return num/den

In [5]:
sim_pearson(critics, 'Lisa Rose', 'Gene Seymour')

0.39605901719066977

In [6]:
def topMatches(prefs, person, n=5, similarity=sim_pearson):
    scores = [(similarity(prefs, person, other), other)
                 for other in prefs if other!=person]
    scores.sort(key=lambda i:-i[0])
    
    return scores[0:n]

In [7]:
topMatches(critics, 'Toby', n=5)

[(0.9912407071619299, 'Lisa Rose'),
 (0.9244734516419049, 'Mick LaSalle'),
 (0.8934051474415647, 'Claudia Puig'),
 (0.66284898035987, 'Jack Matthews'),
 (0.38124642583151164, 'Gene Seymour')]

In [8]:
def getRecommendations(prefs, person, similarity=sim_pearson):
    totals = {}
    simSums = {}
    for other in prefs:
        if other == person: continue
        sim = similarity(prefs, person, other)
        
        if sim <= 0: continue
        
        for item in prefs[other]:
            if prefs[person].get(item, 0) > 0:
                continue
            totals.setdefault(item, 0)
            totals[item] += prefs[other][item] * sim
            simSums.setdefault(item, 0)
            simSums[item] += sim
            
    rankings = [(total/simSums[item], item) for item, total in totals.items()]

    rankings.sort()
    rankings.reverse()
    return rankings

In [9]:
getRecommendations(critics, 'Toby')

[(3.3477895267131013, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.530980703765565, 'Just My Luck')]

In [10]:
def transformPref(prefs):
    result = {}
    for person in prefs:
        for item in prefs[person]:
            result.setdefault(item, {})
            
            result[item][person] = prefs[person][item]
    return result

In [11]:
movies = transformPref(critics)
topMatches(movies, 'Lady in the Water')

[(0.7637626158259785, 'Snakes on a Plane'),
 (0.4879500364742689, 'Superman Returns'),
 (0.3333333333333333, 'You, Me and Dupree'),
 (-0.6123724356957927, 'The Night Listener'),
 (-0.9449111825230676, 'Just My Luck')]

In [12]:
# Item-Based Filtering

# Building the Item Comparision Dataset

def calculateSimilarItems(prefs, n=10):
    result = {}
    
    itemPrefs = transformPref(prefs)
    c = 0
    
    for item in itemPrefs:
        c += 1
        if c%100 == 0: print("{0} / {1}".format(c, len(itemPrefs)))
            
        scores = topMatches(itemPrefs, item, n=n, similarity=sim_pearson)
        result[item] = scores
    return result

In [13]:
itemsim = calculateSimilarItems(critics)
itemsim

{'Just My Luck': [(0.5555555555555556, 'The Night Listener'),
  (-0.3333333333333333, 'Snakes on a Plane'),
  (-0.42289003161103106, 'Superman Returns'),
  (-0.4856618642571827, 'You, Me and Dupree'),
  (-0.9449111825230676, 'Lady in the Water')],
 'Lady in the Water': [(0.7637626158259785, 'Snakes on a Plane'),
  (0.4879500364742689, 'Superman Returns'),
  (0.3333333333333333, 'You, Me and Dupree'),
  (-0.6123724356957927, 'The Night Listener'),
  (-0.9449111825230676, 'Just My Luck')],
 'Snakes on a Plane': [(0.7637626158259785, 'Lady in the Water'),
  (0.11180339887498941, 'Superman Returns'),
  (-0.3333333333333333, 'Just My Luck'),
  (-0.5663521139548527, 'The Night Listener'),
  (-0.6454972243679047, 'You, Me and Dupree')],
 'Superman Returns': [(0.6579516949597695, 'You, Me and Dupree'),
  (0.4879500364742689, 'Lady in the Water'),
  (0.11180339887498941, 'Snakes on a Plane'),
  (-0.1798471947990544, 'The Night Listener'),
  (-0.42289003161103106, 'Just My Luck')],
 'The Night L

In [17]:
def getRecommendedItems(prefs, itemMatch, user):
    userRatings = prefs[user]
    scores = {}
    totalSim = {}
    
    for (item, rating) in userRatings.items():
        for (similarity, item2) in itemMatch[item]:
            if item2 in userRatings: continue
            
            scores.setdefault(item2, 0)
            scores[item2] += similarity * rating
            
            totalSim.setdefault(item2, 0)
            totalSim[item2] += similarity
            
    rankings = [(score/totalSim[item], item)
                for item, score in scores.items()]
    
    rankings.sort()
    rankings.reverse()
    return rankings

In [18]:
getRecommendedItems(critics, itemsim, 'Toby')

[(3.610031066802182, 'Lady in the Water'),
 (3.531395034185976, 'The Night Listener'),
 (2.9609998607242685, 'Just My Luck')]

In [19]:
# Using the MovieLens Dataset

In [28]:
def loadMovieLens(path='ml-100k'):
    
    movies = {}
    for line in open(path+'/u.item', encoding='ISO-8859-1'):
        (id, title) = line.split('|')[0:2]
        movies[id] = title
        
    prefs = {}
    for line in open(path+'/u.data', encoding='ISO-8859-1'):
        (user, movieid, rating, ts) = line.split('\t')
        prefs.setdefault(user, {})
        prefs[user][movies[movieid]] = float(rating)
    return prefs

In [29]:
mprefs = loadMovieLens()
mprefs['87']

{'2001: A Space Odyssey (1968)': 5.0,
 'Ace Ventura: Pet Detective (1994)': 4.0,
 'Addams Family Values (1993)': 2.0,
 'Addicted to Love (1997)': 4.0,
 'Adventures of Priscilla, Queen of the Desert, The (1994)': 3.0,
 'Adventures of Robin Hood, The (1938)': 5.0,
 'Air Force One (1997)': 3.0,
 'Air Up There, The (1994)': 3.0,
 'Alien (1979)': 4.0,
 'American President, The (1995)': 5.0,
 'Annie Hall (1977)': 4.0,
 'Apocalypse Now (1979)': 4.0,
 'Babe (1995)': 5.0,
 'Baby-Sitters Club, The (1995)': 2.0,
 'Back to the Future (1985)': 5.0,
 'Bad Boys (1995)': 4.0,
 'Bananas (1971)': 5.0,
 'Barcelona (1994)': 3.0,
 'Batman & Robin (1997)': 4.0,
 'Batman (1989)': 3.0,
 'Batman Returns (1992)': 3.0,
 'Big Green, The (1995)': 3.0,
 'Big Squeeze, The (1996)': 2.0,
 'Birdcage, The (1996)': 4.0,
 'Blade Runner (1982)': 4.0,
 'Blues Brothers, The (1980)': 5.0,
 'Boomerang (1992)': 3.0,
 'Boot, Das (1981)': 4.0,
 'Brady Bunch Movie, The (1995)': 2.0,
 'Braveheart (1995)': 4.0,
 'Bridge on the River

In [30]:
getRecommendations(mprefs, '87')[0:30]

[(5.0, 'They Made Me a Criminal (1939)'),
 (5.0, 'Star Kid (1997)'),
 (5.0, 'Santa with Muscles (1996)'),
 (5.0, 'Saint of Fort Washington, The (1993)'),
 (5.0, 'Marlene Dietrich: Shadow and Light (1996) '),
 (5.0, 'Great Day in Harlem, A (1994)'),
 (5.0, 'Entertaining Angels: The Dorothy Day Story (1996)'),
 (5.0, 'Boys, Les (1997)'),
 (4.89884443128923, 'Legal Deceit (1997)'),
 (4.815019082242709, 'Letter From Death Row, A (1998)'),
 (4.800260666069043, 'Mrs. Dalloway (1997)'),
 (4.771240079753505, 'Leading Man, The (1996)'),
 (4.7321082983941425, 'Hearts and Minds (1996)'),
 (4.707354190896574, 'Dangerous Beauty (1998)'),
 (4.696244466490867, 'Pather Panchali (1955)'),
 (4.652397061026758, 'Lamerica (1994)'),
 (4.532337612572981, 'Innocents, The (1961)'),
 (4.527998574747077, 'Casablanca (1942)'),
 (4.512903125553783, 'Four Days in September (1997)'),
 (4.510270149719864, 'Everest (1998)'),
 (4.485151301801343, 'Wallace & Gromit: The Best of Aardman Animation (1996)'),
 (4.463287461

In [35]:
mItemsim = calculateSimilarItems(mprefs, n=50)

100 / 1664
200 / 1664
300 / 1664
400 / 1664
500 / 1664
600 / 1664
700 / 1664
800 / 1664
900 / 1664
1000 / 1664
1100 / 1664
1200 / 1664
1300 / 1664
1400 / 1664
1500 / 1664
1600 / 1664


In [37]:
getRecommendedItems(mprefs, mItemsim, '87')[0:30]

[(5.0, 'When the Cats Away (Chacun cherche son chat) (1996)'),
 (5.0, 'Wedding Gift, The (1994)'),
 (5.0, 'War Room, The (1993)'),
 (5.0, 'Walking and Talking (1996)'),
 (5.0, 'Virtuosity (1995)'),
 (5.0, 'Very Natural Thing, A (1974)'),
 (5.0, 'Vermin (1998)'),
 (5.0, 'Venice/Venice (1992)'),
 (5.0, 'Van, The (1996)'),
 (5.0, 'Unhook the Stars (1996)'),
 (5.0, 'Twisted (1996)'),
 (5.0, 'Turbulence (1997)'),
 (5.0, 'Traveller (1997)'),
 (5.0, 'Telling Lies in America (1997)'),
 (5.0, 'SubUrbia (1997)'),
 (5.0, 'Stefano Quantestorie (1993)'),
 (5.0, 'Shiloh (1997)'),
 (5.0, 'Shadow of Angels (Schatten der Engel) (1976)'),
 (5.0, 'Second Jungle Book: Mowgli & Baloo, The (1997)'),
 (5.0, 'Scarlet Letter, The (1995)'),
 (5.0, 'Safe Passage (1994)'),
 (5.0, 'Robocop 3 (1993)'),
 (5.0, 'Rhyme & Reason (1997)'),
 (5.0, 'Pink Floyd - The Wall (1982)'),
 (5.0, 'Penny Serenade (1941)'),
 (5.0, 'Office Killer (1997)'),
 (5.0, 'Nénette et Boni (1996)'),
 (5.0, 'Nowhere (1997)'),
 (5.0, 'Newton Boy