# Introduction to Collaborative Filtering

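A nested dictionary of movie ratings: each critic's name maps to a dictionary of the movies they rated and the score they gave.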

In [1]:
# Ratings keyed by critic name; each value maps a movie title to that critic's score.
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}

In [2]:
critics['Lisa Rose']['Lady in the Water']

2.5

In [3]:
critics['Toby']

{'Snakes on a Plane': 4.5, 'You, Me and Dupree': 1.0, 'Superman Returns': 4.0}

## Euclidean distance for finding similar users

In [4]:
from math import sqrt
# Euclidean distance between Mick LaSalle and Toby, using only their ratings for
# 'Snakes on a Plane' (4.0 vs 4.5) and 'You, Me and Dupree' (2.0 vs 1.0).
# This is a distance, so a smaller value means more similar ratings.
sqrt(pow(4.0-4.5,2)+pow(2.0-1.0,2))

1.118033988749895

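The result above is the Euclidean distance between the two people's ratings, so smaller values mean more similar tastes. To get a score that is higher for people who are more similar, add 1 to the distance (which also avoids division by zero) and take the reciprocal: $\text{similarity} = \frac{1}{1 + d}$. The score always lies in the interval (0, 1] and equals 1 only when the two people gave identical ratings.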

In [5]:
# Turn the distance from the previous cell into a similarity score: 1 / (1 + distance).
1/(1+sqrt(pow(4.0-4.5,2)+pow(2.0-1.0,2)))

0.4721359549995794

In [6]:
#putting everything in a function
def sim_distance(all_critics,person1,person2):
    """Return a Euclidean-distance-based similarity score for two people.

    Only movies rated by both people are compared. The result is 0 when they
    share no movies; otherwise it is 1 / (1 + d), where d is the Euclidean
    distance between their ratings of the shared movies.
    """
    # Walk person1's movies in order and keep the ones person2 also rated.
    shared = [title for title in all_critics[person1]
              if title in all_critics[person2]]
    if not shared:
        return 0

    first, second = all_critics[person1], all_critics[person2]
    squared_gaps = 0
    for title in shared:
        squared_gaps += pow(first[title] - second[title], 2)

    return 1/(1 + sqrt(squared_gaps))

In [7]:
sim_distance(critics,'Lisa Rose','Gene Seymour')

0.29429805508554946

## Pearson correlation coefficient for finding similar users

In [8]:
def sim_pearson_custom(all_critics,person1,person2):
    """Return the Pearson correlation coefficient between two people's ratings.

    Only movies rated by both people are used.

    Parameters:
        all_critics: dict mapping a person's name to a dict of {movie: rating}.
        person1, person2: keys of ``all_critics`` to compare.

    Returns:
        The correlation coefficient of the shared ratings, or 0 when the two
        people share no movies or when either person's shared ratings have no
        spread (the coefficient is undefined in that case).

    Raises:
        KeyError: if ``person1`` or ``person2`` is not in ``all_critics``.
    """
    # Read from the data that was passed in, not from a notebook-level global.
    ratings1 = all_critics[person1]
    ratings2 = all_critics[person2]
    common_movies = [movie for movie in ratings1 if movie in ratings2]

    number_of_movies = len(common_movies)
    if number_of_movies == 0:
        return 0

    # Sum the ratings of person1 and person2
    sum1 = sum(ratings1[movie] for movie in common_movies)
    sum2 = sum(ratings2[movie] for movie in common_movies)

    # Sum the squares of the ratings of person1 and person2
    sumSq1 = sum(pow(ratings1[movie],2) for movie in common_movies)
    sumSq2 = sum(pow(ratings2[movie],2) for movie in common_movies)

    # Sum of the products of the ratings, used as sum(x*y) - n*mean(x)*mean(y)
    sumPr = sum(ratings1[movie]*ratings2[movie] for movie in common_movies)

    # Each term is n times the variance of that person's shared ratings.
    spread1 = sumSq1 - pow(sum1,2)/number_of_movies
    spread2 = sumSq2 - pow(sum2,2)/number_of_movies

    # Zero spread makes the coefficient undefined; floating-point rounding can
    # also push a zero spread slightly negative, which would make sqrt() raise.
    if spread1 <= 0 or spread2 <= 0:
        return 0

    numerator = sumPr - (sum1*sum2/number_of_movies)
    return numerator/sqrt(spread1*spread2)

In [9]:
#SciPy Pearson
# Import from the public scipy.stats namespace; scipy.stats.stats is a
# deprecated private module path.
from scipy.stats import pearsonr
def sim_pearson(all_critics,person1,person2):
    """Return the Pearson correlation between two people's ratings via SciPy.

    Only movies rated by both people are used.

    Parameters:
        all_critics: dict mapping a person's name to a dict of {movie: rating}.
        person1, person2: keys of ``all_critics`` to compare.

    Returns:
        The correlation coefficient as a float, or 0 when it is undefined:
        fewer than two shared movies, or one person's shared ratings are all
        the same value.

    Raises:
        KeyError: if ``person1`` or ``person2`` is not in ``all_critics``.
    """
    # Read from the data that was passed in, not from a notebook-level global.
    ratings1 = all_critics[person1]
    ratings2 = all_critics[person2]
    common_movies = [movie for movie in ratings1 if movie in ratings2]

    # pearsonr needs at least two paired observations.
    if len(common_movies) < 2:
        return 0

    movies_person1 = [ratings1[movie] for movie in common_movies]
    movies_person2 = [ratings2[movie] for movie in common_movies]

    # Constant input has no spread, so the coefficient is undefined; return 0
    # up front instead of letting pearsonr warn and hand back NaN.
    if len(set(movies_person1)) == 1 or len(set(movies_person2)) == 1:
        return 0

    coefficient = float(pearsonr(movies_person1,movies_person2)[0])

    # NaN is the only value not equal to itself; never let it reach sorting
    # or weighted sums in callers.
    if coefficient != coefficient:
        return 0
    return coefficient

In [10]:
sim_pearson_custom(critics,'Mick LaSalle','Toby')

0.9244734516419049

In [11]:
sim_pearson(critics,'Mick LaSalle','Toby')

0.924473451641905

In [12]:
sim_pearson(critics,'Lisa Rose','Jack Matthews')

0.747017880833996

## Ranking the critics

In [13]:
def topMatches(all_critics, person, number_of_matches=3, similarity=sim_pearson):
    """Return the people most similar to `person`, best match first.

    Every other key of `all_critics` is scored with
    `similarity(all_critics, person, other)`. The result is a list of
    (score, name) tuples in descending order, cut to at most
    `number_of_matches` entries.
    """
    scored = []
    for candidate in all_critics:
        if candidate != person:
            scored.append((similarity(all_critics, person, candidate), candidate))

    # Ascending sort, then flip so the highest score comes first.
    ranked = sorted(scored)[::-1]
    return ranked[:number_of_matches]

In [14]:
topMatches(critics,'Toby',number_of_matches=5)

[(0.9912407071619304, 'Lisa Rose'),
 (0.924473451641905, 'Mick LaSalle'),
 (0.8934051474415642, 'Claudia Puig'),
 (0.6628489803598702, 'Jack Matthews'),
 (0.3812464258315117, 'Gene Seymour')]

## Recommending movies

In [40]:
def getRecommendations(all_critics,person,similarity = sim_pearson):
    """Recommend movies that `person` has not rated yet, best first.

    Each candidate movie is scored with a weighted average of the ratings the
    other critics gave it, where each critic's weight is their similarity to
    `person`. Critics whose similarity is not strictly positive are ignored.

    Parameters:
        all_critics: dict mapping a person's name to a dict of {movie: rating}.
        person: key of ``all_critics`` to produce recommendations for.
        similarity: callable ``(all_critics, other, person) -> number`` used
            to weight each critic.

    Returns:
        A list of (predicted_rating, movie) tuples sorted from highest to
        lowest predicted rating.

    Raises:
        KeyError: if ``person`` is not in ``all_critics``.
    """
    own_ratings = all_critics[person]
    weighted_rating_totals = {}
    similarity_totals = {}

    for other in all_critics:
        if other == person:
            continue
        similarity_index = similarity(all_critics,other, person)

        # Written as `not (x > 0)` rather than `x <= 0` so that a NaN
        # similarity is skipped too instead of contaminating the sums.
        if not (similarity_index > 0):
            continue

        for movie, rating in all_critics[other].items():
            # Only score movies the user hasn't seen yet (missing or rated 0).
            if movie not in own_ratings or own_ratings[movie] == 0:
                weighted_rating_totals[movie] = weighted_rating_totals.get(movie, 0) + rating*similarity_index
                similarity_totals[movie] = similarity_totals.get(movie, 0) + similarity_index

    # Normalise by the total weight so movies rated by more critics are not
    # favoured merely for having more ratings.
    recommendations = [
        (weighted_total/similarity_totals[movie], movie)
        for movie, weighted_total in weighted_rating_totals.items()
    ]
    recommendations.sort(reverse=True)

    return recommendations
                
            

In [42]:
getRecommendations(critics,'Toby')

[(3.3477895267131017, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.5309807037655645, 'Just My Luck')]

In [43]:
getRecommendations(critics,'Toby', similarity=sim_distance)

[(3.457128694491423, 'The Night Listener'),
 (2.778584003814924, 'Lady in the Water'),
 (2.422482042361917, 'Just My Luck')]