In [24]:
import numpy as np
import pandas as pd
from math import sqrt

### Dataset  - User and Movie Metadata

In [73]:


data = pd.read_csv("http://files.grouplens.org/datasets/movielens/ml-100k/u.data", delimiter="\t", header = None)
data.columns = ['user_id','item_id','rating','timestamp']
del data['timestamp']  #delete the column timestamp as it is not needed
data.head(5)


Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [74]:
movie_raw= pd.read_csv("http://files.grouplens.org/datasets/movielens/ml-100k/u.item",
                       delimiter="|",header = None, encoding='latin-1')
movie_raw.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [75]:


movie=movie_raw.iloc[:,[0,1,2,4]]
movie.columns=["Movie_id","Movie_title","Video_Release_date","IMDb URL"]
movie.head(5)

Unnamed: 0,Movie_id,Movie_title,Video_Release_date,IMDb URL
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995)


In [76]:
critics = {}
for index,row in data.iterrows():
    try:
        critics[row['user_id']][movie['Movie_title'][row['item_id']-1]] = row['rating']   
    except Exception:
        critics[row['user_id']] = {movie['Movie_title'][row['item_id']-1]:row['rating']}

print(critics)


{196: {'Kolya (1996)': 3, 'Mrs. Doubtfire (1993)': 4, "Muriel's Wedding (1994)": 4, 'Shall We Dance? (1996)': 3, 'Stand by Me (1986)': 5, 'Ace Ventura: Pet Detective (1994)': 5, 'Mrs. Brown (Her Majesty, Mrs. Brown) (1997)': 4, 'Raising Arizona (1987)': 4, 'Being There (1979)': 5, 'Truth About Cats & Dogs, The (1996)': 4, 'Englishman Who Went Up a Hill, But Came Down a Mountain, The (1995)': 2, 'Birdcage, The (1996)': 4, 'English Patient, The (1996)': 5, 'Home Alone (1990)': 3, 'American President, The (1995)': 5, 'Babe (1995)': 5, 'Harold and Maude (1971)': 4, 'Up in Smoke (1978)': 4, 'Four Weddings and a Funeral (1994)': 3, 'While You Were Sleeping (1995)': 3, 'Men in Black (1997)': 2, 'Kids in the Hall: Brain Candy (1996)': 4, 'Groundhog Day (1993)': 3, 'Boogie Nights (1997)': 3, "Marvin's Room (1996)": 3, 'Cold Comfort Farm (1995)': 3, 'Adventures of Priscilla, Queen of the Desert, The (1994)': 4, 'Secrets & Lies (1996)': 5, 'Van, The (1996)': 3, 'Waiting for Guffman (1996)': 4, 'N

 #### Finding Similarity using Euclidean distance

The basis of many measures of similarity and dissimilarity is euclidean distance.Euclidean distance is the square root of the sum of squared differences between corresponding elements of the two vectors.Here two vectors are two persons and elements of the vectors are the ratings given by them on the common movies.

In [77]:

def sim_distance(prefs,user_1,user_2):  # Get the list of shared_items  
    si={}       #stores 1 if the movie is rated by both the users, else stores 0
    for item in prefs[user_1]:    
        if item in prefs[user_2]:       
            si[item]=1
    # if they have no ratings in common, return 0  
    if len(si)==0: 
        return 0
    # Add up the squares of all the differences  
    for item in prefs[user_1]:
        if item in prefs[user_2]:
            sum_of_squares=sum([pow(prefs[user_1][item]-prefs[user_2][item],2)])                      
    return(1/(1+sum_of_squares)) 




#### Finding Similarity using Jaccard's Distance


The Jaccard index, also known as Intersection over Union and the Jaccard similarity coefficient (originally coined coefficient de communauté by Paul Jaccard), is a statistic used for comparing the similarity and diversity of sample sets. The Jaccard coefficient measures similarity between finite sample sets, and is defined as the size of the intersection divided by the size of the union of the sample sets.


In [78]:
def sim_jaccard(prefs, p1, p2):
    numerator = 0
    difference = 0
    for movie in prefs[p1]:
        if movie in prefs[p2]:
            numerator = numerator + 1
        else:
            difference = difference + 1
    denominator = len(prefs[p1].keys()) + difference        
    return numerator/denominator 




#### Finding Similarity using Pearson's correlation coefficient


A slightly more sophisticated way to determine the similarity between people’s inter- ests is to use a Pearson correlation coefficient. The correlation coefficient is a mea- sure of how well two sets of data fit on a straight line. The formula for this is more complicated than the Euclidean distance score, but it tends to give better results in situations where the data isn’t well normalized—for example, if critics’ movie rank- ings are routinely more harsh than average.


In [79]:

def sim_pearson(prefs,p1,p2):  # Get the list of mutually rated items  
    si={}      #stores 1 if the movie is rated by both the users, else stores 0
    for item in prefs[p1]:    
        if item in prefs[p2]: 
            si[item]=1
    # Find the number of elements  
    n=len(si)
    # if they are no ratings in common, return 0  
    if n==0: 
        return 0
    # Add up all the preferences  
    sum1=sum([prefs[p1][it] for it in si])  
    sum2=sum([prefs[p2][it] for it in si])
    # Sum up the squares  
    sum1Sq=sum([pow(prefs[p1][it],2) for it in si])  
    sum2Sq=sum([pow(prefs[p2][it],2) for it in si])
    # Sum up the products  
    pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])
    # Calculate Pearson score  
    num=pSum-(sum1*sum2/n)  
    den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))  
    if den==0: 
        return 0
    r=num/den
    return r


In [80]:
sim_distance(critics,451,266) #Similarity score(Euclidean) of users having userid 451 and userid 266

0.5

In [81]:
sim_jaccard(critics,451,266)  #Similarity score(Jacard) of users having userid 451 and userid 266

0.043478260869565216

In [82]:
sim_pearson(critics,451,266)  #Similarity score(Pearson) of users having userid 451 and userid 266

-0.46188021535170054


#### Function to find the top matching users for a particular user


In [83]:


# Number of results and similarity function are optional params. 
def topMatches(prefs,person,n,similarity):  # n = No. of outputs needed
    scores=[(similarity(prefs,person,other),other) for other in prefs if other!=person]
    scores.sort()  
    scores.reverse()  
    return scores[0:n]



In [84]:
topMatches(critics,2,5,sim_pearson) # Top 5 matches for user with userid 2

[(1.0, 914), (1.0, 607), (1.0, 426), (1.0, 187), (1.0, 167)]


#### Function to get recommendations of top 10 unwatched movies


In [85]:
def getRecommendations(prefs,person,similarity):
    totals={}
    simSums={}
    for other in prefs:
        # don't compare me to myself
        if other==person: continue
        sim=similarity(prefs,person,other)
        # ignore scores of zero or lower
        if sim<=0: continue
        for item in prefs[other]:
            # only score movies I haven't seen yet
            if item not in prefs[person] or prefs[person][item]==0:
                # Similarity * Score
                totals.setdefault(item,0)
                totals[item]+=prefs[other][item]*sim
                # Sum of similarities
                simSums.setdefault(item,0)
                simSums[item]+=sim
    # Create the normalized list
    rankings=[(total/simSums[item],item) for item,total in totals.items( )]
    # Return the sorted list
    rankings.sort( )
    rankings.reverse( )
    return rankings[:10]

In [86]:
critics[196].keys()  #Movie names of watched(rated) movies for user with userid 196

dict_keys(['Kolya (1996)', 'Mrs. Doubtfire (1993)', "Muriel's Wedding (1994)", 'Shall We Dance? (1996)', 'Stand by Me (1986)', 'Ace Ventura: Pet Detective (1994)', 'Mrs. Brown (Her Majesty, Mrs. Brown) (1997)', 'Raising Arizona (1987)', 'Being There (1979)', 'Truth About Cats & Dogs, The (1996)', 'Englishman Who Went Up a Hill, But Came Down a Mountain, The (1995)', 'Birdcage, The (1996)', 'English Patient, The (1996)', 'Home Alone (1990)', 'American President, The (1995)', 'Babe (1995)', 'Harold and Maude (1971)', 'Up in Smoke (1978)', 'Four Weddings and a Funeral (1994)', 'While You Were Sleeping (1995)', 'Men in Black (1997)', 'Kids in the Hall: Brain Candy (1996)', 'Groundhog Day (1993)', 'Boogie Nights (1997)', "Marvin's Room (1996)", 'Cold Comfort Farm (1995)', 'Adventures of Priscilla, Queen of the Desert, The (1994)', 'Secrets & Lies (1996)', 'Van, The (1996)', 'Waiting for Guffman (1996)', 'Nutty Professor, The (1996)', 'Fish Called Wanda, A (1988)', 'Mighty Aphrodite (1995)',

In [87]:
getRecommendations(critics,196,sim_pearson) 

[(5.0, "Someone Else's America (1995)"),
 (5.0, 'Saint of Fort Washington, The (1993)'),
 (5.0, 'Prefontaine (1997)'),
 (5.0, 'Marlene Dietrich: Shadow and Light (1996) '),
 (5.0, 'Great Day in Harlem, A (1994)'),
 (5.0, 'Angel Baby (1995)'),
 (5.0, 'Aiqing wansui (1994)'),
 (4.999999999999999, 'Star Kid (1997)'),
 (4.679375562460004, 'Ayn Rand: A Sense of Life (1997)'),
 (4.657515222127359, 'Guantanamera (1994)')]