In [29]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from time import time

In [19]:
# Load the two raw input files.
# Column names are supplied explicitly via `names`.
ratings = pd.read_table(
    'data/ratings',
    names=["user", "movie", "rating", "timestamp"],
)
# The movie file is pipe-delimited; only its first two columns (id, title) are kept.
movie_info = pd.read_table(
    'data/movie_info',
    sep='|',
    usecols=[0, 1],
    names=['movie', "movie_title"],
)

In [23]:
# Check that the column names passed to read_table were applied to the ratings frame.
ratings.columns

Index([u'user', u'movie', u'rating', u'timestamp'], dtype='object')

In [24]:
# Check that the column names passed to read_table were applied to the movie lookup frame.
movie_info.columns

Index([u'movie', u'movie_title'], dtype='object')

In [25]:
# Attach the movie title to every rating.
# A left join keeps every row of `ratings`, even if its movie id has no entry in `movie_info`.
df = pd.merge(ratings, movie_info, how='left', on='movie')

In [26]:
# Preview the first rows of the merged ratings + title frame.
df.head()

Unnamed: 0,user,movie,rating,timestamp,movie_title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [31]:
# Convert the ratings DataFrame into a sparse (users x movies) matrix.
# Ids in the data start at 1, so user u / movie m is stored at [u - 1, m - 1].
max_user_id = df.user.max()
max_movie_id = df.movie.max()

# Keep a single rating per (user, movie) pair. The COO constructor sums
# duplicate coordinates, so duplicates are dropped first, keeping the last
# occurrence -- the same value a row-by-row assignment would end up with.
unique_ratings = df.drop_duplicates(subset=['user', 'movie'], keep='last')

# Build the matrix in one vectorised step instead of one Python-level sparse
# assignment per rating, then convert to LIL so the result has the same format
# (and float dtype) as a matrix filled element by element.
ratings_sparse_mat = sparse.coo_matrix(
    (
        unique_ratings.rating.values.astype(float),
        (unique_ratings.user.values - 1, unique_ratings.movie.values - 1),
    ),
    shape=(max_user_id, max_movie_id),
).tolil()

In [32]:
# Build the movie-to-movie similarity matrix.
# Transposing puts one movie per row, so cosine_similarity compares movies
# (by their rating vectors across users) rather than users.
# Element [i, j] is the cosine similarity between movie i and movie j.
movies_by_users = ratings_sparse_mat.transpose()
item_sim_mat = cosine_similarity(movies_by_users)

In [65]:
# Sanity check: the similarity matrix should have one row and one column per movie.
# We have 1682 total movies so it seems we have what we want
item_sim_mat.shape

(1682, 1682)

In [34]:
# Collect the movies rated by one user.
# NOTE: `user_id` is used directly as a row index of ratings_sparse_mat, and
# rows are 0-based (user u is stored at row u - 1), so user_id = 1 selects
# the ratings of the user whose id in `df` is 2.
user_id = 1
user_row = ratings_sparse_mat[user_id]
# nonzero() returns (row indices, column indices); the column indices are the
# 0-based movie positions that hold a rating.
items_rated = user_row.nonzero()[1]


In [40]:
# 0-based column indices (movie id - 1) that hold a rating in row `user_id`.
items_rated

array([  0,   9,  12,  13,  18,  24,  49,  99, 110, 126, 236, 241, 250,
       254, 256, 257, 268, 271, 272, 273, 274, 275, 276, 277, 278, 279,
       280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292,
       293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305,
       306, 307, 308, 309, 310, 311, 312, 313, 314, 315], dtype=int32)

In [51]:
# Show the ratings and titles for user 2.
# User 2 is the user stored at row 1 of ratings_sparse_mat, because ids are
# shifted down by one when the matrix is filled.
df.loc[df['user'] == 2, ['rating', 'movie_title']]

Unnamed: 0,rating,movie_title
700,4,Rosewood (1997)
924,5,Shall We Dance? (1996)
1052,5,Star Wars (1977)
3425,1,3 Ninjas: High Noon At Mega Mountain (1998)
5063,4,Ulee's Gold (1997)
5324,3,Fierce Creatures (1997)
6310,3,Midnight in the Garden of Good and Evil (1997)
7519,3,"River Wild, The (1994)"
7973,4,Mighty Aphrodite (1995)
8253,3,Up Close and Personal (1996)


In [56]:
# The matrix is (users x movies); the second dimension is the number of movies.
n_user_rows, num_items = ratings_sparse_mat.shape
# One predicted rating per movie, initialised to 0.
preds = np.zeros(num_items, dtype=np.float64)


In [59]:
# Choose a neighborhood size and, for every movie, rank all movies by similarity.
# argsort along axis 1 orders each row from least to greatest similarity, so
# the last `neighborhood_size` entries of a row are that movie's most similar
# movies (as 0-based indices).
neighborhood_size = 75
least_to_most_sim_indexes = item_sim_mat.argsort(axis=1)
neighborhood = least_to_most_sim_indexes[:, -neighborhood_size:]

In [91]:
# Top 10 most similar movies for movie 1 (row 0 of the ranking).
# Adding 1 converts the 0-based indices back to movie ids; the result is
# ordered from least to most similar.
movie1_ranking = least_to_most_sim_indexes[0]
movie1_top10_sim = 1 + movie1_ranking[-10:]
movie1_top10_sim

array([237, 100, 222, 151, 405, 117, 121, 181,  50,   1])

In [120]:
# Translate the top-10 movie ids into titles, most similar first.
#
# `item in df['movie']` would test the Series *index labels*, not the movie
# ids stored in the column, so membership is checked against an explicit
# id -> title lookup instead. Building the lookup once also avoids scanning
# the whole ratings frame for every id.
title_by_movie = df.drop_duplicates(subset='movie').set_index('movie')['movie_title']

movie1_similarities = [
    title_by_movie[item]
    for item in movie1_top10_sim[::-1]  # reversed: most similar first
    if item in title_by_movie.index
]

In [121]:
# Titles are listed from most to least similar (the id list was reversed when building it).
# Seems like all the movies are relevant to one another.
# In fact, I've watched all the movies in the top 8
movie1_similarities

['Toy Story (1995)',
 'Star Wars (1977)',
 'Return of the Jedi (1983)',
 'Independence Day (ID4) (1996)',
 'Rock, The (1996)',
 'Mission: Impossible (1996)',
 'Willy Wonka and the Chocolate Factory (1971)',
 'Star Trek: First Contact (1996)',
 'Fargo (1996)',
 'Jerry Maguire (1996)']

In [97]:
#df.loc[df['movie'].isin(movie1_top10_sim)]

Unnamed: 0,user,movie,rating,timestamp,movie_title
12,200,222,5,876042340,Star Trek: First Contact (1996)
24,308,1,4,887736532,Toy Story (1995)
50,251,100,4,886271884,Fargo (1996)
53,25,181,5,885853415,Return of the Jedi (1983)
80,225,237,5,879539643,Jerry Maguire (1996)
112,168,151,5,884288058,Willy Wonka and the Chocolate Factory (1971)
152,84,405,3,883452363,Mission: Impossible (1996)
159,259,117,4,874724988,"Rock, The (1996)"
168,68,117,4,876973939,"Rock, The (1996)"
173,305,117,2,886324028,"Rock, The (1996)"


In [None]:
# Predict a rating for every movie for the user stored at row `user_id`.
#
# `neighborhood[item]` holds the `neighborhood_size` movies most similar to
# `item` (75, as set earlier). Only neighbors the user has already rated can
# contribute; these are collected in `relevant_items`.
#
# The prediction is the similarity-weighted average of the user's ratings:
#
#     pred(item) = sum_j sim(item, j) * rating(j) / sum_j sim(item, j)
#
# for j in relevant_items. Dividing by the summed similarities keeps the
# prediction on the same scale as the ratings.
#
# user_ratings[relevant_items]               -> the user's ratings of the relevant items
# item_sim_mat[item_to_rate, relevant_items] -> similarities between the target item
#                                               and those rated items

# Dense copy of the user's row, so ratings can be looked up by plain indexing.
user_ratings = ratings_sparse_mat[user_id].toarray().ravel()

for item_to_rate in range(num_items):
    relevant_items = np.intersect1d(neighborhood[item_to_rate],
                                    items_rated,
                                    assume_unique=True)
    similarities = item_sim_mat[item_to_rate, relevant_items]
    similarity_total = similarities.sum()
    if similarity_total > 0:
        preds[item_to_rate] = \
            np.dot(similarities, user_ratings[relevant_items]) / similarity_total
    else:
        # No rated neighbor (or only zero similarities): there is nothing to
        # average, so leave the prediction at 0 rather than dividing by zero.
        preds[item_to_rate] = 0.0

In [124]:
# Worked example for movie 1 (row 0): which of its nearest neighbors has the
# selected user already rated?
movie1_neighbors = neighborhood[0]
relevant_items = np.intersect1d(movie1_neighbors, items_rated, assume_unique=True)

In [130]:
# Ratings the selected user gave to the relevant items.
# The row is addressed through `user_id` (rather than a hard-coded 1) so this
# check always looks at the same user as `items_rated`.
ratings_sparse_mat[user_id, relevant_items].toarray()

array([[ 4.,  4.,  5.,  5.,  4.,  5.,  4.,  4.,  3.,  3.,  4.,  4.,  4.,
         1.]])

Unnamed: 0,user,movie,rating,timestamp,movie_title
202,1,61,4,878542420,Three Colors: White (1994)
305,1,189,3,888732928,"Grand Day Out, A (1992)"
333,1,33,4,878542699,Desperado (1995)
334,1,160,4,875072547,Glengarry Glen Ross (1992)
478,1,20,4,887431883,Angels and Insects (1995)
639,1,202,5,875072442,Groundhog Day (1993)
687,1,171,5,889751711,Delicatessen (1991)
820,1,265,4,878542441,"Hunt for Red October, The (1990)"
933,1,155,2,878542201,Dirty Dancing (1987)
972,1,117,3,874965739,"Rock, The (1996)"


In [123]:
# Neighborhood of movie 1 (row 0): the 0-based indices of its 75 most similar movies,
# ordered from least to most similar. Add 1 to an index to get the movie id.
neighborhood[0]

array([844, 273,  87,   7, 185,  69, 190, 283, 596, 160, 495, 257,   8,
       392, 142, 233, 275, 131, 567, 152, 264,  88,  96, 281, 215, 195,
       317, 175, 545, 143, 182,  21, 470, 293, 234,  63, 201,  55, 227,
       126,  70,  95,  81, 587, 110, 422, 194, 167, 124, 172,  24, 209,
        97,  14, 741,  78,  94,  68, 203, 256,  27, 171, 117,   6, 173,
       236,  99, 221, 150, 404, 116, 120, 180,  49,   0])