# Movie Recommender 1.0

###### April, 2020

In [1]:
"""
Context
In this lab, you will be implementing a simple movie recommender system.
Dataset details
You will be using the ml-m1 dataset from the MovieLense website.
You will be using movies.dat and rating.dat for building your recommender.
"""
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
""" Step 0. load and preprocess data """

# use info. from README.txt to name columns
user_col_names = 'UserID::Gender::Age::Occupation::Zip-code'.split('::')
# load data using '::' as separator
users = pd.read_table(r'ml-1m\users.dat', header = None, sep = '::', names = user_col_names)
users.head()
users.shape
users.info()
users.describe()


# load and preprocess movies.dat
movie_col_names = 'MovieID::Title::Genres'.split('::')
movies = pd.read_table(r'ml-1m\movies.dat', header = None, sep ='::', names = movie_col_names, encoding = 'ISO-8859-1')
movies.head()
movies.shape
movies.info()
movies.describe()


# load and preprocess ratings.dat
rating_col_names = 'UserID::MovieID::Rating::Timestamp'.split('::')
ratings = pd.read_table(r'ml-1m\ratings.dat', header = None, sep ='::', names = rating_col_names)
ratings.head()
ratings.shape
ratings.info()
ratings.describe()

  
  
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
UserID        6040 non-null int64
Gender        6040 non-null object
Age           6040 non-null int64
Occupation    6040 non-null int64
Zip-code      6040 non-null object
dtypes: int64(3), object(2)
memory usage: 236.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
MovieID    3883 non-null int64
Title      3883 non-null object
Genres     3883 non-null object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
UserID       1000209 non-null int64
MovieID      1000209 non-null int64
Rating       1000209 non-null int64
Timestamp    1000209 non-null int64
dtypes: int64(4)
memory usage: 30.5 MB


Unnamed: 0,UserID,MovieID,Rating,Timestamp
count,1000209.0,1000209.0,1000209.0,1000209.0
mean,3024.512,1865.54,3.581564,972243700.0
std,1728.413,1096.041,1.117102,12152560.0
min,1.0,1.0,1.0,956703900.0
25%,1506.0,1030.0,3.0,965302600.0
50%,3070.0,1835.0,4.0,973018000.0
75%,4476.0,2770.0,4.0,975220900.0
max,6040.0,3952.0,5.0,1046455000.0


In [3]:
#### load and preprocess data ####

""" Step 1. Create m x u matrix with movies as row and users as column"""
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

R_df = ratings.pivot(index = 'UserID', columns ='MovieID', values = 'Rating').fillna(0)
R_df.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# convert data frame to a matrix

#R = R_df.as_matrix() 
R = np.array(R_df)
print(R)

[[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [3. 0. 0. ... 0. 0. 0.]]


In [5]:
""" Step 2. Normalize the matrix """

# normalize by each users mean
user_ratings_mean = np.mean(R, axis = 1)
pd.DataFrame(user_ratings_mean).head()

# normalize it
R_demeaned = R - user_ratings_mean.reshape(-1, 1)
pd.DataFrame(R_demeaned).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3696,3697,3698,3699,3700,3701,3702,3703,3704,3705
0,4.940097,-0.059903,-0.059903,-0.059903,-0.059903,-0.059903,-0.059903,-0.059903,-0.059903,-0.059903,...,-0.059903,-0.059903,-0.059903,-0.059903,-0.059903,-0.059903,-0.059903,-0.059903,-0.059903,-0.059903
1,-0.12925,-0.12925,-0.12925,-0.12925,-0.12925,-0.12925,-0.12925,-0.12925,-0.12925,-0.12925,...,-0.12925,-0.12925,-0.12925,-0.12925,-0.12925,-0.12925,-0.12925,-0.12925,-0.12925,-0.12925
2,-0.053697,-0.053697,-0.053697,-0.053697,-0.053697,-0.053697,-0.053697,-0.053697,-0.053697,-0.053697,...,-0.053697,-0.053697,-0.053697,-0.053697,-0.053697,-0.053697,-0.053697,-0.053697,-0.053697,-0.053697
3,-0.023745,-0.023745,-0.023745,-0.023745,-0.023745,-0.023745,-0.023745,-0.023745,-0.023745,-0.023745,...,-0.023745,-0.023745,-0.023745,-0.023745,-0.023745,-0.023745,-0.023745,-0.023745,-0.023745,-0.023745
4,-0.168106,-0.168106,-0.168106,-0.168106,-0.168106,1.831894,-0.168106,-0.168106,-0.168106,-0.168106,...,-0.168106,-0.168106,-0.168106,-0.168106,-0.168106,-0.168106,-0.168106,-0.168106,-0.168106,-0.168106


In [6]:
""" Step 3. Compute SVD to get U, S, and V. Use np.linalg.svd() """
from scipy.sparse.linalg import svds

# choose 50 of them so 
U, sigma, Vt = svds(R_demeaned, k = 50) 
U.shape
# convert sigma to diagonal matrix to leverage matrix multiplication to get predictions
sigma = np.diag(sigma)
sigma.shape

Vt.shape
V = Vt.T
V.shape

(3706, 50)

In [7]:
""" Step 4. From your V.T select 50 components """
print(Vt.shape, '\n\n', Vt)

(50, 3706) 

 [[-0.07028629  0.02415349 -0.01883837 ...  0.00380736 -0.00049127
   0.00061123]
 [ 0.03681506  0.00346263 -0.01264234 ... -0.00965995 -0.00513455
  -0.02377963]
 [ 0.03495646  0.00904907  0.00823098 ...  0.00157338 -0.00234513
   0.00802561]
 ...
 [-0.03287652  0.01185799 -0.01107445 ... -0.00114772 -0.00294575
  -0.02222119]
 [-0.01776333 -0.03068092 -0.01786526 ...  0.00087071  0.0012666
   0.00435186]
 [ 0.07625855  0.01650222  0.00468327 ... -0.00852744 -0.01020778
   0.00425656]]


In [8]:
# multiply U, sigma, Vt back to get 50 approximations of R
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1) # add user mean back for 5 stars
all_user_predicted_ratings.shape
pd.DataFrame(all_user_predicted_ratings).head()

# this is the prediction matrix from which top k movies can be found
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = R_df.columns)
preds_df.shape
preds_df.columns
# 6040 users, 3706 movies
preds_df.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.288861,0.143055,-0.19508,-0.018843,0.012232,-0.176604,-0.07412,0.141358,-0.059553,-0.19595,...,0.027807,0.00164,0.026395,-0.022024,-0.085415,0.403529,0.105579,0.031912,0.05045,0.08891
1,0.744716,0.169659,0.335418,0.000758,0.022475,1.35305,0.051426,0.071258,0.161601,1.567246,...,-0.056502,-0.013733,-0.01058,0.062576,-0.016248,0.15579,-0.418737,-0.101102,-0.054098,-0.140188
2,1.818824,0.456136,0.090978,-0.043037,-0.025694,-0.158617,-0.131778,0.098977,0.030551,0.73547,...,0.040481,-0.005301,0.012832,0.029349,0.020866,0.121532,0.076205,0.012345,0.015148,-0.109956
3,0.408057,-0.07296,0.039642,0.089363,0.04195,0.237753,-0.049426,0.009467,0.045469,-0.11137,...,0.008571,-0.005425,-0.0085,-0.003417,-0.083982,0.094512,0.057557,-0.02605,0.014841,-0.034224
4,1.574272,0.021239,-0.0513,0.246884,-0.032406,1.552281,-0.19963,-0.01492,-0.060498,0.450512,...,0.110151,0.04601,0.006934,-0.01594,-0.05008,-0.052539,0.507189,0.03383,0.125706,0.199244


In [9]:
""" Step 5. Implement a function that takes movieID as input and then implement cosine similarity
            along with sorting to recommend the top 10 movies. """
            
# Cosine similarity is a metric used to determine how similar the documents are irrespective of their size.

# use columns of preds_df as vectors to compute cosine similarity

def cos_sim(a, b):
	"""Takes 2 vectors a, b and returns the cosine similarity according 
	to the definition of the dot product
	"""
	dot_product = np.dot(a, b)
	norm_a = np.linalg.norm(a)
	norm_b = np.linalg.norm(b)
	return dot_product / (norm_a * norm_b)

def movie_cosine(movieID):
    """Takes one movie ID and returns the cosine similarity """
    n = preds_df.shape[1]
    similarity = []
    for i in range(n):
        # movie ID is 1 larger and index 
        cosim = cos_sim(preds_df.iloc[:, movieID-1], preds_df.iloc[:, i])
        similarity.append(cosim)
    return similarity

def find_top10(movieID):
    """returns the movie IDs and information of the those movies"""
    # find the indices of the cosine similarity from min to max
    alist = movie_cosine(movieID)
    ind = np.argsort(np.array(alist))
    # pick the first 11 of the reversed indices (from max to min)
    top10_ind = ind[::-1][0:11]
    #return the columns names(movie ID) that have the top 11 similarity by broadcasting
    top10 = preds_df.iloc[:, top10_ind].columns[1:]
    top10 = np.array(top10)
    # return the top 10 similar movieID and the fulll information of the movies
    return (top10, '\n', movies.iloc[(top10-1), ])

find_top10(2)

(array([3489,   60,  653,  317, 2162,  673, 2161, 1702, 2054, 1920],
       dtype=int64),
 '\n',
       MovieID                                 Title  \
 3488     3557                     Jennifer 8 (1992)   
 59         60    Indian in the Cupboard, The (1995)   
 652       658                Billy's Holiday (1995)   
 316       319                  Shallow Grave (1994)   
 2161     2230          Always Tell Your Wife (1923)   
 672       679        Run of the Country, The (1995)   
 2160     2229           Pleasure Garden, The (1925)   
 1701     1754                         Fallen (1998)   
 2053     2122           Children of the Corn (1984)   
 1919     1988  Hello Mary Lou: Prom Night II (1987)   
 
                             Genres  
 3488                      Thriller  
 59    Adventure|Children's|Fantasy  
 652                          Drama  
 316                       Thriller  
 2161                        Comedy  
 672                          Drama  
 2160              

In [10]:
find_top10(200)

(array([2295, 1794,   57,  538, 1816,  448,   45, 1875, 2611,  371],
       dtype=int64),
 '\n',
       MovieID                                  Title                 Genres
 2294     2363               Godzilla (Gojira) (1954)          Action|Sci-Fi
 1793     1862                      Species II (1998)          Horror|Sci-Fi
 56         57           Home for the Holidays (1995)                  Drama
 537       541                    Blade Runner (1982)       Film-Noir|Sci-Fi
 1815     1884  Fear and Loathing in Las Vegas (1998)           Comedy|Drama
 447       451                  Flesh and Bone (1993)  Drama|Mystery|Romance
 44         45                      To Die For (1995)           Comedy|Drama
 1874     1943     Greatest Show on Earth, The (1952)                  Drama
 2610     2679                   Finding North (1999)          Drama|Romance
 370       374                     Richie Rich (1994)      Children's|Comedy)

In [11]:
find_top10(3000)

(array([3203, 2707, 2688, 2433, 2841, 3005, 2598, 2676, 2702, 2605],
       dtype=int64),
 '\n',
       MovieID                                              Title  \
 3202     3271                             Of Mice and Men (1992)   
 2706     2775                                     Head On (1998)   
 2687     2756                       Wanted: Dead or Alive (1987)   
 2432     2501                                 October Sky (1999)   
 2840     2909        Five Wives, Three Secretaries and Me (1998)   
 3004     3073                              Sandpiper, The (1965)   
 2597     2666                      It Conquered the World (1956)   
 2675     2744                                      Otello (1986)   
 2701     2770                                   Bowfinger (1999)   
 2604     2673  Eternity and a Day (Mia eoniotita ke mia mera ...   
 
              Genres  
 3202          Drama  
 2706          Drama  
 2687         Action  
 2432          Drama  
 2840    Documentary  
 300