In [1]:
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
#load the rating and movie dataset from ml-100k foler, rename and withdraw unnecessary attributes
ratings_data = pd.read_csv('./ml-100k/u.data',names=['user_id','movie_id','rating','timestamp'],sep = '\t' )
movies = pd.read_csv('./ml-100k/u.item', header=None , sep='|', encoding='latin-1')
movies.columns = ['movie_id','movie_title','release_date','video_release_date','IMDb_URL','unknown','Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
ratings_data = ratings_data.drop(columns=['timestamp'])
movies = movies.drop(columns=['release_date', 'video_release_date','IMDb_URL','IMDb_URL','unknown','Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western'])


In [3]:
#View the movie dataset
movies.head()

Unnamed: 0,movie_id,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
#View the rating dataset
ratings_data.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [5]:
#Generate a rating matrix from rating dataset
ratings_matrix = ratings_data.pivot(index = 'user_id', columns ='movie_id', values = 'rating').fillna(0)
ratings_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
#normalize the rating values by each users mean
data_matrix = ratings_matrix.as_matrix()
user_ratings_mean = np.mean(data_matrix, axis = 1)
data_matrix_demeaned = data_matrix - user_ratings_mean.reshape(-1, 1)
data_matrix_demeaned

array([[ 4.41617122,  2.41617122,  3.41617122, ..., -0.58382878,
        -0.58382878, -0.58382878],
       [ 3.86325803, -0.13674197, -0.13674197, ..., -0.13674197,
        -0.13674197, -0.13674197],
       [-0.08977408, -0.08977408, -0.08977408, ..., -0.08977408,
        -0.08977408, -0.08977408],
       ..., 
       [ 4.9470868 , -0.0529132 , -0.0529132 , ..., -0.0529132 ,
        -0.0529132 , -0.0529132 ],
       [-0.20035672, -0.20035672, -0.20035672, ..., -0.20035672,
        -0.20035672, -0.20035672],
       [-0.34066587,  4.65933413, -0.34066587, ..., -0.34066587,
        -0.34066587, -0.34066587]])

In [7]:
#Setting Up SVD
#U is the left singular user features matrix, it shows how much users like each feature
#S(sigma) is the diagonal matrix of singular values, it should convert to the diagonal matrix form
#V is the right singluar vectos of movies, it represents how relevant each feature is to each movie
#k means keep how many top features that can be thought of the underlying tastes and preferences vectors.
U, S, Vt = svds(data_matrix_demeaned, k = 50)
S = np.diag(S)

In [8]:
#Add the user averages back to predict ratings
all_user_predicted_ratings = np.dot(np.dot(U, S), Vt) + user_ratings_mean.reshape(-1, 1)
all_user_predicted_ratings

array([[  6.48843593e+00,   2.95950335e+00,   1.63498717e+00, ...,
         -3.74342412e-02,  -2.55517107e-02,   2.35129410e-02],
       [  2.34726248e+00,   1.29689261e-01,  -9.89174851e-02, ...,
         -1.17604011e-02,   1.16394619e-02,  -4.69240025e-02],
       [  2.91904694e-01,  -2.63829679e-01,  -1.51454341e-01, ...,
          1.73223667e-02,  -6.64435083e-03,  -9.48036263e-03],
       ..., 
       [  3.11855778e+00,  -4.10623811e-02,   5.46046510e-01, ...,
          5.15663510e-03,   2.90204655e-03,  -6.07396441e-03],
       [  9.43730103e-01,   5.99492015e-01,   4.86033574e-01, ...,
          7.00340988e-03,   1.64608994e-02,   2.74129622e-04],
       [  1.35959040e+00,   2.85632882e+00,   1.77072345e+00, ...,
         -5.39286258e-02,  -2.42473462e-02,   1.07868786e-02]])

In [9]:
#Coverting the rating values range between 1 to 5 by normalising
scaler = MinMaxScaler(copy=True, feature_range=(1, 5))
scaler.fit(all_user_predicted_ratings)
new_all_user_predicted_ratings = scaler.transform(all_user_predicted_ratings)

In [10]:
#Build the predicted ratings matrix
preds_matrix = pd.DataFrame(new_all_user_predicted_ratings, columns = ratings_matrix.columns)
preds_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
0,4.692997,3.581333,3.239769,3.371439,3.356398,3.992623,3.453737,2.047046,2.735790,4.162110,...,1.944732,1.396039,1.568411,1.466496,1.958418,1.485159,1.542196,1.514341,1.765616,2.067904
1,2.823960,1.818578,2.080555,1.813658,2.374215,2.610483,1.848084,1.980903,2.781670,2.369695,...,1.898166,1.797624,1.812645,1.669921,1.716853,1.620348,1.738002,1.680544,1.980188,1.676431
2,1.896314,1.573446,2.045431,1.520022,2.278388,1.942175,1.720934,2.199057,1.910160,2.108376,...,1.707410,1.768728,2.318526,2.036009,1.752825,1.736509,1.995105,1.868815,1.874701,1.884535
3,1.929941,1.461503,2.174199,1.619230,2.305886,1.951509,1.888496,1.930499,1.845213,1.840817,...,1.991648,2.062482,2.033050,1.901950,2.072053,1.877549,2.020678,1.950779,2.003108,1.888800
4,3.688811,2.944468,2.181806,2.230040,2.697834,2.049101,2.561043,2.338894,2.074158,2.301432,...,1.991091,1.633591,1.926463,1.818689,1.638702,1.789900,1.821166,1.805896,1.794331,1.955733
5,3.637070,2.131655,2.038117,2.350196,2.295382,2.559442,2.316327,3.906206,3.351584,2.318392,...,1.828891,2.283943,1.965101,1.792639,2.270517,1.649011,1.725405,1.688097,1.640967,1.543473
6,2.126243,2.039233,1.603321,3.761257,3.045480,2.263276,4.399622,4.110468,4.377496,3.647968,...,2.258531,2.267108,2.092670,1.972520,2.101121,1.914081,1.957225,1.936155,2.367315,1.833944
7,2.129867,2.467429,2.243931,2.609067,2.193262,1.912126,2.405159,1.907060,1.910542,2.136744,...,1.767711,1.870988,2.160499,1.960612,2.023664,1.749909,1.830954,1.791375,2.014977,1.867612
8,2.005795,1.779618,2.270833,1.469552,2.366122,2.380027,2.108078,1.909432,2.337088,1.870260,...,1.924736,1.906256,2.124368,1.939171,1.853798,1.754882,1.823280,1.789877,1.933969,1.987738
9,1.806332,1.639804,2.070598,3.562655,2.871956,2.173000,3.351319,3.032743,3.584446,2.324373,...,1.628551,2.206217,2.803428,2.403153,2.385730,1.715218,1.767501,1.741968,1.920619,1.687925


In [11]:
#Create a function to recommend movie order by the highest predicted rating that user didn't rate before.
#Get and sort the user's predition, start from index 0
#Merge the result with movie dataset once finished match the user's ID
#Then recommend the moives which are predicted the highest rating and the user didn't rate before.

def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations):
    user_row_number = userID - 1 
    sorted_user_predictions = preds_matrix.iloc[user_row_number].sort_values(ascending=False)
    user_data = original_ratings_df[original_ratings_df.user_id == (userID)]
    user_full = (user_data.merge(movies_df, 
                                 how = 'left', 
                                 left_on = 'movie_id', 
                                 right_on = 'movie_id').
                 sort_values(['rating'], ascending=False))
    
    recommendations = (movies_df[~movies_df['movie_id'].isin(user_full['movie_id'])].
                       merge(pd.DataFrame(sorted_user_predictions).reset_index(), 
                       how = 'left',
                       left_on = 'movie_id',
                       right_on = 'movie_id').
                       rename(columns = {user_row_number: 'recommended_list'}).
                       sort_values('recommended_list', ascending = False).iloc[:num_recommendations, :-1])
    return user_full, recommendations

In [12]:
# Top 10 movies that are recomended for user 
#Modified the user id and number of movies that want to recommend for user in the following variables
insert_user_id = 1
number_of_recommended_movies = 10
already_rated, recommended_list = recommend_movies(preds_matrix, insert_user_id, movies, ratings_data, number_of_recommended_movies)
recommended_list

Unnamed: 0,movie_id,movie_title
978,1251,A Chef in Love (1996)
24,297,Ulee's Gold (1997)
690,963,Some Folks Call It a Sling Blade (1993)
943,1216,Kissed (1996)
440,713,Othello (1995)
459,732,Dave (1993)
1108,1381,Losing Chase (1996)
48,321,Mother (1996)
1282,1555,"Secret Adventures of Tom Thumb, The (1993)"
33,306,"Mrs. Brown (Her Majesty, Mrs. Brown) (1997)"


In [13]:
#Evaluation by surprise library
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy 
from surprise.model_selection import KFold
from surprise.model_selection import PredefinedKFold
from collections import defaultdict
import os

In [14]:
#Assign the path of the dataset
files_dir = os.path.expanduser('./ml-100k/')
reader = Reader('ml-100k')

In [15]:
#Load the test and train dataset that are predifined by some files,
#then merge as a dataset that is tuples containing file paths of testing and training
#Assign a cross-validation literator after merging previous dataset
#Apply SVD prediction algorithm
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1,2,3,4,5)]
data = Dataset.load_from_folds(folds_files, reader = Reader('ml-100k'))
pkf = PredefinedKFold()
algo = SVD()

In [16]:
#Create function for calculating precision and recall of top k
#First map the predictions to each user
#Then sort user ratings by estimated value
#Next find out the numer of relevant items and number of recommended items in top k
#Afterwards find out number of relevant and recommended items in top k
#Finally formulate the calculation of precision and recall of top k 
def precision_recall(predictions, k, threshold):
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

In [17]:
#Evaluate the SVD performance by using root mean squared error function 
for trainset, testset in pkf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.9499
RMSE: 0.9381
RMSE: 0.9332
RMSE: 0.9301
RMSE: 0.9356


In [18]:
#Print the SVD recall by using precision_recall function 
for trainset, testset in pkf.split(data):
    algo.fit(trainset)
    predictions = algo.fit(trainset).test(testset)
    precisions, recalls = precision_recall(predictions, k=50, threshold=3)
    print('Recall:',sum(rec for rec in recalls.values()) / len(recalls))

Recall: 0.814400019798
Recall: 0.84982473997
Recall: 0.85347946079
Recall: 0.853543827317
Recall: 0.857846489096
