# Load libraries

In [None]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV
from surprise import KNNBaseline
from surprise import BaselineOnly
from surprise import Reader
from surprise import get_dataset_dir
import os
from surprise.model_selection import cross_validate
from collections import defaultdict
import pandas as pd
import io
import plotly.graph_objects as go

# 1. We will be using an algorithm that won the Netflix prize years back and leverages matrix factorization to recommend movies 

To determine the best parameters for our model, we will begin by performing a grid search that iterates over the candidate parameters and cross-validates a model for each combination to find the best one.

Note 1: Due to randomness of the algorithm initialization, predictions vary with each run. 
Note 2: This model architecture is highly biased towards popular and highly rated movies which may make intuitive sense, but isn't likely to show people movies they haven't already considered watching.

In [None]:
# Grid-search the SVD hyperparameters on the built-in MovieLens 100k ratings.
data = Dataset.load_builtin("ml-100k")

# Candidate values handed to the grid search (3-fold cross-validation below).
param_grid = dict(
    n_epochs=[20, 30, 40],
    reg_all=[0.02, 0.04, 0.08],
    lr_all=[0.002, 0.005],
)
gs = GridSearchCV(SVD, param_grid, cv=3, measures=["rmse", "mae"])
gs.fit(data)

# Report the best RMSE first, then the parameter combination that achieved it.
for best in (gs.best_score, gs.best_params):
    print(best["rmse"])

# Recorded output from one run. This combination scored best, but the author
# preferred the recommendations produced by the default parameters.
# 0.9249955773675692
# {'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.08}

Parameter tuning is very important. An initial model I trained with poorly tuned parameters ended up recommending the Wallace and Gromit film "A Close Shave" to most users https://www.imdb.com/title/tt0112691/

Be careful with grid search however; depending on the combination of training dataset, algorithm, and most importantly the parameters you give it, grid search may optimize the objective and still produce some very odd and unhelpful results. If you give it a set of bad parameters, it can only select the best combination of a bad set of options. Always review the output and ask yourself, do these answers make sense? 

For instance, with this dataset, you can train a model with a low RMSE that still recommends the same small set of movies to most people; in extreme cases, the same movie is recommended to nearly everyone. This happened with the little-known film Pather Panchali (1955), whose high average rating indicates it may be a very niche film and probably not a great recommendation for most people.

# 2. Now we will load our dataset of movie ratings by users, train a model, and output predictions

In [None]:
def get_top_n(predictions, n=10):
    '''Group predictions by user and keep each user's n best estimates.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm. Each entry unpacks
            into five fields: user id, item id, true rating, estimate and
            details.
        n(int): The number of recommendations to keep for each user. Default
            is 10.

    Returns:
    A defaultdict where keys are user (raw) ids and values are lists of
    tuples: [(raw item id, rating estimation), ...] with at most n entries,
    ordered from the highest estimate to the lowest.
    '''

    # Collect every (item, estimate) pair under the user it was predicted for.
    grouped = defaultdict(list)
    for user_id, item_id, _actual, estimate, _details in predictions:
        grouped[user_id].append((item_id, estimate))

    # Rank each user's pairs by estimate and truncate to the n best. Only
    # existing keys are reassigned, so the dict does not change size while
    # it is being iterated.
    for user_id in grouped:
        ranked = sorted(
            grouped[user_id], key=lambda pair: pair[1], reverse=True
        )
        grouped[user_id] = ranked[:n]

    return grouped


# Fit an SVD model with default hyperparameters on all MovieLens 100k ratings.
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
model = SVD()
model.fit(trainset)

# Score every (user, item) pair that does NOT appear in the training set.
testset = trainset.build_anti_testset()
predictions = model.test(testset)

# Keep each user's 15 highest estimates.
top_n = get_top_n(predictions, n=15)

# One line per user: the user id followed by the recommended raw item ids.
for uid, user_ratings in top_n.items():
    recommended_ids = [pair[0] for pair in user_ratings]
    print(uid, recommended_ids)

# 3. Let's convert movie IDs to actual titles so we know what we are recommending

In [None]:
def read_item_names(file_name=None):
    """Read a MovieLens ``u.item`` file and build id <-> title lookup tables.

    Args:
        file_name(str): Path to a pipe-separated (``|``) item file whose
            first two fields are the raw item id and the movie title.
            Defaults to the ml-100k ``u.item`` file inside the Surprise
            dataset directory.

    Returns:
        A tuple ``(rid_to_name, name_to_rid)`` of dicts mapping raw item id
        to movie title and movie title to raw item id. Ids are kept as the
        strings read from the file. If one title appears under several ids,
        ``name_to_rid`` keeps the last id read.
    """

    if file_name is None:
        # Join the components instead of concatenating strings so the path
        # uses the platform's separator.
        file_name = os.path.join(
            get_dataset_dir(), 'ml-100k', 'ml-100k', 'u.item'
        )

    rid_to_name = {}
    name_to_rid = {}
    with open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            fields = line.rstrip('\n').split('|')
            if len(fields) < 2:
                # Blank or truncated line: there is no title to index, and
                # indexing fields[1] would raise IndexError.
                continue
            raw_id, title = fields[0], fields[1]
            rid_to_name[raw_id] = title
            name_to_rid[title] = raw_id

    return rid_to_name, name_to_rid

# 4. Let's define a new function to print a UID mapped to movie titles

In [None]:
# Build the raw id <-> title lookup tables once at module level;
# get_top_n_titles reads rid_to_name from this scope.
rid_to_name, name_to_rid = read_item_names()
def get_top_n_titles(predictions, n=10):
    '''Return the top-N recommended movie titles for each user.

    Grouping and ranking are delegated to get_top_n so the two functions
    share one implementation; only the raw item ids that survive the top-N
    cut are translated to titles through the module-level rid_to_name dict.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendations to output for each user.
            Default is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(movie title, rating estimation), ...] with at most n entries,
        ordered from the highest estimate to the lowest.

    Raises:
        KeyError: If a top-N raw item id has no entry in rid_to_name.
    '''

    titled = defaultdict(list)
    for uid, ranked in get_top_n(predictions, n).items():
        # Swap each raw item id for its title, keeping the ranking order.
        titled[uid] = [(rid_to_name[iid], est) for iid, est in ranked]

    return titled

In [None]:
# Rank the predictions again, this time keeping movie titles instead of ids.
top_titles = get_top_n_titles(predictions, n=15)

# One line per user: the user id followed by the recommended titles.
for uid, user_ratings in top_titles.items():
    titles = [title for (title, _) in user_ratings]
    print(uid, titles)

In [None]:
# Flatten top_titles into two parallel columns: one row per user, with that
# user's recommended titles joined into a single comma-separated string.
user = list(top_titles)
filmnames = [
    ", ".join(title for (title, _) in user_ratings)
    for user_ratings in top_titles.values()
]
d = {
    'User': user,
    'Recommended Films': filmnames,
}
filmDF = pd.DataFrame(d)

In [None]:
# Bare expression: shows the filmDF table as the cell's output in a notebook.
filmDF

# KNN approach allows for finding similar movies

In [None]:
# Train the algorithm so that it computes the similarities between items
# (user_based=False selects item-item similarities).
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = dict(name='pearson_baseline', user_based=False)
knn = KNNBaseline(sim_options=sim_options)
knn.fit(trainset)

# Load the raw id <-> movie title lookup tables.
rid_to_name, name_to_rid = read_item_names()

def name_to_inner_id(filmname):
    """Return the inner item id of the movie titled ``filmname``.

    The title is first mapped to its raw id through name_to_rid, then the
    trainset of the fitted knn model converts that raw id to an inner id.
    """
    return knn.trainset.to_inner_iid(name_to_rid[filmname])

def raw_to_inner_id(raw_id):
    """Return the inner item id that the knn trainset assigns to ``raw_id``."""
    fitted_trainset = knn.trainset
    return fitted_trainset.to_inner_iid(raw_id)

def get_neighbors(inner_id, n):
    """Return the inner ids of the ``n`` nearest neighbors of ``inner_id``.

    Thin wrapper around knn.get_neighbors with ``k=n``.
    """
    nearest = knn.get_neighbors(inner_id, k=n)
    return nearest

def get_neighbor_names(neighbors):
    """Lazily translate inner item ids into movie titles.

    Each inner id is converted to a raw id by the knn trainset and then to
    a title through rid_to_name. The result is a generator, so it can be
    iterated only once.
    """
    return (
        rid_to_name[knn.trainset.to_raw_iid(inner_id)]
        for inner_id in neighbors
    )

def similar_movies(filmID, n):
    """Return the titles of the ``n`` nearest neighbors of raw id ``filmID``.

    Pipeline: raw id -> inner id -> neighbor inner ids -> neighbor titles.
    """
    return get_neighbor_names(get_neighbors(raw_to_inner_id(filmID), n))

def similar_movies_by_name(film, n):
    """Return the titles of the ``n`` nearest neighbors of the movie ``film``.

    The title is resolved to its raw id through name_to_rid; the rest of
    the lookup is the same as similar_movies.
    """
    return similar_movies(name_to_rid[film], n)

# Example: list the ten movies the KNN model considers closest to one title.
neighbor_names = similar_movies_by_name('Dead Poets Society (1989)', 10)

print('The 10 nearest neighbors of Dead Poets Society (1989) are:')
for title in neighbor_names:
    print(title)

Make a dataframe of users to top films suggested by KNN

In [None]:
# Score every (user, item) pair missing from the KNN trainset with the
# fitted KNN model. The earlier version of this cell iterated over `top_n`,
# which holds the SVD recommendations, so the "KNN" table only repeated the
# SVD results.
# NOTE: this scores as many pairs as the SVD step did and may take a while.
knn_predictions = knn.test(knn.trainset.build_anti_testset())
knn_top_n = get_top_n(knn_predictions, n=15)

# One row per user, with that user's recommended titles joined into a single
# comma-separated string. get_top_n yields raw item ids, so they are mapped
# to titles through rid_to_name here.
user = list(knn_top_n)
filmnames = [
    ", ".join(rid_to_name[iid] for (iid, _) in user_ratings)
    for user_ratings in knn_top_n.values()
]
d = {
    'User': user,
    'Recommended Films': filmnames,
}
knnDF = pd.DataFrame(d)

In [None]:
# Bare expression: shows the knnDF table as the cell's output in a notebook.
knnDF

# Utilities: starter code to work with other datasets

You can edit the code below to load a custom dataset from a directory

In [None]:
# A custom dataset needs a reader that describes the file layout. In the
# movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '\t' characters.
reader = Reader(sep='\t', line_format='user item rating timestamp')

# Path to the dataset file ('~' is expanded to the user's home directory).
file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')

data = Dataset.load_from_file(file_path, reader=reader)

The code below trains a model on a massive dataset of 25 million ratings (requires this dataset: https://grouplens.org/datasets/movielens/25m/)

In [None]:
# MovieLens 25M ratings.csv has the header: userId,movieId,rating,timestamp
df = pd.read_csv('./ml-25m/ratings.csv')

# A reader is still needed, but only the rating_scale param is required.
# MovieLens 25M ratings run from 0.5 to 5.0 stars in half-star steps, so the
# lower bound must be 0.5 rather than 1.
reader = Reader(rating_scale=(0.5, 5))
# The columns must correspond to user id, item id and ratings (in that order).
# The names below are the ones in the file's header; the previous
# 'user id' / 'item id' / 'ratings' labels do not exist there and raised
# KeyError.
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()
model25m = SVD()
model25m.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
# NOTE(review): on the 25M dataset (roughly 162k users x 59k movies) the
# anti-testset holds billions of pairs and is unlikely to fit in memory;
# consider predicting only for the users of interest.
testset = trainset.build_anti_testset()
predictions = model25m.test(testset)

# We can explore our data by plotting the movies by average rating and number of ratings to find outliers and discover why our model may be more likely to recommend these movies

In [None]:
# Load the per-movie summary table used by the scatter plot in the next cell,
# which reads the 'avg rating', 'n ratings' and 'title' columns.
# sheet_name=1 selects the second sheet (pandas sheet positions are 0-based).
data = pd.read_excel("movielens data.xlsx", sheet_name=1)

In [None]:
# One marker per movie: average rating on x, number of ratings on y, with the
# movie title attached as the marker's text.
scatter = go.Scatter(
    x=data['avg rating'],
    y=data['n ratings'],
    text=data['title'],
    mode='markers',
)
fig = go.Figure(data=scatter)

# Axis captions are drawn as annotations positioned in "paper" coordinates
# (relative to the plotting area) instead of as axis titles.
x_caption = {
    "x": 0.5,
    "y": -0.15,
    "showarrow": False,
    "text": "Average Rating",
    "xref": "paper",
    "yref": "paper",
}
y_caption = {
    "x": -0.07,
    "y": 0.5,
    "showarrow": False,
    "text": "Count of Ratings",
    "textangle": -90,
    "xref": "paper",
    "yref": "paper",
}
fig.update_layout(
    title='Movie Scatterplot',
    annotations=[x_caption, y_caption],
)
fig.show()