In [56]:
# Import
import collections
import numpy as np
import pandas as pd
import os
import sys
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from pathlib import Path

In [57]:
# Configure file path
data_path = os.path.join(sys.path[0], "dataset-csv")
movies_filename = os.path.join(data_path, 'movies.csv')
ratings_filename = os.path.join(data_path, 'ratings.csv')

# Load relevant movies file into dataFrame
film = pd.read_csv(
    movies_filename,
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'})

# Load relevant ratings file into dataFrame
rating = pd.read_csv(
    ratings_filename,
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [58]:
# See movie data example
film.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [59]:
# See ratings data example
rating.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [71]:
# Created a nested dictionary of each user with the movieId and the ratings they provided
watched = collections.defaultdict(dict)
for i in rating.values.tolist():
    watched[i[0]][i[1]] = i[2]

# Create a pivot table with index as userId, columns as movieId, values as rating
# This is user-item matrix btw
rating_pivot = rating.pivot(index='userId', columns = 'movieId',\
                    values='rating').fillna(0)

# Convert the pivot table into a sparse matrix
rating_matrix = csr_matrix(rating_pivot.values)

# Initialise k nearest neighbours model
knn = NearestNeighbors(metric = 'minkowski', algorithm = 'brute')
knn.fit(rating_matrix)

# Initialise k
k = 25

In [72]:
def knn_predict(user_index,k):
    # Find nearest neighbours
    distances, indices = knn.kneighbors(rating_pivot.iloc[user_index, :].values.reshape(1, -1), n_neighbors = k)
    # Films the user has watched
    user_watched = set(watched[rating_pivot.index[user_index]])

    neighbours_watched = {}

    # Print neighbours and their distance from the user
    for i in range(0, len(distances.flatten())):
        if i == 0:
            print('Closest users to user {}:\n'.format(rating_pivot.index[user_index]-1))

        else:
            print('{0}: {1} - distance: {2}'.format(i, rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

        neighbours_watched[rating_pivot.index[indices.flatten()[i]]] = watched[rating_pivot.index[indices.flatten()[i]]].copy()

        # Save information in order to calculate predicted rating
        for key, v in neighbours_watched[rating_pivot.index[indices.flatten()[i]]].items():
            neighbours_watched[rating_pivot.index[indices.flatten()[i]]][key] = [1 - distances.flatten()[i], v]
    print('----\n')
    
    # Get unwatched movies list
    unwatched_films = []
    for movies in neighbours_watched:
        unwatched_films_list = neighbours_watched[movies].keys() - user_watched.intersection(neighbours_watched[movies].keys())
        for unwatched_movies in unwatched_films_list:
            unwatched_films.append(unwatched_movies)

    # Find unwatched films that are common among neighbours
    common_unwatched = [item for item, count in collections.Counter(unwatched_films).items() if count > 1]

    # Predict rating the user would give for the unwatched films
    common_unwatched_rating = []
    for movie in common_unwatched:
        m = []
        w = []
        for neighbours_movie in neighbours_watched:
            if neighbours_watched[neighbours_movie].get(movie) is not None:
                m.append(neighbours_watched[neighbours_movie].get(movie)[0]*neighbours_watched[neighbours_movie].get(movie)[1])
                w.append(neighbours_watched[neighbours_movie].get(movie)[0])

        common_unwatched_rating.append([np.sum(m)/np.sum(w), movie])
    common_unwatched_rating = sorted(common_unwatched_rating, reverse=True)

    print('20 best recommendations based on what similar users liked:\n')
    for item in common_unwatched_rating[:20]:
        print('{0} - {1} - {2:.2f}'.format(item[1], film.loc[film['movieId'] == item[1]]['title'].values[0], item[0]))
    print('-----\n')

In [73]:
knn_predict(0,k)

Closest users to user 0:

1: 493 - distance: 66.07571411132812
2: 39 - distance: 66.61080932617188
3: 494 - distance: 66.90291595458984
4: 208 - distance: 66.91038513183594
5: 180 - distance: 66.96827697753906
6: 164 - distance: 66.99253845214844
7: 96 - distance: 67.00746154785156
8: 394 - distance: 67.02984619140625
9: 399 - distance: 67.1677017211914
10: 133 - distance: 67.18630981445312
11: 231 - distance: 67.19561004638672
12: 293 - distance: 67.26068878173828
13: 335 - distance: 67.26811981201172
14: 403 - distance: 67.37580871582031
15: 433 - distance: 67.47962951660156
16: 207 - distance: 67.4888916015625
17: 508 - distance: 67.51481628417969
18: 72 - distance: 67.54998016357422
19: 431 - distance: 67.56663513183594
20: 37 - distance: 67.58698272705078
21: 54 - distance: 67.59437561035156
22: 532 - distance: 67.59437561035156
23: 172 - distance: 67.60362243652344
24: 507 - distance: 67.61656951904297
----

20 best recommendations based on what similar users liked:

2194.0 - Unt