In [1]:
# Import
import collections
import numpy as np
import pandas as pd
import os
import sys
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from pathlib import Path
from scipy.spatial.distance import correlation

In [2]:
# Locate the CSV files inside the "dataset-csv" folder
data_path = os.path.join(sys.path[0], "dataset-csv")
movies_filename = os.path.join(data_path, 'movies.csv')
ratings_filename = os.path.join(data_path, 'ratings.csv')

# Columns to keep from each file, mapped to the dtype they are parsed as
film_dtypes = {'movieId': 'int32', 'title': 'str'}
rating_dtypes = {'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'}

# Movies table: one row per film (id + title)
film = pd.read_csv(movies_filename, usecols=list(film_dtypes), dtype=film_dtypes)

# Ratings table: one row per (user, film) rating
rating = pd.read_csv(ratings_filename, usecols=list(rating_dtypes), dtype=rating_dtypes)

In [3]:
# Preview the first rows of the movies table (movieId, title)
film.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
# Preview the first rows of the ratings table (userId, movieId, rating)
rating.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [17]:
# Create a nested dictionary {userId: {movieId: rating}} of everything each user rated.
# Columns are selected by name and iterated with itertuples so that every value keeps
# its own column dtype: going through rating.values would upcast the mixed int/float
# frame to float and turn the ids into float keys (1.0, 47.0, ...).
watched = collections.defaultdict(dict)
for user_id, movie_id, score in rating[['userId', 'movieId', 'rating']].itertuples(index=False, name=None):
    watched[user_id][movie_id] = score

# Create a pivot table with index as userId, columns as movieId, values as rating.
# This is the user-item matrix; films a user has not rated are filled with 0.
rating_pivot = rating.pivot(index='userId', columns='movieId',
                            values='rating').fillna(0)

# Convert the pivot table into a sparse matrix
rating_matrix = csr_matrix(rating_pivot.values)

# Initialise the nearest-neighbours model: cosine distance between users' rating
# vectors, searched by brute force
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(rating_matrix)

# Initialise k, the number of neighbours to look up
k = 25

In [22]:
def knn_predict(user_index, n_neighbors=None):
    """Print a user's nearest neighbours and the ten best film recommendations.

    The user's row of ``rating_pivot`` is used to query the fitted ``knn`` model.
    Every film the user has not rated, but that at least two of the neighbours
    have rated, gets a predicted rating: the average of the neighbours' ratings
    weighted by similarity (``1 - distance``).

    Parameters
    ----------
    user_index : int
        Row position of the user in ``rating_pivot`` (a position, not a userId).
    n_neighbors : int, optional
        Number of hits requested from ``knn.kneighbors``. The user is normally
        returned as their own closest hit, so this count includes the user.
        Defaults to the notebook-level ``k``.

    Returns
    -------
    None
        All results are printed.
    """
    if n_neighbors is None:
        n_neighbors = k

    user_id = rating_pivot.index[user_index]

    # Find nearest neighbours; flatten the (1, n) results once instead of per use
    query = rating_pivot.iloc[user_index, :].values.reshape(1, -1)
    distances, indices = knn.kneighbors(query, n_neighbors=n_neighbors)
    distances = distances.flatten()
    indices = indices.flatten()

    # Films the user has watched (.get avoids inserting empty entries into a defaultdict)
    user_watched = set(watched.get(user_id, {}))

    # neighbour userId -> {movieId: (similarity, rating)}
    neighbours_watched = {}

    # Print neighbours and their distance from the user
    print('Closest users to user {}:\n'.format(user_id))
    rank = 0
    for distance, position in zip(distances, indices):
        neighbour_id = rating_pivot.index[position]
        # Skip the user themselves wherever they appear: with ties at distance 0
        # they are not guaranteed to be the first hit.
        if neighbour_id == user_id:
            continue
        rank += 1
        print('{0}: {1} - distance: {2}'.format(rank, neighbour_id, distance))

        # Save information in order to calculate predicted rating
        similarity = float(1 - distance)
        neighbours_watched[neighbour_id] = {
            movie_id: (similarity, score)
            for movie_id, score in watched.get(neighbour_id, {}).items()
        }
    print('----\n')

    # Count, for every film the user has not watched, how many neighbours rated it
    unwatched_counts = collections.Counter(
        movie_id
        for neighbour_films in neighbours_watched.values()
        for movie_id in neighbour_films
        if movie_id not in user_watched
    )

    # Find unwatched films that are common among neighbours
    common_unwatched = [movie_id for movie_id, count in unwatched_counts.items() if count > 1]

    # Predict rating the user would give for the unwatched films
    common_unwatched_rating = []
    for movie_id in common_unwatched:
        votes = [
            neighbour_films[movie_id]
            for neighbour_films in neighbours_watched.values()
            if movie_id in neighbour_films
        ]
        total_weight = sum(similarity for similarity, _ in votes)
        if total_weight <= 0:
            # Only neighbours with zero similarity rated this film: no usable signal,
            # and dividing would produce NaN.
            continue
        weighted_sum = sum(similarity * score for similarity, score in votes)
        common_unwatched_rating.append((weighted_sum / total_weight, movie_id))
    # Best predicted rating first; ties fall back to the higher movieId
    common_unwatched_rating.sort(reverse=True)

    print('10 best recommendations based on what similar users liked:\n')
    for predicted, movie_id in common_unwatched_rating[:10]:
        titles = film.loc[film['movieId'] == movie_id, 'title']
        # A rated film may be missing from the movies table
        title = titles.values[0] if len(titles) else 'Unknown title'
        # int() keeps the id readable even if the watched keys are floats
        print('{0} - {1} - {2:.2f}'.format(int(movie_id), title, predicted))
    print('-----\n')

In [23]:
# Recommend films for the user at row position 2 of rating_pivot (userId 3 in the output below)
knn_predict(2)

Closest users to user 3:

1: 313 - distance: 0.9218126535415649
2: 377 - distance: 0.9283124208450317
3: 532 - distance: 0.9290626049041748
4: 527 - distance: 0.929252028465271
5: 312 - distance: 0.932076632976532
6: 555 - distance: 0.9349051713943481
7: 271 - distance: 0.9356540441513062
8: 561 - distance: 0.9367749691009521
9: 138 - distance: 0.937568187713623
10: 1 - distance: 0.9402797222137451
11: 368 - distance: 0.9428524374961853
12: 469 - distance: 0.9432327151298523
13: 202 - distance: 0.9457123279571533
14: 277 - distance: 0.9479896426200867
15: 160 - distance: 0.9492079019546509
16: 294 - distance: 0.950981080532074
17: 562 - distance: 0.9526667594909668
18: 320 - distance: 0.9533633589744568
19: 186 - distance: 0.9546496868133545
20: 51 - distance: 0.9557033181190491
21: 518 - distance: 0.9560555815696716
22: 217 - distance: 0.956637442111969
23: 514 - distance: 0.957244873046875
24: 303 - distance: 0.9573529362678528
----

10 best recommendations based on what similar user