In [1]:
!pip install faiss-cpu fuzzywuzzy pandas scipy scikit-learn python-levenshtein umap-learn



You should consider upgrading via the 'C:\Users\yngve\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


# Setup

## Imports

In [2]:
import faiss  # Approximate nearest neighbour
import fuzzywuzzy.process  # Fuzzy search for movie titles
import numpy as np  # Linear algebra
import pandas as pd  # Loading data

from scipy.sparse import csr_matrix  # Sparse matrix
import scipy.sparse.linalg as spla  # Linear algebra with sparse matrices
from sklearn.decomposition import TruncatedSVD  # Fast matrix factorisation

## Download data

In [3]:
ratings = pd.read_csv('https://s3-us-west-2.amazonaws.com/recommender-tutorial/ratings.csv')
movies = pd.read_csv('https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv')

## Dummy enconding of the genres

In [4]:
all_genres = set(movies["genres"].str.split("|").sum())
for genre in all_genres:
    movies[genre] = movies["genres"].map(lambda x: genre in x).astype(float)

## Mapping from movie and user ID to row and column index

In [5]:
user_loc = {uid: i for i, uid in enumerate(sorted(ratings["userId"].unique()))}
movie_loc = {mid: i for i, mid in enumerate(sorted(ratings["movieId"].unique()))}

ratings["colId"] = ratings["userId"].map(user_loc)
ratings["rowId"] = ratings["movieId"].map(movie_loc)
movies["rowId"] = movies["movieId"].map(movie_loc)
movies = movies.dropna().sort_values("rowId")

## Look at data

In [6]:
movies.head()

Unnamed: 0,movieId,title,genres,Adventure,Crime,Documentary,Comedy,Film-Noir,Mystery,Horror,...,Animation,Musical,IMAX,(no genres listed),Western,War,Fantasy,Children,Action,rowId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,colId,rowId
0,1,1,4.0,964982703,0,0
1,1,3,4.0,964981247,0,2
2,1,6,4.0,964982224,0,5
3,1,47,5.0,964983815,0,43
4,1,50,5.0,964982931,0,46


## Utilities for mapping the matrix entries to movies

In [8]:
title_to_row = dict(zip(movies["title"], movies["rowId"]))
row_to_title = dict(zip(movies["rowId"], movies["title"]))

def movie_finder(title):
    closest_match = fuzzywuzzy.process.extractOne(title, title_to_row.keys())
    
    return closest_match[0], int(title_to_row[closest_match[0]])

## We are interested in movies similar to Pulp Fiction

In [9]:
# Look up recommendations for Pulp Fiction
movie, movie_index = movie_finder("pulp fiction")
movies.loc[movie_index]

movieId                                       296
title                         Pulp Fiction (1994)
genres                Comedy|Crime|Drama|Thriller
Adventure                                     0.0
Crime                                         1.0
Documentary                                   0.0
Comedy                                        1.0
Film-Noir                                     0.0
Mystery                                       0.0
Horror                                        0.0
Romance                                       0.0
Sci-Fi                                        0.0
Drama                                         1.0
Thriller                                      1.0
Animation                                     0.0
Musical                                       0.0
IMAX                                          0.0
(no genres listed)                            0.0
Western                                       0.0
War                                           0.0


# Content-based filtering with approximate nearest neighbour

In [10]:
# Create data matrix: Only dummy encoded genres
# The movie dataframe is in the correct order since we sorted it by row ID earlier
movie_features = movies[sorted(all_genres)]

# Normalise since then inner product is cosine similarity
content_matrix = movie_features.values / np.linalg.norm(movie_features.values, axis=1, keepdims=True)

# Create approximate nearest neighbour search index
index = faiss.IndexFlatIP(movie_features.shape[1])
index.add(content_matrix)

In [11]:
# Print 10 movies most similar to Pulp Fiction
k = 10
cosines, indices = index.search(content_matrix[[movie_index]], k)

for c, i in zip(cosines.squeeze(), indices.squeeze()):
    print(row_to_title[i], c)

Leaves of Grass (2009) 1.0
Informant!, The (2009) 1.0
In Bruges (2008) 1.0
Party Monster (2003) 1.0
Confessions of a Dangerous Mind (2002) 1.0
Beautiful Creatures (2000) 1.0
Man Bites Dog (C'est arrivé près de chez vous) (1992) 1.0
Freeway (1996) 1.0
Fargo (1996) 1.0
Pulp Fiction (1994) 1.0


# Collaborative filtering

## Construct sparse user-movie matrix

In [12]:
X = csr_matrix((ratings["rating"], [ratings["rowId"], ratings["colId"]]))

## Cosine similarity based on user-movie matrix

In [13]:
# Create normalised data matrix since then inner product is cosine similarity
# Scipy sparse linalg doesn't support keepdims, so we need these transposes to normalise the rows
X_normalised = (X.T / spla.norm(X.T, axis=0)).T

# Create search index
index = faiss.IndexFlatIP(X_normalised.shape[1])
index.add(X_normalised)

In [14]:
# Look up recommendations for Pulp Fiction
movie, movie_index = movie_finder("pulp fiction")
k = 10                          # we want 4 similar vectors
cosines, indices = index.search(X_normalised[[movie_index]], k)

for c, i in zip(cosines.squeeze(), indices.squeeze()):
    print(row_to_title[i], c)

Pulp Fiction (1994) 1.0000001
Silence of the Lambs, The (1991) 0.70938236
Shawshank Redemption, The (1994) 0.70236623
Seven (a.k.a. Se7en) (1995) 0.69765365
Forrest Gump (1994) 0.6855437
Usual Suspects, The (1995) 0.6726159
Braveheart (1995) 0.62762123
Fight Club (1999) 0.6232199
Fargo (1996) 0.61034864
Terminator 2: Judgment Day (1991) 0.6102841


## Cosine similarity based on matrix factorisation

In [15]:
# Run SVD, looking for 20 components
n_components = 20

svd = TruncatedSVD(n_components=n_components, n_iter=50)
Q = svd.fit_transform(X)
Q /= np.linalg.norm(Q, axis=0, keepdims=True)

# Create approximate nearest neighbour search index
index = faiss.IndexFlatIP(n_components)
index.add(Q)

In [16]:
k = 10
cosines, indices = index.search(Q[[movie_index]], k)

for c, i in zip(cosines.squeeze(), indices.squeeze()):
    print(row_to_title[i], c)

Pulp Fiction (1994) 0.06976162
Shawshank Redemption, The (1994) 0.06483554
Silence of the Lambs, The (1991) 0.05622734
Forrest Gump (1994) 0.05151555
Usual Suspects, The (1995) 0.043407068
Schindler's List (1993) 0.042082064
Seven (a.k.a. Se7en) (1995) 0.041054543
Fight Club (1999) 0.04093684
Braveheart (1995) 0.037527155
Dances with Wolves (1990) 0.030905858
