In [2]:
from surprise import KNNWithMeans, KNNBasic, SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np

In [4]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [5]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [6]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'title': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [12]:
movies_with_ratings[movies_with_ratings.userId == 5.0].title.unique()

array(['Toy Story (1995)', 'Get Shorty (1995)', 'Babe (1995)',
       'Dead Man Walking (1995)', 'Clueless (1995)',
       'Usual Suspects, The (1995)', 'Postman, The (Postino, Il) (1994)',
       'Braveheart (1995)', 'Apollo 13 (1995)', 'Batman Forever (1995)',
       'Eat Drink Man Woman (Yin shi nan nu) (1994)',
       'Heavenly Creatures (1994)',
       'Interview with the Vampire: The Vampire Chronicles (1994)',
       'Little Women (1994)',
       'Like Water for Chocolate (Como agua para chocolate) (1992)',
       'Legends of the Fall (1994)', 'Once Were Warriors (1994)',
       'Pulp Fiction (1994)', 'Quiz Show (1994)', 'Stargate (1994)',
       'Shawshank Redemption, The (1994)',
       'Ace Ventura: Pet Detective (1994)',
       'Clear and Present Danger (1994)',
       'Four Weddings and a Funeral (1994)', 'Lion King, The (1994)',
       'Mask, The (1994)', 'True Lies (1994)',
       'Addams Family Values (1993)', 'Fugitive, The (1993)',
       'In the Line of Fire (1993)', 

In [7]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [8]:
trainset, testset = train_test_split(data, test_size=.15)

In [9]:
%%time

algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Wall time: 686 ms


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1f445c1fdf0>

In [10]:
test_pred = algo.test(testset)

In [11]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8988


0.8987606870561629

In [16]:
algo.predict(uid=2.0, iid='Dead Man Walking (1995)').est

3.8871297965337375

In [17]:
%%time

algo = SVD(n_factors=20, n_epochs=20)
algo.fit(trainset)

Wall time: 1.48 s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1f449dee940>

In [18]:
test_pred = algo.test(testset)

In [19]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8728


0.8728200755136463

In [20]:
algo.predict(uid=2.0, iid='Dead Man Walking (1995)').est

4.104707129065346

In [24]:
current_user_id=2.0
user_movies=movies_with_ratings[movies_with_ratings==current_user_id].title.unique()

scores = []
titles = []

for movie in movies_with_ratings.title.unique():
    if movie in user_movies:
        continue
    scores.append(algo.predict(uid=current_user_id, iid=movie).est)
    titles.append(movie)

In [26]:
sorted(scores)[-10:]

[4.473868683227807,
 4.478294566173173,
 4.4830395277994555,
 4.500935587845981,
 4.515014267977555,
 4.528357855784767,
 4.539791552853071,
 4.565438614975804,
 4.585010136114638,
 4.604039524774537]

In [27]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-','').split('|'))

In [28]:
movie_genres=[change_string(g) for g in movies.genres.values]

In [29]:
movie_genres[0]

'Adventure Animation Children Comedy Fantasy'

In [34]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

tfidf_transf = TfidfTransformer()
X_train_tfidf = tfidf_transf.fit_transform(X_train_counts)

neigh = NearestNeighbors(n_neighbors=20, n_jobs=-1, metric='euclidean')
neigh.fit(X_train_tfidf)

test = change_string("Adventure|Comedy|Fantasy|Crime")
predict = count_vect.transform([test])
X_tfidf2 = tfidf_transf.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [35]:
res

(array([[0.42079615, 0.53300564, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.6188388 , 0.62682864, 0.62682864]]),
 array([[6774, 9096, 3576,  863, 2302, 2608, 7865, 3582, 8361, 3302, 5737,
         6723, 5636, 3376, 7496, 5627, 9717, 2206, 6133, 5832]],
       dtype=int64))