In [1]:
import pandas as pd
import numpy as np

from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
movies = pd.read_csv('/Users/gost1/Desktop/Recommender systems/Downloads/movies.csv')
ratings = pd.read_csv('/Users/gost1/Desktop/Recommender systems/Downloads/ratings.csv')
links = pd.read_csv('/Users/gost1/Desktop/Recommender systems/Downloads/links.csv')
tags = pd.read_csv('/Users/gost1/Desktop/Recommender systems/Downloads/tags.csv')

In [3]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [4]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [6]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [7]:
data

<surprise.dataset.DatasetAutoFolds at 0x7fd6e866a9d0>

In [8]:
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [9]:
algo = SVD(n_factors=20, n_epochs=20)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd6e866ae20>

In [10]:
test_pred = algo.test(testset)

In [11]:
current_user_id = 2.0
user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()

scores = []
titles = []

for movie in movies_with_ratings.title.unique():
    if movie in user_movies:
        continue
        
    scores.append(algo.predict(uid=current_user_id, iid=movie).est)
    titles.append(movie)

In [12]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [13]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [22]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

neigh = NearestNeighbors(n_neighbors=50, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=50)

In [23]:
title_genres = {}

for index, row in tqdm_notebook(movies.iterrows()):
    title_genres[row.title] = row.genres

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for index, row in tqdm_notebook(movies.iterrows()):


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [24]:
def recommend_for_user(user_id):
    current_user_id = user_id
    user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()
    
    last_user_movie = user_movies[-1]
    
    movie_genres = title_genres[last_user_movie]
    
    movie_genres = change_string(movie_genres)

    predict = count_vect.transform([movie_genres])
    X_tfidf2 = tfidf_transformer.transform(predict)

    res = neigh.kneighbors(X_tfidf2, return_distance=True)
    
    movies_to_score = movies.iloc[res[1][0]].title.values

    scores = []
    titles = []

    for movie in movies_to_score:
        if movie in user_movies:
            continue

        scores.append(algo.predict(uid=current_user_id, iid=movie).est)
        titles.append(movie)
        
    
    best_indexes = np.argsort(scores)[-5:]
    for i in reversed(best_indexes):
        print(titles[i], scores[i])

In [26]:
recommend_for_user(10.0)

Ferris Bueller's Day Off (1986) 3.612753515774943
Trading Places (1983) 3.5884224175416883
Palm Beach Story, The (1942) 3.4930097961691295
Fans (1999) 3.4663628168728593
Calcium Kid, The (2004) 3.3883924659806284
