In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
joined_ratings = ratings.join(movies.set_index('movieId'), on='movieId')

In [4]:
joined_ratings['genres_split'] = joined_ratings.genres.str.split('|')
joined_ratings['genres_space'] = joined_ratings.apply(lambda r: ' '.join(r['genres_split']), axis=1)
joined_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,genres_split,genres_space
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]",Adventure Animation Children Comedy Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]",Comedy Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,"[Action, Crime, Thriller]",Action Crime Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,"[Mystery, Thriller]",Mystery Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,"[Crime, Mystery, Thriller]",Crime Mystery Thriller


In [5]:
title_num_ratings = {}

for movieId, group in tqdm_notebook(joined_ratings.groupby('movieId')):
    title_num_ratings[movieId] = group.userId.unique().shape[0]

HBox(children=(IntProgress(value=0, max=9724), HTML(value='')))




In [17]:
min_num_ratings = np.min([title_num_ratings[f] for f in title_num_ratings.keys()])
max_num_ratings = np.max([title_num_ratings[f] for f in title_num_ratings.keys()])
mean_num_ratings = np.mean([title_num_ratings[f] for f in title_num_ratings.keys()])
median_num_ratings = np.median([title_num_ratings[f] for f in title_num_ratings.keys()])

In [18]:
title_mean_rating = {}

for movieId, group in tqdm_notebook(joined_ratings.groupby('movieId')):
    title_mean_rating[movieId] = group.rating.mean()

HBox(children=(IntProgress(value=0, max=9724), HTML(value='')))




In [19]:
film_with_our_mark = []

# посчитаем нашу метрику для каждого фильма из датасета
for f in title_num_ratings.keys():
    film_with_our_mark.append(
        (f, title_mean_rating[f] * (title_num_ratings[f] - mean_num_ratings) / (max_num_ratings - min_num_ratings))
    )

In [20]:
best_films_ID = list(sorted(film_with_our_mark, key=lambda x: x[1], reverse=True))[:100]
best_films_ID = [c[0] for c in best_films_ID]

In [21]:
import surprise as s

In [22]:
df_for_surprise = ratings.drop('timestamp', axis=1)

In [23]:
df_for_surprise.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [24]:
reader = s.reader.Reader(rating_scale=(0.5, 5))

In [25]:
dataset = s.dataset.Dataset.load_from_df(df_for_surprise, reader)

In [26]:
dataset, _ = s.model_selection.train_test_split(dataset, test_size=0.01)

In [27]:
algorithm = s.SVD()

In [28]:
algorithm.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x220e913de48>

In [29]:
df_best_films = pd.DataFrame(best_films_ID, columns = ['movieId'])

In [30]:
def recomend(user_id, df_best_films):
    
    df_best_films['Score'] = df_best_films.apply(lambda r: algorithm.predict(user_id, r['movieId']).est, axis=1)
    df_best_films = df_best_films.sort_values('Score', ascending=False)
    df_best_films = df_best_films.merge(movies, on='movieId')[['movieId', 'title', 'Score']]
   
    print("Recomendations")
    print(df_best_films.head(10))

In [31]:
recomend(100, df_best_films)

Recomendations
   movieId                                              title     Score
0      318                   Shawshank Redemption, The (1994)  4.581470
1      457                               Fugitive, The (1993)  4.558140
2    58559                            Dark Knight, The (2008)  4.550598
3     1221                     Godfather: Part II, The (1974)  4.540874
4     1193             One Flew Over the Cuckoo's Nest (1975)  4.494912
5     1704                           Good Will Hunting (1997)  4.491334
6     2959                                  Fight Club (1999)  4.488257
7     4993  Lord of the Rings: The Fellowship of the Ring,...  4.479315
8     1258                                Shining, The (1980)  4.437012
9     3578                                   Gladiator (2000)  4.434869
