In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
#links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
movies.shape, tags.shape, ratings.shape

((9742, 3), (3683, 4), (100836, 4))

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movies.movieId.max()

193609

In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Самые активные пользователи

In [7]:
ratings.userId.value_counts().head(3)

414    2698
599    2478
474    2108
Name: userId, dtype: int64

Едем дальше

In [8]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [9]:
movies_with_tags = pd.merge(movies, tags, on='movieId', how='left')
movies_with_tags.movieId.unique().shape, movies_with_tags.shape, tags.shape

((9742,), (11853, 6), (3683, 4))

In [10]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


### Собираем и объединяем теги для каждого фильма от разных пользователей

In [11]:
tag_strings = []
movies_list = []

for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies_list.append(movie)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movie, group in tqdm(movies_with_tags.groupby('title')):


HBox(children=(FloatProgress(value=0.0, max=9737.0), HTML(value='')))




In [12]:
movies_list[:10]

["'71 (2014)",
 "'Hellboy': The Seeds of Creation (2004)",
 "'Round Midnight (1986)",
 "'Salem's Lot (2004)",
 "'Til There Was You (1997)",
 "'Tis the Season for Love (2015)",
 "'burbs, The (1989)",
 "'night Mother (1986)",
 '(500) Days of Summer (2009)',
 '*batteries not included (1987)']

In [13]:
tag_strings[:10]

['nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'artistic Funny humorous inspiring intelligent quirky romance ZooeyDeschanel',
 'nan']

In [14]:
len(movies_list), len(tag_strings)

(9737, 9737)

### Уберём nan

In [15]:
tag_strings = [str(s).replace('nan', '') for s in tag_strings]

In [16]:
df = pd.DataFrame(movies_list, columns = ['title'])
df['tag'] = tag_strings
df.head(10)

Unnamed: 0,title,tag
0,'71 (2014),
1,'Hellboy': The Seeds of Creation (2004),
2,'Round Midnight (1986),
3,'Salem's Lot (2004),
4,'Til There Was You (1997),
5,'Tis the Season for Love (2015),
6,"'burbs, The (1989)",
7,'night Mother (1986),
8,(500) Days of Summer (2009),artistic Funny humorous inspiring intelligent ...
9,*batteries not included (1987),


### Объедним с предыдущим dataframe'ом и удалим лишнее

In [17]:
movies_with_tags = pd.DataFrame.merge(movies_with_tags, df, on='title')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag_x,timestamp,tag_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0,pixar pixar fun
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0,pixar pixar fun
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0,pixar pixar fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0,fantasy magicboardgame RobinWilliams game
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0,fantasy magicboardgame RobinWilliams game


In [18]:
def change_string(s):
    return '|' + s.replace(' ', '|')

In [19]:
movies_with_tags['tag_y'] = movies_with_tags['tag_y'].apply(change_string) 
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag_x,timestamp,tag_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0,|pixar|pixar|fun
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0,|pixar|pixar|fun
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0,|pixar|pixar|fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0,|fantasy|magicboardgame|RobinWilliams|game
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0,|fantasy|magicboardgame|RobinWilliams|game


In [20]:
movies_with_tags['tags_genres'] = movies_with_tags['genres'] + movies_with_tags['tag_y']
movies_with_tags.drop(['genres', 'tag_x', 'tag_y'], axis=1, inplace=True)
movies_with_tags.head()

Unnamed: 0,movieId,title,userId,timestamp,tags_genres
0,1,Toy Story (1995),336.0,1139046000.0,Adventure|Animation|Children|Comedy|Fantasy|pi...
1,1,Toy Story (1995),474.0,1137207000.0,Adventure|Animation|Children|Comedy|Fantasy|pi...
2,1,Toy Story (1995),567.0,1525286000.0,Adventure|Animation|Children|Comedy|Fantasy|pi...
3,2,Jumanji (1995),62.0,1528844000.0,Adventure|Children|Fantasy|fantasy|magicboardg...
4,2,Jumanji (1995),62.0,1528844000.0,Adventure|Children|Fantasy|fantasy|magicboardg...


In [21]:
movies_with_tags[movies_with_tags.title == '(500) Days of Summer (2009)']

Unnamed: 0,movieId,title,userId,timestamp,tags_genres
8725,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8726,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8727,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8728,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8729,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8730,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8731,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8732,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...


In [22]:
def change_string(s):
    return s.replace('|', ' ')

In [23]:
movies_with_tags['tags_genres'] = movies_with_tags['tags_genres'].apply(change_string) 

In [24]:
movies_with_tags[movies_with_tags.title == '(500) Days of Summer (2009)']

Unnamed: 0,movieId,title,userId,timestamp,tags_genres
8725,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8726,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8727,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8728,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8729,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8730,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8731,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8732,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...


### Добавим средние оценки юзеров, а потом и фильмов

In [25]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [26]:
df = ratings.groupby(['userId']).agg({'rating': 'mean'}).reset_index()
df.head()

Unnamed: 0,userId,rating
0,1,4.366379
1,2,3.948276
2,3,2.435897
3,4,3.555556
4,5,3.636364


In [27]:
### Теперь средний рейтинг нужно обратно добавить в ratings, чтобы потом соединить с movies_with_tags по movieId

In [28]:
ratings = pd.DataFrame.merge(ratings, df, on='userId')

In [29]:
ratings.rename(columns={'rating_x': 'rating', 'rating_y': 'avg_user_rating' }, inplace=True)

In [30]:
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,avg_user_rating
0,1,1,4.0,964982703,4.366379
1,1,3,4.0,964981247,4.366379
2,1,6,4.0,964982224,4.366379
3,1,47,5.0,964983815,4.366379
4,1,50,5.0,964982931,4.366379
5,1,70,3.0,964982400,4.366379
6,1,101,5.0,964980868,4.366379
7,1,110,4.0,964982176,4.366379
8,1,151,5.0,964984041,4.366379
9,1,157,5.0,964984100,4.366379


In [31]:
ratings = pd.DataFrame.merge(ratings, movies_with_tags, on=['movieId'])
ratings.head()

Unnamed: 0,userId_x,movieId,rating,timestamp_x,avg_user_rating,title,userId_y,timestamp_y,tags_genres
0,1,1,4.0,964982703,4.366379,Toy Story (1995),336.0,1139046000.0,Adventure Animation Children Comedy Fantasy pi...
1,1,1,4.0,964982703,4.366379,Toy Story (1995),474.0,1137207000.0,Adventure Animation Children Comedy Fantasy pi...
2,1,1,4.0,964982703,4.366379,Toy Story (1995),567.0,1525286000.0,Adventure Animation Children Comedy Fantasy pi...
3,5,1,4.0,847434962,3.636364,Toy Story (1995),336.0,1139046000.0,Adventure Animation Children Comedy Fantasy pi...
4,5,1,4.0,847434962,3.636364,Toy Story (1995),474.0,1137207000.0,Adventure Animation Children Comedy Fantasy pi...


In [32]:
#переименуем столбцы
ratings.rename(columns={'userId_x': 'userId', 'timestamp_x': 'timestamp_rating'}, inplace=True)
ratings.drop(['userId_y', 'timestamp_y'], axis=1, inplace=True)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp_rating,avg_user_rating,title,tags_genres
0,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
1,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
2,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
3,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
4,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...


### Средняя оценка за фильм

In [33]:
df = ratings.groupby('movieId').agg({'rating': 'mean'}).reset_index()
df.head()

Unnamed: 0,movieId,rating
0,1,3.92093
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429


In [34]:
ratings = ratings.merge(df, on='movieId')
ratings.rename(columns={'rating_y': 'avg_movie_ratings'}, inplace=True)

In [35]:
ratings.head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
0,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
1,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
2,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
3,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
4,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093


In [36]:
### удалим дубликатЫ
len(ratings)

285762

In [37]:
ratings.drop_duplicates(inplace=True)
len(ratings)

100836

### Надём id самого активного пользователя

In [38]:
ratings.userId.value_counts().head(3)

414    2698
599    2478
474    2108
Name: userId, dtype: int64

In [39]:
ratings[ratings.userId == 414].head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
438,414,1,4.0,961438127,3.391957,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
717,414,3,4.0,961439278,3.391957,Grumpier Old Men (1995),Comedy Romance moldy old,3.259615
818,414,6,3.0,961515642,3.391957,Heat (1995),Action Crime Thriller,3.946078
1277,414,47,4.0,961681857,3.391957,Seven (a.k.a. Se7en) (1995),Mystery Thriller mystery twistending serialkiller,3.975369
2330,414,50,5.0,961681714,3.391957,"Usual Suspects, The (1995)",Crime Mystery Thriller mindfuck suspense thril...,4.237745


### Алгоритм
1. Найти фильмы, которым пользователь поставил оценки 4 и 5. Собрать теги
2. От этих фильмов найти кратчайшее расстояние до всех фильмов
3. Исключить фильмы, которые пользователь смотрел
4. Вывести результат

### Вопрос: правильный это алгоритм или нужно во втором шаге нужно сразу искать расстояние до фильмов, которые пользователь НЕ смотрел? Почему?

In [40]:
user_fav_movies = ratings[(ratings.userId == 414) & (ratings.rating_x > 3)]

In [41]:
len(user_fav_movies)

1459

In [42]:
#Теги любимых фильмов
user_summary_vector = [change_string(t_g) for t_g in user_fav_movies.tags_genres.values]
# Удаляем дубликаты
user_summary_vector = list(dict.fromkeys(user_summary_vector))
user_summary_vector[:10]

['Adventure Animation Children Comedy Fantasy pixar pixar fun',
 'Comedy Romance moldy old',
 'Mystery Thriller mystery twistending serialkiller',
 'Crime Mystery Thriller mindfuck suspense thriller tricky twistending heist',
 'Adventure Comedy Crime Romance crime offbeatcomedy quirky',
 'Action Drama War beautifulscenery epic historical inspirational Medieval melgibson Oscar(BestCinematography) revenge swordfight Scotland',
 'Action Drama Romance War ',
 'Comedy War ',
 'Action Romance Western ',
 'Comedy cynical hilarious independentfilm quirky witty generationX']

In [43]:
len(user_summary_vector)

845

In [44]:
# Найдём расстояние на всём датасете. Удалим из него дубликаты строк, которые там есть за счёт разных userId

In [45]:
ratings.head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
0,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
3,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
6,7,1,4.5,1106635946,3.230263,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
9,15,1,2.5,1510577970,3.448148,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
12,17,1,4.5,1305696483,4.209524,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093


In [46]:
len(ratings)

100836

In [47]:
df = ratings.drop(['userId', 'rating_x', 'avg_user_rating', 'timestamp_rating'], axis=1)
df.drop_duplicates(inplace=True)
df.head()

Unnamed: 0,movieId,title,tags_genres,avg_movie_ratings
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
645,3,Grumpier Old Men (1995),Comedy Romance moldy old,3.259615
749,6,Heat (1995),Action Crime Thriller,3.946078
851,47,Seven (a.k.a. Se7en) (1995),Mystery Thriller mystery twistending serialkiller,3.975369
1460,50,"Usual Suspects, The (1995)",Crime Mystery Thriller mindfuck suspense thril...,4.237745


In [48]:
len(df)

9724

### Определяем расстояние на всём датасете

In [49]:
total_summary_vector = [change_string(t_g) for t_g in df.tags_genres.values]
total_summary_vector[:5]

['Adventure Animation Children Comedy Fantasy pixar pixar fun',
 'Comedy Romance moldy old',
 'Action Crime Thriller ',
 'Mystery Thriller mystery twistending serialkiller',
 'Crime Mystery Thriller mindfuck suspense thriller tricky twistending heist']

In [50]:
#Строим модель

In [51]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(total_summary_vector)
np.unique(X_train_counts.toarray())

array([0, 1, 2, 3], dtype=int64)

In [52]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
np.unique(X_train_tfidf.toarray())[:10]

array([0.        , 0.02044747, 0.02669129, 0.02763708, 0.02965491,
       0.02988952, 0.03198414, 0.03325721, 0.0359677 , 0.03719916])

In [53]:
neigh = NearestNeighbors(n_neighbors=2, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=2)

### Ищем кратчайшее расстояние от любимых фильмов пользователя до всех остальных

In [54]:
res = []
for fav_movie in tqdm(user_summary_vector):
    predict = count_vect.transform([fav_movie])
    X_train_tfidf2 = tfidf_transformer.transform(predict)
    res.append(neigh.kneighbors(X_train_tfidf2, return_distance=True))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for fav_movie in tqdm(user_summary_vector):


HBox(children=(FloatProgress(value=0.0, max=845.0), HTML(value='')))




In [55]:
res[:5]

[(array([[0.       , 0.5299753]]), array([[   0, 1600]], dtype=int64)),
 (array([[0.        , 1.22724096]]), array([[ 1, 43]], dtype=int64)),
 (array([[0.        , 0.76288536]]), array([[  3, 369]], dtype=int64)),
 (array([[0.        , 0.91343887]]), array([[  4, 105]], dtype=int64)),
 (array([[0.        , 0.93867211]]), array([[   6, 3490]], dtype=int64))]

In [56]:
res[0][0][0][0], res[0][1][0][0]

(0.0, 0)

In [57]:
res[0][0][0][1], res[0][1][0][1]

(0.5299752977540516, 1600)

In [58]:
res[1][0][0][1], res[1][1][0][1]

(1.2272409635480188, 43)

Нужно отсортировать по кратчайшему расстоянию. Для этого оберну в словарь, а затем в список и отсортирую

In [59]:
movies = {}
for item in res:
    movies[item[0][0][1]] = item[1][0][1]

In [60]:
list_keys = list(movies.keys())
list_keys.sort()
list_keys[:20]

[0.0,
 0.17097181934648664,
 0.18724995931653232,
 0.18780638427015522,
 0.18798190076211213,
 0.19072203424233913,
 0.19616899928346815,
 0.2008189345858794,
 0.20193223295235746,
 0.2071850777384294,
 0.20728851245561541,
 0.20862565012255913,
 0.21241125012114823,
 0.21358058189586043,
 0.2136542661602074,
 0.2156487178634964,
 0.2171592917896356,
 0.21871946949976273,
 0.22130013559772274,
 0.22594678102861676]

In [61]:
rec_movies_ids = []
for key in movies:
    for item in list_keys[:20]:
        if item == key:
            rec_movies_ids.append(movies[key])
#     rec_movies_ids.append(movies[key])

In [62]:
rec_movies_ids[:5]

[2840, 1001, 54, 2097, 3193]

Теперь нужно убрать фильмы, которые пользователь уже смотрел

In [63]:
watched_movies_id = ratings.loc[ratings.userId == 414, 'movieId']
df = ratings[~ratings['movieId'].isin(watched_movies_id)] #список фильмов, которые пользователь не смотрел
df[df['movieId'].isin(rec_movies_ids)]['title'].unique()

array(['Tales from the Crypt Presents: Demon Knight (1995)',
       'Stigmata (1999)', "'Til There Was You (1997)",
       'American Movie (1999)', 'New York Cop (Nyû Yôku no koppu) (1993)',
       'Billy Bathgate (1991)'], dtype=object)

In [64]:
rec_movies_Id = df[df['movieId'].isin(rec_movies_ids)]['movieId'].unique()
rec_movies_Id

array([ 328, 2840,  779, 3007,  284, 5689], dtype=int64)

Проверим, что пользователь действительно не смотрел эти фильмы:

In [65]:
ratings.loc[(ratings['movieId'].isin(rec_movies_Id)) & (ratings['userId'] == 414)]

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings


### Часть 2. Предсказание для пользователя 414

In [66]:
ratings.head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
0,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
3,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
6,7,1,4.5,1106635946,3.230263,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
9,15,1,2.5,1510577970,3.448148,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
12,17,1,4.5,1305696483,4.209524,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093


In [67]:
# Так как буду использовать decision tree, то можно оставить такие колонки как movieId и userId, а вот timepstamp можно удалить

In [68]:
df = ratings.drop(['timestamp_rating', 'title', 'tags_genres'], axis=1)
df = df.loc[df.userId == 414]
df.head()

Unnamed: 0,userId,movieId,rating_x,avg_user_rating,avg_movie_ratings
438,414,1,4.0,3.391957,3.92093
717,414,3,4.0,3.391957,3.259615
818,414,6,3.0,3.391957,3.946078
1277,414,47,4.0,3.391957,3.975369
2330,414,50,5.0,3.391957,4.237745


In [69]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [70]:
clf = DecisionTreeClassifier(max_depth=15) ### как выбрать уровень правильно? 15 много или мало?

In [71]:
df.rating_x = round(df.rating_x)

In [72]:
X = df.drop('rating_x', axis=1)
y = df.rating_x

In [73]:
clf.fit(X, y)  ### обучаем модель на фильмах, которые пользователь смотрел

DecisionTreeClassifier(max_depth=15)

In [75]:
clf.score(X, y)

0.8728687916975537

In [76]:
train_df = ratings[~ratings['movieId'].isin(watched_movies_id)]
train_df = ratings.drop(['timestamp_rating', 'title', 'tags_genres', 'rating_x'], axis=1)
train_df.head()

Unnamed: 0,userId,movieId,avg_user_rating,avg_movie_ratings
0,1,1,4.366379,3.92093
3,5,1,3.636364,3.92093
6,7,1,3.230263,3.92093
9,15,1,3.448148,3.92093
12,17,1,4.209524,3.92093


In [77]:
clf.predict(train_df)

array([4., 4., 4., ..., 4., 5., 5.])

In [78]:
train_df['Predicted_Rating'] = clf.predict(train_df)
train_df.head()

Unnamed: 0,userId,movieId,avg_user_rating,avg_movie_ratings,Predicted_Rating
0,1,1,4.366379,3.92093,4.0
3,5,1,3.636364,3.92093,4.0
6,7,1,3.230263,3.92093,4.0
9,15,1,3.448148,3.92093,4.0
12,17,1,4.209524,3.92093,4.0


In [86]:
train_df.loc[train_df['movieId'].isin(rec_movies_Id)].sort_values(by='Predicted_Rating', ascending=False)

Unnamed: 0,userId,movieId,avg_user_rating,avg_movie_ratings,Predicted_Rating
273747,113,779,3.646667,4.0,5.0
273748,345,779,3.903226,4.0,5.0
284904,594,284,3.924569,4.5,4.0
280866,603,3007,3.507953,3.75,4.0
280865,600,3007,2.991481,3.75,4.0
280864,599,3007,2.64205,3.75,4.0
280863,462,3007,3.406593,3.75,4.0
280862,318,3007,3.755973,3.75,4.0
280861,290,3007,4.142322,3.75,4.0
236780,547,2840,4.142857,2.869565,2.0


In [92]:
ratings.loc[ratings['movieId'].isin([779, 284, 3007])]['title'].unique()

array(["'Til There Was You (1997)", 'American Movie (1999)',
       'New York Cop (Nyû Yôku no koppu) (1993)'], dtype=object)