In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
#links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
movies.shape, tags.shape, ratings.shape

((9742, 3), (3683, 4), (100836, 4))

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movies.movieId.max()

193609

In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Самые активные пользователи

In [7]:
ratings.userId.value_counts().head(3)

414    2698
599    2478
474    2108
Name: userId, dtype: int64

Едем дальше

In [8]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [9]:
movies_with_tags = pd.merge(movies, tags, on='movieId', how='left')
movies_with_tags.movieId.unique().shape, movies_with_tags.shape, tags.shape

((9742,), (11853, 6), (3683, 4))

In [10]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


### Собираем и объединяем теги для каждого фильма от разных пользователей

In [11]:
tag_strings = []
movies_list = []

for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies_list.append(movie)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movie, group in tqdm(movies_with_tags.groupby('title')):


HBox(children=(FloatProgress(value=0.0, max=9737.0), HTML(value='')))




In [12]:
movies_list[:10]

["'71 (2014)",
 "'Hellboy': The Seeds of Creation (2004)",
 "'Round Midnight (1986)",
 "'Salem's Lot (2004)",
 "'Til There Was You (1997)",
 "'Tis the Season for Love (2015)",
 "'burbs, The (1989)",
 "'night Mother (1986)",
 '(500) Days of Summer (2009)',
 '*batteries not included (1987)']

In [13]:
tag_strings[:10]

['nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'artistic Funny humorous inspiring intelligent quirky romance ZooeyDeschanel',
 'nan']

In [14]:
len(movies_list), len(tag_strings)

(9737, 9737)

### Уберём nan

In [15]:
tag_strings = [str(s).replace('nan', '') for s in tag_strings]

In [16]:
df = pd.DataFrame(movies_list, columns = ['title'])
df['tag'] = tag_strings
df.head(10)

Unnamed: 0,title,tag
0,'71 (2014),
1,'Hellboy': The Seeds of Creation (2004),
2,'Round Midnight (1986),
3,'Salem's Lot (2004),
4,'Til There Was You (1997),
5,'Tis the Season for Love (2015),
6,"'burbs, The (1989)",
7,'night Mother (1986),
8,(500) Days of Summer (2009),artistic Funny humorous inspiring intelligent ...
9,*batteries not included (1987),


### Объедним с предыдущим dataframe'ом и удалим лишнее

In [17]:
movies_with_tags = pd.DataFrame.merge(movies_with_tags, df, on='title')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag_x,timestamp,tag_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0,pixar pixar fun
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0,pixar pixar fun
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0,pixar pixar fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0,fantasy magicboardgame RobinWilliams game
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0,fantasy magicboardgame RobinWilliams game


In [18]:
def change_string(s):
    return '|' + s.replace(' ', '|')

In [19]:
movies_with_tags['tag_y'] = movies_with_tags['tag_y'].apply(change_string) 
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag_x,timestamp,tag_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0,|pixar|pixar|fun
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0,|pixar|pixar|fun
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0,|pixar|pixar|fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0,|fantasy|magicboardgame|RobinWilliams|game
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0,|fantasy|magicboardgame|RobinWilliams|game


In [20]:
movies_with_tags['tags_genres'] = movies_with_tags['genres'] + movies_with_tags['tag_y']
movies_with_tags.drop(['genres', 'tag_x', 'tag_y'], axis=1, inplace=True)
movies_with_tags.head()

Unnamed: 0,movieId,title,userId,timestamp,tags_genres
0,1,Toy Story (1995),336.0,1139046000.0,Adventure|Animation|Children|Comedy|Fantasy|pi...
1,1,Toy Story (1995),474.0,1137207000.0,Adventure|Animation|Children|Comedy|Fantasy|pi...
2,1,Toy Story (1995),567.0,1525286000.0,Adventure|Animation|Children|Comedy|Fantasy|pi...
3,2,Jumanji (1995),62.0,1528844000.0,Adventure|Children|Fantasy|fantasy|magicboardg...
4,2,Jumanji (1995),62.0,1528844000.0,Adventure|Children|Fantasy|fantasy|magicboardg...


In [21]:
movies_with_tags[movies_with_tags.title == '(500) Days of Summer (2009)']

Unnamed: 0,movieId,title,userId,timestamp,tags_genres
8725,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8726,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8727,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8728,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8729,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8730,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8731,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8732,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...


In [22]:
def change_string(s):
    return s.replace('|', ' ')

In [23]:
movies_with_tags['tags_genres'] = movies_with_tags['tags_genres'].apply(change_string) 

In [24]:
movies_with_tags[movies_with_tags.title == '(500) Days of Summer (2009)']

Unnamed: 0,movieId,title,userId,timestamp,tags_genres
8725,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8726,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8727,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8728,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8729,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8730,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8731,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8732,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...


### Добавим средние оценки юзеров, а потом и фильмов

In [25]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [26]:
df = ratings.groupby(['userId']).agg({'rating': 'mean'}).reset_index()
df.head()

Unnamed: 0,userId,rating
0,1,4.366379
1,2,3.948276
2,3,2.435897
3,4,3.555556
4,5,3.636364


In [27]:
### Теперь средний рейтинг нужно обратно добавить в ratings, чтобы потом соединить с movies_with_tags по movieId

In [28]:
ratings = pd.DataFrame.merge(ratings, df, on='userId')

In [29]:
ratings.rename(columns={'rating_x': 'rating', 'rating_y': 'avg_user_rating' }, inplace=True)

In [30]:
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,avg_user_rating
0,1,1,4.0,964982703,4.366379
1,1,3,4.0,964981247,4.366379
2,1,6,4.0,964982224,4.366379
3,1,47,5.0,964983815,4.366379
4,1,50,5.0,964982931,4.366379
5,1,70,3.0,964982400,4.366379
6,1,101,5.0,964980868,4.366379
7,1,110,4.0,964982176,4.366379
8,1,151,5.0,964984041,4.366379
9,1,157,5.0,964984100,4.366379


In [31]:
ratings = pd.DataFrame.merge(ratings, movies_with_tags, on=['movieId'])
ratings.head()

Unnamed: 0,userId_x,movieId,rating,timestamp_x,avg_user_rating,title,userId_y,timestamp_y,tags_genres
0,1,1,4.0,964982703,4.366379,Toy Story (1995),336.0,1139046000.0,Adventure Animation Children Comedy Fantasy pi...
1,1,1,4.0,964982703,4.366379,Toy Story (1995),474.0,1137207000.0,Adventure Animation Children Comedy Fantasy pi...
2,1,1,4.0,964982703,4.366379,Toy Story (1995),567.0,1525286000.0,Adventure Animation Children Comedy Fantasy pi...
3,5,1,4.0,847434962,3.636364,Toy Story (1995),336.0,1139046000.0,Adventure Animation Children Comedy Fantasy pi...
4,5,1,4.0,847434962,3.636364,Toy Story (1995),474.0,1137207000.0,Adventure Animation Children Comedy Fantasy pi...


In [32]:
#переименуем столбцы
ratings.rename(columns={'userId_x': 'userId', 'timestamp_x': 'timestamp_rating'}, inplace=True)
ratings.drop(['userId_y', 'timestamp_y'], axis=1, inplace=True)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp_rating,avg_user_rating,title,tags_genres
0,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
1,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
2,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
3,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
4,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...


### Средняя оценка за фильм

In [33]:
df = ratings.groupby('movieId').agg({'rating': 'mean'}).reset_index()
df.head()

Unnamed: 0,movieId,rating
0,1,3.92093
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429


In [34]:
ratings = ratings.merge(df, on='movieId')
ratings.rename(columns={'rating_y': 'avg_movie_ratings'}, inplace=True)

In [35]:
ratings.head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
0,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
1,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
2,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
3,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
4,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093


In [36]:
### удалим дубликатЫ
len(ratings)

285762

In [37]:
#ratings.drop_duplicates(inplace=True)
#len(ratings)

### Надём id самого активного пользователя

In [38]:
ratings.userId.value_counts().head(3)

414    4214
599    3893
474    3365
Name: userId, dtype: int64

In [39]:
ratings[ratings.userId == 414].head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
438,414,1,4.0,961438127,3.391957,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
439,414,1,4.0,961438127,3.391957,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
440,414,1,4.0,961438127,3.391957,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
717,414,3,4.0,961439278,3.391957,Grumpier Old Men (1995),Comedy Romance moldy old,3.259615
718,414,3,4.0,961439278,3.391957,Grumpier Old Men (1995),Comedy Romance moldy old,3.259615


In [40]:
user_fav_movies = ratings[(ratings.userId == 414) & (ratings.rating_x > 3)]

In [41]:
len(user_fav_movies)

2805

### Алгоритм
1. Определить фильмы, которые пользователь 414 ещё не смотрел
2. Обучить модель на фильмах, которые пользователь любит
3. Рассчитать расстояние от фильмов, который пользователь 414 ещё не смотрел, до фильмов которые любит
4. Определить фильмы с кратчайшим расстоянием

### Фильмы, которые пользователь ещё не смотрел

In [42]:
watched_movies_id = ratings.loc[ratings.userId == 414, 'movieId']
df = ratings[~ratings['movieId'].isin(watched_movies_id)]

In [43]:
df.head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
2684,1,70,3.0,964982400,4.366379,From Dusk Till Dawn (1996),Action Comedy Horror Thriller,3.509091
2685,18,70,3.5,1455735732,3.732072,From Dusk Till Dawn (1996),Action Comedy Horror Thriller,3.509091
2686,19,70,2.0,965704974,2.607397,From Dusk Till Dawn (1996),Action Comedy Horror Thriller,3.509091
2687,34,70,3.5,1162048002,3.418605,From Dusk Till Dawn (1996),Action Comedy Horror Thriller,3.509091
2688,45,70,4.0,951170563,3.87594,From Dusk Till Dawn (1996),Action Comedy Horror Thriller,3.509091


### Отберём теги фильмов пользователя 414, которые он ещё не смотрел

In [44]:
user414_not_watched = [change_string(t_g) for t_g in df.tags_genres.values]
user414_not_watched[:5], len(user414_not_watched)

(['Action Comedy Horror Thriller ',
  'Action Comedy Horror Thriller ',
  'Action Comedy Horror Thriller ',
  'Action Comedy Horror Thriller ',
  'Action Comedy Horror Thriller '],
 36901)

In [45]:
user414_not_watched = list(dict.fromkeys(user414_not_watched))
user414_not_watched[:10], len(user414_not_watched)

(['Action Comedy Horror Thriller ',
  'Adventure Children Romance ',
  'Action Adventure Romance swashbuckler',
  'Drama Fantasy Romance ghosts',
  'Drama Politics',
  'Adventure Children Musical ',
  'Crime Horror AlfredHitchcock psychology suspenseful tense NormanBates AlfredHitchcock blackandwhite imdbtop250 remade',
  'Drama Romance ',
  'Comedy Musical War ',
  'Horror atmospheric disturbing Horror jacknicholson masterpiece psychological StanleyKubrick suspense StephenKing'],
 1416)

In [46]:
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(summary_vector)
# X_train_counts.todense()[:5]
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# np.unique(X_train_tfidf.toarray())
# X_train_tfidf.shape

### Определяем ценность слов на датасете 414

In [47]:
summary_vector = [change_string(t_g) for t_g in user_fav_movies.tags_genres.values]
# Удаляем дубликаты
summary_vector = list(dict.fromkeys(summary_vector))
summary_vector[:10]

['Adventure Animation Children Comedy Fantasy pixar pixar fun',
 'Comedy Romance moldy old',
 'Mystery Thriller mystery twistending serialkiller',
 'Crime Mystery Thriller mindfuck suspense thriller tricky twistending heist',
 'Adventure Comedy Crime Romance crime offbeatcomedy quirky',
 'Action Drama War beautifulscenery epic historical inspirational Medieval melgibson Oscar(BestCinematography) revenge swordfight Scotland',
 'Action Drama Romance War ',
 'Comedy War ',
 'Action Romance Western ',
 'Comedy cynical hilarious independentfilm quirky witty generationX']

In [48]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(summary_vector)
np.unique(X_train_counts.toarray())

array([0, 1, 2, 3], dtype=int64)

In [49]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
np.unique(X_train_tfidf.toarray())[:10]

array([0.        , 0.02449646, 0.03337591, 0.03444436, 0.03702394,
       0.04018995, 0.04113866, 0.04296216, 0.04683001, 0.04721723])

In [50]:
neigh = NearestNeighbors(n_neighbors=2, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=2)

### Посчитаем расстояние от фильмов, которые пользователь 414  ещё не смотрел до всех фильмов которые он любит

In [51]:
res = []
for not_watched in tqdm(user414_not_watched):
    predict = count_vect.transform([not_watched])
    X_train_tfidf2 = tfidf_transformer.transform(predict)
    res.append(neigh.kneighbors(X_train_tfidf2, return_distance=True))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for not_watched in tqdm(user414_not_watched):


HBox(children=(FloatProgress(value=0.0, max=1416.0), HTML(value='')))




In [52]:
not_watched

'Drama Fantasy Horror Thriller '

In [53]:
#stop

In [54]:
# res = []
# for tag in tqdm_notebook(tag_strings):
#     test = tag
#     predict = count_vect.transform([test])
#     X_train_tfidf = tfidf_transformer.transform(predict)
#     res.append(neigh.kneighbors(X_train_tfidf, return_distance=True))

In [55]:
res[:5]

[(array([[0.        , 0.43527378]]), array([[455, 253]], dtype=int64)),
 (array([[0.56841533, 0.77492636]]), array([[780, 554]], dtype=int64)),
 (array([[0.55723977, 0.76721432]]), array([[284, 741]], dtype=int64)),
 (array([[0.73478284, 0.81548432]]), array([[219, 506]], dtype=int64)),
 (array([[0.88157831, 0.93049544]]), array([[232, 579]], dtype=int64))]

In [56]:
res[0][0][0][0], res[0][1][0][0]

(0.0, 455)

In [57]:
res[0][0][0][1], res[0][1][0][1]

(0.43527378136806805, 253)

In [58]:
res[1][0][0][1], res[1][1][0][1]

(0.7749263645569934, 554)

In [59]:
movies = {}
for item in res:
    movies[item[0][0][1]] = item[1][0][1]

In [60]:
movies[0.43527378136806805]

253

In [69]:
movies

{0.43527378136806805: 253,
 0.7749263645569934: 554,
 0.7672143179369624: 741,
 0.8154843199698167: 506,
 0.9304954374434257: 579,
 0.5916662390980644: 539,
 1.1064379006223053: 488,
 0.5796789412303055: 186,
 0.7103394540756244: 7,
 1.1250355600627187: 507,
 0.4849213036754932: 507,
 0.37896919988755756: 556,
 0.6630124616900345: 15,
 0.3612570962211786: 95,
 0.3914251297696205: 31,
 0.44180449837429553: 290,
 0.33018968095500223: 491,
 0.545496329540249: 346,
 0.4568818324043446: 186,
 0.7993821853366071: 817,
 0.6313705797356073: 389,
 0.3770203067280792: 541,
 0.3749286801842428: 247,
 1.182343595482278: 564,
 0.6324486763515754: 542,
 0.5472488206806433: 75,
 0.36357805431622725: 290,
 0.3441866765989181: 535,
 1.189994944726535: 132,
 0.48999976535514295: 561,
 0.974173008751318: 425,
 1.119736239724817: 444,
 0.4249171685880189: 413,
 0.33549444867517436: 588,
 0.29599521155777386: 604,
 0.39967764505109815: 215,
 0.0: 217,
 0.3683137039007158: 339,
 1.0332684824346636: 163,
 0.

### Найдём топ 20 фильмов с кратчайшим расстоянием

In [61]:
list_keys = list(movies.keys())
list_keys.sort()
list_keys[:20]

[0.0,
 0.2538518855341999,
 0.2657478370020434,
 0.2735149468580201,
 0.2772432056019364,
 0.27840183481553404,
 0.2952274209182285,
 0.29599521155777386,
 0.29854970599908487,
 0.30120381426013254,
 0.30138970855936986,
 0.30508251855906393,
 0.30559284898257466,
 0.3160868005000936,
 0.3164765673432119,
 0.3176441819273204,
 0.3181163796552877,
 0.32173676938450707,
 0.32463826379921185,
 0.3253195173328287]

In [62]:
rec_movies_ids = []
for key in movies:
    for item in list_keys[:20]:
        if item == key:
            rec_movies_ids.append(movies[key])
#     rec_movies_ids.append(movies[key])

In [63]:
rec_movies_ids

[604,
 217,
 303,
 517,
 719,
 40,
 693,
 562,
 472,
 481,
 125,
 110,
 375,
 517,
 47,
 20,
 635,
 679,
 375,
 77]

### Что в итоге будем рекомендовать пользователю 414

In [64]:
ratings[ratings['movieId'].isin(rec_movies_ids)]['movieId'].unique()

array([ 47, 110, 125, 217, 303, 472, 719, 481, 635, 562,  20, 679, 517,
        40,  77], dtype=int64)

In [65]:
ratings.loc[ratings['movieId'] == 604, :]

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings


### Часть 2. Предсказание

In [66]:
from sklearn.tree import DecisionTreeClassifier

In [67]:
ratings.head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
0,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
1,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
2,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
3,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
4,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093


In [68]:
user

NameError: name 'user' is not defined

In [None]:
clf = DecisionTreeClassifier(max_depth=5)

In [None]:
Xcut = X[X.columns[2:4]]

In [None]:
clf.fit(Xcut, y)