In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
#links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
movies.shape, tags.shape, ratings.shape

((9742, 3), (3683, 4), (100836, 4))

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Самые активные пользователи

In [6]:
ratings.userId.value_counts().head(3)

414    2698
599    2478
474    2108
Name: userId, dtype: int64

Едем дальше

In [7]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [8]:
movies_with_tags = pd.merge(movies, tags, on='movieId', how='left')
movies_with_tags.movieId.unique().shape, movies_with_tags.shape, tags.shape

((9742,), (11853, 6), (3683, 4))

In [9]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


### Собираем и объединяем теги для каждого фильма от разных пользователей

In [10]:
tag_strings = []
movies_list = []

for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies_list.append(movie)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=9737.0), HTML(value='')))




In [11]:
movies_list[:10]

["'71 (2014)",
 "'Hellboy': The Seeds of Creation (2004)",
 "'Round Midnight (1986)",
 "'Salem's Lot (2004)",
 "'Til There Was You (1997)",
 "'Tis the Season for Love (2015)",
 "'burbs, The (1989)",
 "'night Mother (1986)",
 '(500) Days of Summer (2009)',
 '*batteries not included (1987)']

In [12]:
tag_strings[:10]

['nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'nan',
 'artistic Funny humorous inspiring intelligent quirky romance ZooeyDeschanel',
 'nan']

In [13]:
len(movies_list), len(tag_strings)

(9737, 9737)

### Уберём nan

In [14]:
tag_strings = [str(s).replace('nan', '') for s in tag_strings]

In [15]:
df = pd.DataFrame(movies_list, columns = ['title'])
df['tag'] = tag_strings
df.head(10)

Unnamed: 0,title,tag
0,'71 (2014),
1,'Hellboy': The Seeds of Creation (2004),
2,'Round Midnight (1986),
3,'Salem's Lot (2004),
4,'Til There Was You (1997),
5,'Tis the Season for Love (2015),
6,"'burbs, The (1989)",
7,'night Mother (1986),
8,(500) Days of Summer (2009),artistic Funny humorous inspiring intelligent ...
9,*batteries not included (1987),


### Объедним с предыдущим dataframe'ом и удалим лишнее

In [16]:
movies_with_tags = pd.DataFrame.merge(movies_with_tags, df, on='title')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag_x,timestamp,tag_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0,pixar pixar fun
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0,pixar pixar fun
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0,pixar pixar fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0,fantasy magicboardgame RobinWilliams game
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0,fantasy magicboardgame RobinWilliams game


In [17]:
def change_string(s):
    return '|' + s.replace(' ', '|')

In [18]:
movies_with_tags['tag_y'] = movies_with_tags['tag_y'].apply(change_string) 
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag_x,timestamp,tag_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0,|pixar|pixar|fun
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0,|pixar|pixar|fun
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0,|pixar|pixar|fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0,|fantasy|magicboardgame|RobinWilliams|game
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0,|fantasy|magicboardgame|RobinWilliams|game


In [19]:
movies_with_tags['tags_genres'] = movies_with_tags['genres'] + movies_with_tags['tag_y']
movies_with_tags.drop(['genres', 'tag_x', 'tag_y'], axis=1, inplace=True)
movies_with_tags.head()

Unnamed: 0,movieId,title,userId,timestamp,tags_genres
0,1,Toy Story (1995),336.0,1139046000.0,Adventure|Animation|Children|Comedy|Fantasy|pi...
1,1,Toy Story (1995),474.0,1137207000.0,Adventure|Animation|Children|Comedy|Fantasy|pi...
2,1,Toy Story (1995),567.0,1525286000.0,Adventure|Animation|Children|Comedy|Fantasy|pi...
3,2,Jumanji (1995),62.0,1528844000.0,Adventure|Children|Fantasy|fantasy|magicboardg...
4,2,Jumanji (1995),62.0,1528844000.0,Adventure|Children|Fantasy|fantasy|magicboardg...


In [20]:
movies_with_tags[movies_with_tags.title == '(500) Days of Summer (2009)']

Unnamed: 0,movieId,title,userId,timestamp,tags_genres
8725,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8726,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8727,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8728,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8729,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8730,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8731,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...
8732,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy|Drama|Romance|artistic|Funny|humorous|i...


In [21]:
def change_string(s):
    return s.replace('|', ' ')

In [22]:
movies_with_tags['tags_genres'] = movies_with_tags['tags_genres'].apply(change_string) 

In [23]:
movies_with_tags[movies_with_tags.title == '(500) Days of Summer (2009)']

Unnamed: 0,movieId,title,userId,timestamp,tags_genres
8725,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8726,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8727,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8728,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8729,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8730,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8731,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...
8732,69757,(500) Days of Summer (2009),477.0,1279956000.0,Comedy Drama Romance artistic Funny humorous i...


### Добавим средние оценки юзеров, а потом и фильмов

In [24]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [25]:
df = ratings.groupby(['userId']).mean('rating').reset_index().drop(['timestamp', 'movieId'], axis=1)
df.head()

Unnamed: 0,userId,rating
0,1,4.366379
1,2,3.948276
2,3,2.435897
3,4,3.555556
4,5,3.636364


In [26]:
### Теперь средний рейтинг нужно обратно добавить в ratings, чтобы потом соединить с movies_with_tags по movieId

In [27]:
ratings = pd.DataFrame.merge(ratings, df, on='userId')

In [28]:
ratings.rename(columns={'rating_x': 'rating', 'rating_y': 'avg_user_rating' }, inplace=True)

In [29]:
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,avg_user_rating
0,1,1,4.0,964982703,4.366379
1,1,3,4.0,964981247,4.366379
2,1,6,4.0,964982224,4.366379
3,1,47,5.0,964983815,4.366379
4,1,50,5.0,964982931,4.366379
5,1,70,3.0,964982400,4.366379
6,1,101,5.0,964980868,4.366379
7,1,110,4.0,964982176,4.366379
8,1,151,5.0,964984041,4.366379
9,1,157,5.0,964984100,4.366379


In [30]:
ratings = pd.DataFrame.merge(ratings, movies_with_tags, on=['movieId'])
ratings.head()

Unnamed: 0,userId_x,movieId,rating,timestamp_x,avg_user_rating,title,userId_y,timestamp_y,tags_genres
0,1,1,4.0,964982703,4.366379,Toy Story (1995),336.0,1139046000.0,Adventure Animation Children Comedy Fantasy pi...
1,1,1,4.0,964982703,4.366379,Toy Story (1995),474.0,1137207000.0,Adventure Animation Children Comedy Fantasy pi...
2,1,1,4.0,964982703,4.366379,Toy Story (1995),567.0,1525286000.0,Adventure Animation Children Comedy Fantasy pi...
3,5,1,4.0,847434962,3.636364,Toy Story (1995),336.0,1139046000.0,Adventure Animation Children Comedy Fantasy pi...
4,5,1,4.0,847434962,3.636364,Toy Story (1995),474.0,1137207000.0,Adventure Animation Children Comedy Fantasy pi...


In [31]:
#переименуем столбцы
ratings.rename(columns={'userId_x': 'userId', 'timestamp_x': 'timestamp_rating'}, inplace=True)
ratings.drop(['userId_y', 'timestamp_y'], axis=1, inplace=True)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp_rating,avg_user_rating,title,tags_genres
0,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
1,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
2,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
3,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
4,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...


### Средняя оценка за фильм

In [32]:
df = ratings.groupby('movieId').mean('rating').reset_index().drop(['userId', 'timestamp_rating', 'avg_user_rating'], axis=1)
df.head()

Unnamed: 0,movieId,rating
0,1,3.92093
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429


In [33]:
ratings = ratings.merge(df, on='movieId')
ratings.rename(columns={'rating_y': 'avg_movie_ratings'}, inplace=True)

In [34]:
ratings.head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
0,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
1,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
2,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
3,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
4,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093


### Надём id самого активного пользователя

In [35]:
ratings.userId.value_counts().head(3)

414    4214
599    3893
474    3365
Name: userId, dtype: int64

In [36]:
ratings[ratings.userId == 414].head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
438,414,1,4.0,961438127,3.391957,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
439,414,1,4.0,961438127,3.391957,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
440,414,1,4.0,961438127,3.391957,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
717,414,3,4.0,961439278,3.391957,Grumpier Old Men (1995),Comedy Romance moldy old,3.259615
718,414,3,4.0,961439278,3.391957,Grumpier Old Men (1995),Comedy Romance moldy old,3.259615


In [37]:
user_fav_movies = ratings[(ratings.userId == 414) & (ratings.rating_x > 3)]

In [38]:
len(ratings)

285762

### Алгоритм
1. Определить фильмы, которые пользователь 414 ещё не смотрел
2. Обучить модель на всём дата сете
3. Рассчитать расстояние от фильмов, который пользователь 414 ещё не смотрел, до фильмов на всём датасете
4. Определить фильмы с кратчайшим расстоянием

### Фильмы, которые пользователь ещё не смотрел

In [39]:
watched_movies_id = ratings.loc[ratings.userId == 414, 'movieId']
df = ratings[~ratings['movieId'].isin(watched_movies_id)]

In [40]:
df.head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
2684,1,70,3.0,964982400,4.366379,From Dusk Till Dawn (1996),Action Comedy Horror Thriller,3.509091
2685,18,70,3.5,1455735732,3.732072,From Dusk Till Dawn (1996),Action Comedy Horror Thriller,3.509091
2686,19,70,2.0,965704974,2.607397,From Dusk Till Dawn (1996),Action Comedy Horror Thriller,3.509091
2687,34,70,3.5,1162048002,3.418605,From Dusk Till Dawn (1996),Action Comedy Horror Thriller,3.509091
2688,45,70,4.0,951170563,3.87594,From Dusk Till Dawn (1996),Action Comedy Horror Thriller,3.509091


### Отберём теги фильмов пользователя 414, которые он ещё не смотрел

In [41]:
user414_not_watched = [change_string(t_g) for t_g in df.tags_genres.values]
user414_not_watched[:5], len(user414_not_watched)

(['Action Comedy Horror Thriller ',
  'Action Comedy Horror Thriller ',
  'Action Comedy Horror Thriller ',
  'Action Comedy Horror Thriller ',
  'Action Comedy Horror Thriller '],
 36901)

In [42]:
user414_not_watched = list(dict.fromkeys(user414_not_watched))
user414_not_watched[:10], len(user414_not_watched)

(['Action Comedy Horror Thriller ',
  'Adventure Children Romance ',
  'Action Adventure Romance swashbuckler',
  'Drama Fantasy Romance ghosts',
  'Drama Politics',
  'Adventure Children Musical ',
  'Crime Horror AlfredHitchcock psychology suspenseful tense NormanBates AlfredHitchcock blackandwhite imdbtop250 remade',
  'Drama Romance ',
  'Comedy Musical War ',
  'Horror atmospheric disturbing Horror jacknicholson masterpiece psychological StanleyKubrick suspense StephenKing'],
 1416)

In [43]:
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(summary_vector)
# X_train_counts.todense()[:5]
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# np.unique(X_train_tfidf.toarray())
# X_train_tfidf.shape

### Определяем ценность слов на всём датасете

In [44]:
summary_vector = [change_string(t_g) for t_g in ratings.tags_genres.values]
# Удаляем дубликаты
summary_vector = list(dict.fromkeys(summary_vector))
summary_vector[:10]

['Adventure Animation Children Comedy Fantasy pixar pixar fun',
 'Comedy Romance moldy old',
 'Action Crime Thriller ',
 'Mystery Thriller mystery twistending serialkiller',
 'Crime Mystery Thriller mindfuck suspense thriller tricky twistending heist',
 'Action Comedy Horror Thriller ',
 'Adventure Comedy Crime Romance crime offbeatcomedy quirky',
 'Action Drama War beautifulscenery epic historical inspirational Medieval melgibson Oscar(BestCinematography) revenge swordfight Scotland',
 'Action Drama Romance War ',
 'Comedy War ']

In [45]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(summary_vector)
np.unique(X_train_counts.toarray())

array([0, 1, 2, 3], dtype=int64)

In [46]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
np.unique(X_train_tfidf.toarray())

array([0.        , 0.02217928, 0.03106712, ..., 0.97751228, 0.99002083,
       1.        ])

In [47]:
neigh = NearestNeighbors(n_neighbors=2, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=2)

In [48]:
user414_not_watched[666]

'Action Adventure Children '

In [49]:
predict = count_vect.transform([user414_not_watched[666]])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [50]:
res

(array([[0.        , 0.40634477]]), array([[1487,  883]], dtype=int64))

### Почему первое расстояние равно 0???

In [51]:
print(ratings.iloc[res[1][0]]['tags_genres'])

1487    Crime Mystery Thriller mindfuck suspense thril...
883     Mystery Thriller mystery twistending serialkiller
Name: tags_genres, dtype: object


### Посчитаем расстояние от фильмов, которые пользователь 414  ещё не смотрел до всех фильмов датасета

In [104]:
user414_not_watched[:10]

['Action Comedy Horror Thriller ',
 'Adventure Children Romance ',
 'Action Adventure Romance swashbuckler',
 'Drama Fantasy Romance ghosts',
 'Drama Politics',
 'Adventure Children Musical ',
 'Crime Horror AlfredHitchcock psychology suspenseful tense NormanBates AlfredHitchcock blackandwhite imdbtop250 remade',
 'Drama Romance ',
 'Comedy Musical War ',
 'Horror atmospheric disturbing Horror jacknicholson masterpiece psychological StanleyKubrick suspense StephenKing']

In [56]:
res = []
for not_watched in tqdm(user414_not_watched):
    predict = count_vect.transform([not_watched])
    X_train_tfidf2 = tfidf_transformer.transform(predict)
    res.append(neigh.kneighbors(X_train_tfidf2, return_distance=True))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=1416.0), HTML(value='')))




In [57]:
not_watched

'Drama Fantasy Horror Thriller '

In [105]:
res[-1]

(array([[0.        , 0.33162867]]), array([[ 831, 1561]], dtype=int64))

In [71]:
len(res)

1416

In [None]:
# res = []
# for tag in tqdm_notebook(tag_strings):
#     test = tag
#     predict = count_vect.transform([test])
#     X_train_tfidf = tfidf_transformer.transform(predict)
#     res.append(neigh.kneighbors(X_train_tfidf, return_distance=True))

In [137]:
res[:3]

[(array([[0.        , 0.32010925]]), array([[   5, 2234]], dtype=int64)),
 (array([[0.        , 0.54704341]]), array([[ 21, 377]], dtype=int64)),
 (array([[0.        , 0.43117413]]), array([[  45, 2018]], dtype=int64))]

In [107]:
res[0][0][0][0], res[0][1][0][0]

(0.0, 5)

### Почему первое расстояние = 0??? Я так и не понял, что неправильно

In [108]:
res[0][0][0][1], res[0][1][0][1]

(0.3201092542585576, 2234)

In [113]:
movies = {}
for item in res:
    movies[item[0][0][1]] = item[1][0][1]

### Найдём топ 20 фильмов с кратчайшим расстоянием

In [119]:
list_keys = list(movies.keys())
list_keys.sort()
list_keys[:20]


[0.0,
 0.19530353638012235,
 0.2076296456718663,
 0.21459959284764613,
 0.21487545244584133,
 0.21713811249825718,
 0.22190629861867883,
 0.2227838408161367,
 0.22366041988863608,
 0.22607157453518786,
 0.23007629465103446,
 0.2345626162832991,
 0.23547104683641448,
 0.2356809579244237,
 0.23718967177452083,
 0.23924204136831956,
 0.2405132428963808,
 0.24713936213415763,
 0.24725387033359741,
 0.2486349951201388]

In [121]:
rec_movies_ids = []
for key in movies:
    for item in list_keys[:20]:
        if item == key:
            rec_movies_ids.append(movies[key])
#     rec_movies_ids.append(movies[key])

In [122]:
rec_movies_ids

[325,
 450,
 665,
 996,
 840,
 1001,
 1065,
 305,
 495,
 1143,
 435,
 1370,
 1478,
 1629,
 1651,
 1137,
 1812,
 1865,
 1948,
 380]

### Что в итоге будем рекомендовать пользователю 414

In [127]:
ratings[ratings['movieId'].isin(rec_movies_ids)]['title'].unique()

array(['With Honors (1994)', 'True Lies (1994)', 'Coneheads (1993)',
       'Last Man Standing (1996)', 'Die Hard 2 (1990)',
       'Ready to Wear (Pret-A-Porter) (1994)',
       "National Lampoon's Senior Trip (1995)", 'Wide Awake (1998)',
       'Hustler White (1996)', 'Underground (1995)',
       'House Arrest (1996)', 'Tom Jones (1963)',
       'MatchMaker, The (1997)',
       'In the Realm of the Senses (Ai no corrida) (1976)'], dtype=object)

### Убедимся ещё раз, что пользователь эти фильмы не смотрел

In [135]:
l = list(ratings[ratings['movieId'].isin(rec_movies_ids)]['title'].unique())
l

['With Honors (1994)',
 'True Lies (1994)',
 'Coneheads (1993)',
 'Last Man Standing (1996)',
 'Die Hard 2 (1990)',
 'Ready to Wear (Pret-A-Porter) (1994)',
 "National Lampoon's Senior Trip (1995)",
 'Wide Awake (1998)',
 'Hustler White (1996)',
 'Underground (1995)',
 'House Arrest (1996)',
 'Tom Jones (1963)',
 'MatchMaker, The (1997)',
 'In the Realm of the Senses (Ai no corrida) (1976)']

In [158]:
watched_movies = list(ratings.loc[ratings['userId'] == 414, :]['title'].unique())
watched_movies[:5]

['Toy Story (1995)',
 'Grumpier Old Men (1995)',
 'Heat (1995)',
 'Seven (a.k.a. Se7en) (1995)',
 'Usual Suspects, The (1995)']

In [159]:
for movie in l:
    if movie in watched_movies:
        print(movie)

With Honors (1994)
True Lies (1994)
Last Man Standing (1996)
Die Hard 2 (1990)
National Lampoon's Senior Trip (1995)
Wide Awake (1998)


In [163]:
for movie in l:
    if movie in watched_movies:
        l.remove(movie)
l

['True Lies (1994)',
 'Coneheads (1993)',
 'Die Hard 2 (1990)',
 'Ready to Wear (Pret-A-Porter) (1994)',
 'Wide Awake (1998)',
 'Hustler White (1996)',
 'Underground (1995)',
 'House Arrest (1996)',
 'Tom Jones (1963)',
 'MatchMaker, The (1997)',
 'In the Realm of the Senses (Ai no corrida) (1976)']

### Часть 2. Предсказание

In [164]:
from sklearn.tree import DecisionTreeClassifier

In [166]:
ratings.head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
0,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
1,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
2,1,1,4.0,964982703,4.366379,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
3,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
4,5,1,4.0,847434962,3.636364,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093


In [167]:
clf = DecisionTreeClassifier(max_depth=5)

### Обучим модель на фильмах, которые пользователь смотрел

In [174]:
df = ratings[ratings['userId'] == 414].drop_duplicates()
df.head()

Unnamed: 0,userId,movieId,rating_x,timestamp_rating,avg_user_rating,title,tags_genres,avg_movie_ratings
438,414,1,4.0,961438127,3.391957,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,3.92093
717,414,3,4.0,961439278,3.391957,Grumpier Old Men (1995),Comedy Romance moldy old,3.259615
818,414,6,3.0,961515642,3.391957,Heat (1995),Action Crime Thriller,3.946078
1277,414,47,4.0,961681857,3.391957,Seven (a.k.a. Se7en) (1995),Mystery Thriller mystery twistending serialkiller,3.975369
2330,414,50,5.0,961681714,3.391957,"Usual Suspects, The (1995)",Crime Mystery Thriller mindfuck suspense thril...,4.237745


In [203]:
X = df.drop(['rating_x', 'title', 'tags_genres'], axis=1)
y = round(df['rating_x'])

In [204]:
y.unique()

array([4., 3., 5., 2., 1., 0.])

In [205]:
clf.fit(X, y)

DecisionTreeClassifier(max_depth=5)

In [206]:
clf.predict([ [1,1], [3,3] ])

ValueError: Number of features of the model must match the input. Model n_features is 5 and input n_features is 2 