In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [15]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv', index_col='movieId')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [27]:
tags.head(10)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
5,2,89774,Tom Hardy,1445715205
6,2,106782,drugs,1445715054
7,2,106782,Leonardo DiCaprio,1445715051
8,2,106782,Martin Scorsese,1445715056
9,7,48516,way too long,1169687325


In [18]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [19]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [28]:
count_vect_genre = CountVectorizer()
X_train_counts_genre = count_vect_genre.fit_transform(movie_genres)

In [29]:
tfidf_transformer_genre = TfidfTransformer()
X_train_tfidf_genre = tfidf_transformer_genre.fit_transform(X_train_counts_genre)

In [30]:
neigh_genre = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean') 
neigh_genre.fit(X_train_tfidf_genre)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=7)

In [109]:
# tags_tag = [change_string(g) for g in tags.tag.values]

In [111]:
# count_vect_tag = CountVectorizer(lowercase=True)
# X_train_counts_tag = count_vect_tag.fit_transform(tags_tag)

In [114]:
# tfidf_transformer_tag = TfidfTransformer()
# X_train_tfidf_tag = tfidf_transformer_tag.fit_transform(X_train_counts_tag)

In [140]:
# neigh_tag = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='euclidean') 
# neigh_tag.fit(X_train_tfidf_tag)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=10)

In [33]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict_genre = count_vect_genre.transform([test])
X_tfidf2_genre = tfidf_transformer_genre.transform(predict_genre)

res_genre = neigh_genre.kneighbors(X_tfidf2_genre, return_distance=True)

In [34]:
res_genre

(array([[0.42079615, 0.53300564, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608]]),
 array([[6774, 9096, 5636, 6723, 3376, 7496, 9717]], dtype=int64))

In [145]:
movies.iloc[res_genre[1][0]]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy
143559,L.A. Slasher (2015),Comedy|Crime|Fantasy
27368,Asterix & Obelix: Mission Cleopatra (Astérix &...,Adventure|Comedy|Fantasy
58972,Nim's Island (2008),Adventure|Comedy|Fantasy
4591,Erik the Viking (1989),Adventure|Comedy|Fantasy
82854,Gulliver's Travels (2010),Adventure|Comedy|Fantasy
188833,The Man Who Killed Don Quixote (2018),Adventure|Comedy|Fantasy


In [141]:
# test_tag = change_string("MMA|drugs|crime")

# predict_tag = count_vect_tag.transform([test_tag])
# X_tfidf2_tag = tfidf_transformer_tag.transform(predict_tag)

# res_tag = neigh_tag.kneighbors(X_tfidf2_tag, return_distance=True)

In [143]:
# res_tag

(array([[0.79869943, 0.96146421, 0.96146421, 0.96146421, 0.96146421,
         0.96146421, 0.96146421, 0.96146421, 0.96146421, 0.96146421]]),
 array([[   4, 2055,  727, 3434, 3603, 2364, 1519, 2662,    6,  891]],
       dtype=int64))

In [144]:
# tags.iloc[res_tag[1][0]]

Unnamed: 0,userId,movieId,tag,timestamp
4,2,89774,MMA,1445715200
2055,474,6002,drugs,1138307168
727,424,296,drugs,1457844550
3434,599,296,drugs,1498456348
3603,599,1732,drugs,1498456286
2364,474,8645,drugs,1138040155
1519,474,1953,drugs,1137368199
2662,477,57669,drugs,1269832564
6,2,106782,drugs,1445715054
891,424,27020,drugs,1457901575


In [154]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [160]:
movies_with_tags.head(3000)

Unnamed: 0_level_0,title,genres,userId,tag,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1.139046e+09
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1.137207e+09
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1.525286e+09
2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1.528844e+09
2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1.528844e+09
...,...,...,...,...,...
2941,South Pacific (1958),Musical|Romance|War,474.0,island,1.138032e+09
2942,Flashdance (1983),Drama|Romance,,,
2943,Indochine (1992),Drama|Romance,474.0,Vietnam,1.138804e+09
2944,"Dirty Dozen, The (1967)",Action|Drama|War,,,


In [173]:
movies_with_tags.dropna(inplace=True)

In [175]:
tag_strings = []
movies_1 = []

for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies_1.append(movie)

  0%|          | 0/1572 [00:00<?, ?it/s]

In [176]:
tag_strings[:5]

['artistic Funny humorous inspiring intelligent quirky romance ZooeyDeschanel',
 'lawyers',
 'creepy suspense',
 'Shakespearesortof',
 'dogs remake']

In [183]:
count_vect_tag_1 = CountVectorizer()
X_train_counts_tag_1 = count_vect_tag_1.fit_transform(tag_strings)

In [184]:
tfidf_transformer_tag_1 = TfidfTransformer()
X_train_tfidf_tag_1 = tfidf_transformer_tag_1.fit_transform(X_train_counts_tag_1)

In [185]:
neigh_tag_1 = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='manhattan') 
neigh_tag_1.fit(X_train_tfidf_tag_1)

NearestNeighbors(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [186]:
tag_strings[822]

'L.A.'

In [188]:
test_tag_1 = change_string('pixar pixar fun')

predict_tag_1 = count_vect_tag_1.transform([test_tag_1])
X_tfidf2_tag_1 = tfidf_transformer_tag_1.transform(predict_tag_1)

res_tag_1 = neigh_tag_1.kneighbors(X_tfidf2_tag_1, return_distance=True)

In [189]:
res_tag_1

(array([[0., 0., 1., 1., 1., 1., 1., 1., 1., 1.]]),
 array([[661, 822, 947, 955, 954, 953, 951, 950, 949, 959]], dtype=int64))

In [190]:
for i in res_tag_1[1][0]:
    print(movies_1[i])

In a Lonely Place (1950)
Magnolia (1999)
Neon Genesis Evangelion: Death & Rebirth (Shin seiki Evangelion Gekijô-ban: Shito shinsei) (1997)
Night and Day (1946)
Nicholas Nickleby (2002)
Niagara (1953)
Never Been Kissed (1999)
Network (1976)
Net, The (1995)
Night of the Hunter, The (1955)


In [191]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Отберем оценки проставленные пользователем с userId = 1

In [198]:
user_with_id_1 = ratings[ratings.userId==1]

Рассчитаем "Средние оценки (+ median, variance, etc.)" пользователя, с помощью describe

In [204]:
user_with_id_1.rating.describe()

count    232.000000
mean       4.366379
std        0.800048
min        1.000000
25%        4.000000
50%        5.000000
75%        5.000000
max        5.000000
Name: rating, dtype: float64

Пользователь проставил 232 оценки, средний балл - 4,366379, стандартное отклонение - 0,800048

In [205]:
user_with_id_1.rating.median() #медиана

5.0

In [207]:
user_with_id_1.rating.var() #variance

0.640076877145844

Отберем оценки проставленные фильму с movieId = 1

In [210]:
movie_with_id_1 = ratings[ratings.movieId==1]

Рассчитаем "Средние оценки (+ median, variance, etc.)" фильма, с помощью describe

In [211]:
movie_with_id_1.rating.describe()

count    215.000000
mean       3.920930
std        0.834859
min        0.500000
25%        3.500000
50%        4.000000
75%        4.500000
max        5.000000
Name: rating, dtype: float64

Фмльм с идентификатором 1 получил 215 оценок, средний балл - 3,920930, стандартное отклонение - 0,834859

In [213]:
movie_with_id_1.rating.median() #медиана

4.0

In [214]:
movie_with_id_1.rating.var() #variance

0.6969897848293848