In [23]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [24]:
# Read the four input tables from CSV files in the current working directory.
links, movies, ratings, tags = map(
    pd.read_csv,
    ('links.csv', 'movies.csv', 'ratings.csv', 'tags.csv'),
)

In [25]:
# (rows, columns) of each table that feeds the merge below.
tuple(frame.shape for frame in (movies, tags, ratings))

((9742, 3), (3683, 4), (100836, 4))

In [26]:
# Attach tags to movies, then attach ratings on the (movie, user) pair.
# Both joins are outer, so unmatched rows from either side are kept with NaN
# in the columns they lack; the two `timestamp` columns get pandas' default
# `_x` (tags) / `_y` (ratings) suffixes.
df = pd.merge(
    pd.merge(movies, tags, on='movieId', how='outer'),
    ratings,
    on=['movieId', 'userId'],
    how='outer',
)
df.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp_x,rating,timestamp_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0,4.0,1122227000.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0,4.0,978575800.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0,3.5,1525286000.0
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0,4.0,1528844000.0
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0,4.0,1528844000.0


In [27]:
# One '|'-separated label string per row: the movie's genres followed by that row's tag.
# Rows missing either part end up with a missing value in the new column.
df['tags_genres'] = df['genres'] + '|' + df['tag']
df.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp_x,rating,timestamp_y,tags_genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0,4.0,1122227000.0,Adventure|Animation|Children|Comedy|Fantasy|pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0,4.0,978575800.0,Adventure|Animation|Children|Comedy|Fantasy|pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0,3.5,1525286000.0,Adventure|Animation|Children|Comedy|Fantasy|fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0,4.0,1528844000.0,Adventure|Children|Fantasy|fantasy
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0,4.0,1528844000.0,Adventure|Children|Fantasy|magic board game


In [28]:
# Size of the merged frame before incomplete rows are dropped.
df.shape

(111054, 9)

In [29]:
# Keep only fully populated rows: a NaN in any column removes the row.
# This mutates `df` in place, so every later cell sees the reduced frame.
df.dropna(axis=0, how='any', inplace=True)
df.shape

(3476, 9)

In [109]:
# Mean rating for every distinct (movieId, tags_genres) pair, flattened back
# into ordinary columns: movieId, tags_genres, rating.
df_target = (
    df.groupby(['movieId', 'tags_genres'])['rating']
    .mean()
    .reset_index()
)

In [110]:
# Preview the aggregated (movieId, tags_genres, mean rating) rows.
df_target.head()

Unnamed: 0,movieId,tags_genres,rating
0,1,Adventure|Animation|Children|Comedy|Fantasy|fun,3.5
1,1,Adventure|Animation|Children|Comedy|Fantasy|pixar,4.0
2,2,Adventure|Children|Fantasy|Robin Williams,4.0
3,2,Adventure|Children|Fantasy|fantasy,4.0
4,2,Adventure|Children|Fantasy|game,3.0


In [111]:
# Number of distinct (movieId, tags_genres) pairs and columns.
df_target.shape

(3391, 3)

In [112]:
def change_string(s):
    """Convert a '|'-separated label string into space-separated tokens.

    Spaces and hyphens inside ``s`` are deleted, so a multi-word label collapses
    into a single token (``'magic board game'`` -> ``'magicboardgame'``), and
    every ``'|'`` separator becomes a single space.

    Args:
        s: String of labels separated by ``'|'``.

    Returns:
        The labels as one string, separated by single spaces.
    """
    # One pass over the string: '|' maps to ' ', while ' ' and '-' are dropped.
    table = str.maketrans('|', ' ', ' -')
    return s.translate(table)

In [113]:
# Normalise every genres+tag string into a space-separated token string,
# then show the first ten results together with the total count.
summary_vector = list(map(change_string, df_target['tags_genres'].values))
summary_vector[:10], len(summary_vector)

(['Adventure Animation Children Comedy Fantasy fun',
  'Adventure Animation Children Comedy Fantasy pixar',
  'Adventure Children Fantasy RobinWilliams',
  'Adventure Children Fantasy fantasy',
  'Adventure Children Fantasy game',
  'Adventure Children Fantasy magicboardgame',
  'Comedy Romance moldy',
  'Comedy Romance old',
  'Comedy pregnancy',
  'Comedy remake'],
 3391)

### Перевод в вектор

In [114]:
# Bag-of-words counts: one row per token string, one column per vocabulary term.
# NOTE(review): CountVectorizer runs with its defaults here; those presumably
# lowercase tokens, so e.g. 'Fantasy' and 'fantasy' would share one column — confirm.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=summary_vector)
X_train_counts.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [115]:
# Distinct term-count values present anywhere in the count matrix.
np.unique(X_train_counts.toarray())

array([0, 1, 2], dtype=int64)

In [116]:
# Re-weight the raw term counts with TF-IDF (transformer left at its defaults).
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X=X_train_counts)

In [117]:
# Distinct TF-IDF weights present in the matrix (densified for np.unique).
np.unique(X_train_tfidf.toarray())

array([0.        , 0.10912547, 0.11136562, ..., 0.98177269, 0.98344049,
       1.        ])

In [118]:
# How many distinct TF-IDF weights there are.
np.unique(X_train_tfidf.toarray()).shape

(6104,)

In [119]:
# (number of token strings, vocabulary size) of the TF-IDF matrix.
X_train_tfidf.shape

(3391, 1439)

### Добавляем средний рейтинг в вектор

In [123]:
# Densify the TF-IDF matrix and attach each row's mean rating as one extra
# trailing column. This rebinds `summary_vector`, which until now held the
# list of token strings fed to CountVectorizer.
# NOTE(review): the rating column is on a different scale from the TF-IDF
# weights — confirm whether it should be rescaled before any distance-based use.
summary_vector = np.column_stack(
    (X_train_tfidf.toarray(), df_target['rating'].to_numpy())
)

summary_vector.shape

(3391, 1440)

In [21]:
# NOTE(review): `avg_movie_rating` is not defined in any cell visible above, and this
# cell's execution count (21) predates the cells around it; its recorded row count
# (3476) also differs from the current X_train_tfidf (3391 rows). Likely a stale
# cell — confirm, then remove it or restore the missing definition.
X_train_tfidf.shape, avg_movie_rating.shape

((3476, 1439), (1464, 2))

### Средние оценки пользователей

In [15]:
# Mean rating per user, as a frame with columns `userId` and `rating`.
# The 'rating' column is selected *before* aggregating. The previous
# `.mean('rating')` did not select a column: the string was bound to
# GroupBy.mean's first positional parameter (`numeric_only`) and only worked
# because a non-empty string is truthy.
# NOTE(review): `df` holds one row per (movie, user, tag), so a rating is counted
# once per tag the user attached — confirm this weighting is intended.
avg_user_rating = (df
    .groupby('userId')['rating']
    .mean()
    .reset_index())

In [16]:
# Preview the per-user mean ratings.
avg_user_rating.head()

Unnamed: 0,userId,rating
0,2.0,5.0
1,7.0,1.0
2,18.0,4.125
3,21.0,0.5
4,49.0,4.5


### Поиск активных пользователей