# Домашнее задание по теме «Рекомендации на основе содержания»
1. Использовать dataset MovieLens
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
* TF-IDF на тегах и жанрах
* Средние оценки (+ median, variance, etc.) пользователя и фильма
3. Оценить RMSE на тестовой выборке

In [1]:
# Загружаем необходимые библиотеки
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
# Загружаем данные
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Загружаем данные
tags = pd.read_csv('tags.csv')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [4]:
# Загружаем данные
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Объединяем данные фильмов и рейтинга
movies_ratings = movies.merge(ratings, on='movieId', how='left')
movies_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [6]:
# Объединяем данные фильмов и рейтинга с тегами
movies_with_tags = movies_ratings.merge(tags, on='movieId', how='left')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId_x,rating,timestamp_x,userId_y,tag,timestamp_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982703.0,336.0,pixar,1139046000.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982703.0,474.0,pixar,1137207000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982703.0,567.0,fun,1525286000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847434962.0,336.0,pixar,1139046000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847434962.0,474.0,pixar,1137207000.0


In [7]:
# Функция преобразования данных в строку, разделённую пробелами
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [8]:
# Преобразовываем перечень жанров в строки, разделённые пробелами
movies_with_tags['genres'] = movies_with_tags['genres'].apply(change_string)
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId_x,rating,timestamp_x,userId_y,tag,timestamp_y
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1.0,4.0,964982703.0,336.0,pixar,1139046000.0
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1.0,4.0,964982703.0,474.0,pixar,1137207000.0
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1.0,4.0,964982703.0,567.0,fun,1525286000.0
3,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,5.0,4.0,847434962.0,336.0,pixar,1139046000.0
4,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,5.0,4.0,847434962.0,474.0,pixar,1137207000.0


In [9]:
# Удаляем теги-дубликаты и отсутствующие значения
movies_with_tags.tag.unique()
movies_with_tags.dropna(inplace=True)

In [10]:
# Преобразуем все жанры в вектора TF-IDF
tfidf_1 = TfidfVectorizer()
tfidf_matrix_1 = tfidf_1.fit_transform(movies_with_tags.genres)
genres = tfidf_1.get_feature_names()
tfidf_matrix_1 = pd.DataFrame(tfidf_matrix_1.toarray(), columns=genres)
tfidf_matrix_1.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0.0,0.343558,0.546036,0.532407,0.249918,0.0,0.0,0.0,0.487746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.343558,0.546036,0.532407,0.249918,0.0,0.0,0.0,0.487746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.343558,0.546036,0.532407,0.249918,0.0,0.0,0.0,0.487746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.343558,0.546036,0.532407,0.249918,0.0,0.0,0.0,0.487746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.343558,0.546036,0.532407,0.249918,0.0,0.0,0.0,0.487746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Преобразуем все теги в вектора TF-IDF
tfidf_2 = TfidfVectorizer()
tfidf_matrix_2 = tfidf_2.fit_transform(movies_with_tags.tag)
tag = tfidf_2.get_feature_names()
tfidf_matrix_2 = pd.DataFrame(tfidf_matrix_2.toarray(), columns=tag)
tfidf_matrix_2.head()

Unnamed: 0,06,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001,250,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Объединим данные в один датафрейм
tfidf_matrix = pd.concat([tfidf_matrix_1, tfidf_matrix_2], axis=1)
tfidf_matrix.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,0.0,0.343558,0.546036,0.532407,0.249918,0.0,0.0,0.0,0.487746,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.343558,0.546036,0.532407,0.249918,0.0,0.0,0.0,0.487746,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.343558,0.546036,0.532407,0.249918,0.0,0.0,0.0,0.487746,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.343558,0.546036,0.532407,0.249918,0.0,0.0,0.0,0.487746,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.343558,0.546036,0.532407,0.249918,0.0,0.0,0.0,0.487746,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Разделяем данные на тестовые и тренировочные (используем только сведения о жанрах)
X = tfidf_matrix_1
y = movies_with_tags.rating
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y, test_size=0.30, random_state=42)

In [14]:
# Строим модель логистической регрессии
model = LinearRegression()
model.fit(X_train_1, y_train_1)

LinearRegression()

In [15]:
# Осуществляем предсказание
y_pred = model.predict(X_test_1)

In [16]:
# Оцениваем RMSE на тестовой выборке
mean_squared_error(y_test_1, y_pred)

0.9099184605837752

In [17]:
# Разделяем данные на тестовые и тренировочные (используем сведения о жанре и тегах)
X = tfidf_matrix
y = movies_with_tags.rating
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [18]:
# Строим модель логистической регрессии
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [19]:
# Осуществляем предсказание
y_pred = model.predict(X_test)

In [20]:
# Оцениваем RMSE на тестовой выборке
mean_squared_error(y_test, y_pred)

3.1960387215560307e+19

Качество модели значительно снизилось. Это может быть связано с тем, что сведения о тегах необходимо дополнительно обработать, так как их большое количество и некоторые из них "задваиваются" (например, zombie и zombie). 

In [21]:
# Определяем для каждого фильма среднюю оценку, количество оценок
movies_agg = movies_with_tags.groupby(by='movieId').agg(['mean', 'count']).rating.reset_index()
movies_agg.head()

Unnamed: 0,movieId,mean,count
0,1,3.92093,645
1,2,3.431818,440
2,3,3.259615,104
3,5,3.071429,98
4,7,3.185185,54


In [22]:
# Объединяем данные фильмов и рейтинга
movies_ratings_agg = movies_with_tags.merge(movies_agg, on='movieId', how='left')
tfidf_matrix = pd.concat([tfidf_matrix_1, movies_ratings_agg[['mean', 'count']]], axis=1)
tfidf_matrix.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,...,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western,mean,count
0,0.0,0.343558,0.546036,0.532407,0.249918,0.0,0.0,0.0,0.487746,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,645
1,0.0,0.343558,0.546036,0.532407,0.249918,0.0,0.0,0.0,0.487746,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,645
2,0.0,0.343558,0.546036,0.532407,0.249918,0.0,0.0,0.0,0.487746,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,645
3,0.0,0.343558,0.546036,0.532407,0.249918,0.0,0.0,0.0,0.487746,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,645
4,0.0,0.343558,0.546036,0.532407,0.249918,0.0,0.0,0.0,0.487746,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,645


In [23]:
# Разделяем данные на тестовые и тренировочные (используем только сведения о жанрах)
X = tfidf_matrix
y = movies_with_tags.rating
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [24]:
# Строим модель логистической регрессии
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [25]:
# Осуществляем предсказание
y_pred = model.predict(X_test)

In [26]:
# Оцениваем RMSE на тестовой выборке
mean_squared_error(y_test, y_pred)

0.8281014417511796

Качество предсказания оценки улучшилось.

Построим рекомендацию для одного пользователя

In [27]:
# Преобразуем данные тегов и векторизуем их
def change_string(s):
    return str(s).replace(' ', '').replace('-', '').lower()

tag_strings = []
movies = []

for movie, group in tqdm_notebook(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([change_string(s) for s in group.tag.values]))
    movies.append(movie)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movie, group in tqdm_notebook(movies_with_tags.groupby('title')):


  0%|          | 0/1554 [00:00<?, ?it/s]

In [28]:
# Векторизуем теги
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(tag_strings)

In [29]:
# Обучаем модель ближайших соседей
neigh = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='manhattan') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [30]:
test = 'highschool pixar'

X_tfidf_test = tfidf.transform([test])
result = neigh.kneighbors(X_tfidf_test, return_distance=True)
result

(array([[0.86703482, 1.13296518, 1.13296518, 1.13296518, 1.13296518,
         1.13296518, 1.13296518, 1.13296518, 1.13296518, 1.13296518]]),
 array([[ 210,  540,  584,  941,  331,  427,  930,  541,  434, 1287]],
       dtype=int64))

In [31]:
# Рекомендуем фильмы
for i in result[1][0]:
    print(movies[i])

Bug's Life, A (1998)
Grease (1978)
Heathers (1989)
Never Been Kissed (1999)
Dead Poets Society (1989)
Fast Times at Ridgemont High (1982)
Napoleon Dynamite (2004)
Grease 2 (1982)
Ferris Bueller's Day Off (1986)
Stand and Deliver (1988)
