### Домашнее задание по теме «Рекомендации на основе содержания»
1. Использовать dataset MovieLens
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
- TF-IDF на тегах и жанрах
- Средние оценки (+ median, variance, etc.) пользователя и фильма
3. Оценить RMSE на тестовой выборке

In [2]:
import pandas as pd
import numpy as np
import statistics
from sklearn.model_selection import train_test_split

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from scipy.sparse import hstack
from sklearn.metrics import classification_report

%matplotlib inline

In [3]:
links = pd.read_csv('/Users/kseniagorchakova/Library/Mobile Documents/com~apple~CloudDocs/Education/!Profession_DS/Рекомендательные системы/Лекция_2/links.csv')
movies = pd.read_csv('/Users/kseniagorchakova/Library/Mobile Documents/com~apple~CloudDocs/Education/!Profession_DS/Рекомендательные системы/Лекция_2/movies.csv')
ratings = pd.read_csv('/Users/kseniagorchakova/Library/Mobile Documents/com~apple~CloudDocs/Education/!Profession_DS/Рекомендательные системы/Лекция_2/ratings.csv')
tags = pd.read_csv('/Users/kseniagorchakova/Library/Mobile Documents/com~apple~CloudDocs/Education/!Profession_DS/Рекомендательные системы/Лекция_2/tags.csv')

In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


#### Добавим к данным о жанрах информацию о тегах

In [7]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [8]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [9]:
movies_with_tags['genres'] = [change_string(g) for g in movies_with_tags.genres.values]

In [10]:
# Удалим пустые значения в тегах
movies_with_tags.dropna(inplace=True)

In [11]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure Children Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure Children Fantasy,62.0,magic board game,1528844000.0


In [20]:
movies_with_tags.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3683 entries, 0 to 9732
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   movieId    3683 non-null   int64  
 1   title      3683 non-null   object 
 2   genres     3683 non-null   object 
 3   userId     3683 non-null   float64
 4   tag        3683 non-null   object 
 5   timestamp  3683 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 201.4+ KB


#### Добавим к объединенным данным (фильмы+жанры) данные об оценках пользователей

In [19]:
movies_with_tags_est = pd.merge(movies_with_tags, ratings, on=['movieId', 'userId'])
movies_with_tags_est.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp_x,rating,timestamp_y
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar,1139046000.0,4.0,1122227329
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar,1137207000.0,4.0,978575760
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun,1525286000.0,3.5,1525286001
3,2,Jumanji (1995),Adventure Children Fantasy,62.0,fantasy,1528844000.0,4.0,1528843890
4,2,Jumanji (1995),Adventure Children Fantasy,62.0,magic board game,1528844000.0,4.0,1528843890


In [21]:
movies_with_tags_est.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3476 entries, 0 to 3475
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   movieId      3476 non-null   int64  
 1   title        3476 non-null   object 
 2   genres       3476 non-null   object 
 3   userId       3476 non-null   float64
 4   tag          3476 non-null   object 
 5   timestamp_x  3476 non-null   float64
 6   rating       3476 non-null   float64
 7   timestamp_y  3476 non-null   int64  
dtypes: float64(3), int64(2), object(3)
memory usage: 244.4+ KB


In [24]:
# Посчитаем среднее значение оценки по фильму
avg_movie = movies_with_tags_est.groupby(['movieId'])['rating'].mean()
avg_movie.head()

movieId
1    3.833333
2    3.750000
3    2.500000
5    1.500000
7    3.000000
Name: rating, dtype: float64

In [26]:
# Посчитаем среднее значение оценки по пользователю
avg_user = movies_with_tags_est.groupby(['userId'])['rating'].mean()
avg_user.head()

userId
2.0     5.000
7.0     1.000
18.0    4.125
21.0    0.500
49.0    4.500
Name: rating, dtype: float64

In [61]:
# Добавим средние оценки по фильму в датафрейм
movies_with_tags_avg = pd.merge(movies_with_tags_est, avg_movie, on=['movieId'])
movies_with_tags_avg = movies_with_tags_avg.rename(columns = {'rating_y' : 'avg_movie'})

In [62]:
movies_with_tags_avg.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp_x,rating_x,timestamp_y,avg_movie
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar,1139046000.0,4.0,1122227329,3.833333
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar,1137207000.0,4.0,978575760,3.833333
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun,1525286000.0,3.5,1525286001,3.833333
3,2,Jumanji (1995),Adventure Children Fantasy,62.0,fantasy,1528844000.0,4.0,1528843890,3.75
4,2,Jumanji (1995),Adventure Children Fantasy,62.0,magic board game,1528844000.0,4.0,1528843890,3.75


In [63]:
# Добавим средние оценки по пользователю в датафрейм
movies_with_tags_avg_user_movie = pd.merge(movies_with_tags_avg, avg_user, on=['userId'])
movies_with_tags_avg_user_movie = movies_with_tags_avg_user_movie.rename(columns = {'rating' : 'avg_user'})

In [64]:
movies_with_tags_avg_user_movie.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp_x,rating_x,timestamp_y,avg_movie,avg_user
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar,1139046000.0,4.0,1122227329,3.833333,3.777778
1,552,"Three Musketeers, The (1993)",Action Adventure Comedy Romance,336.0,knights,1139046000.0,3.0,1120568085,3.0,3.777778
2,1246,Dead Poets Society (1989),Drama,336.0,highschool,1139047000.0,4.5,1139046758,4.5,3.777778
3,32587,Sin City (2005),Action Crime FilmNoir Mystery Thriller,336.0,cult,1139047000.0,4.0,1139046729,4.0,3.777778
4,33660,Cinderella Man (2005),Drama Romance,336.0,boksdrama,1139046000.0,4.5,1139046157,4.25,3.777778


In [65]:
movies_with_tags_avg_user_movie.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3476 entries, 0 to 3475
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   movieId      3476 non-null   int64  
 1   title        3476 non-null   object 
 2   genres       3476 non-null   object 
 3   userId       3476 non-null   float64
 4   tag          3476 non-null   object 
 5   timestamp_x  3476 non-null   float64
 6   rating_x     3476 non-null   float64
 7   timestamp_y  3476 non-null   int64  
 8   avg_movie    3476 non-null   float64
 9   avg_user     3476 non-null   float64
dtypes: float64(5), int64(2), object(3)
memory usage: 298.7+ KB


In [66]:
# Создаем датафрейм для модели и убираем ненужные столбцы
data = movies_with_tags_avg_user_movie.drop(['title', 'userId', 'timestamp_x', 'timestamp_y'], axis=1)

In [67]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3476 entries, 0 to 3475
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   movieId    3476 non-null   int64  
 1   genres     3476 non-null   object 
 2   tag        3476 non-null   object 
 3   rating_x   3476 non-null   float64
 4   avg_movie  3476 non-null   float64
 5   avg_user   3476 non-null   float64
dtypes: float64(3), int64(1), object(2)
memory usage: 190.1+ KB


#### Построим предсказательную модель

In [68]:
# Рассчитаем tfidf для текстовых столбцов
tfidf = TfidfVectorizer()
tag = tfidf.fit_transform((data['tag']))
genres = tfidf.fit_transform((data['genres']))

In [69]:
# Объединим столбцы
x = hstack([tag, genres, data[['movieId', 'rating_x', 'avg_movie', 'avg_user']]])

In [70]:
y = data['rating_x']

In [71]:
# Разделим выборку на тестовую и обучающую
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [72]:
# Обучаем модель
model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression()

In [73]:
# Делаем предсказание и выводим результат
pred = model.predict(x_test)
print(mean_squared_error(pred, y_test, squared = False))

0.001293941011392972


In [74]:
model.score(x_test,y_test)

0.9999977746571536