### Домашнее задание по теме «Рекомендации на основе содержания»
1. Использовать dataset MovieLens
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
- TF-IDF на тегах и жанрах
- Средние оценки (+ median, variance, etc.) пользователя и фильма
3. Оценить RMSE на тестовой выборке

In [308]:
import pandas as pd
import numpy as np
import statistics
from sklearn.model_selection import train_test_split

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from scipy.sparse import hstack
from sklearn.metrics import classification_report

%matplotlib inline

In [114]:
links = pd.read_csv('/Users/kseniagorchakova/Library/Mobile Documents/com~apple~CloudDocs/Education/!Profession_DS/Рекомендательные системы/Лекция_2/links.csv')
movies = pd.read_csv('/Users/kseniagorchakova/Library/Mobile Documents/com~apple~CloudDocs/Education/!Profession_DS/Рекомендательные системы/Лекция_2/movies.csv')
ratings = pd.read_csv('/Users/kseniagorchakova/Library/Mobile Documents/com~apple~CloudDocs/Education/!Profession_DS/Рекомендательные системы/Лекция_2/ratings.csv')
tags = pd.read_csv('/Users/kseniagorchakova/Library/Mobile Documents/com~apple~CloudDocs/Education/!Profession_DS/Рекомендательные системы/Лекция_2/tags.csv')

In [115]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [116]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [117]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


#### Добавим к данным о жанрах информацию о тегах

In [177]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [179]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [180]:
movies_with_tags['genres'] = [change_string(g) for g in movies_with_tags.genres.values]

In [181]:
# Удалим пустые значения в тегах
movies_with_tags.dropna(inplace=True)

In [182]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure Children Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure Children Fantasy,62.0,magic board game,1528844000.0


In [129]:
movies_with_tags.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3683 entries, 0 to 9732
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   movieId    3683 non-null   int64  
 1   title      3683 non-null   object 
 2   genres     3683 non-null   object 
 3   userId     3683 non-null   float64
 4   tag        3683 non-null   object 
 5   timestamp  3683 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 201.4+ KB


#### Сгруппируем датафрейм по id фильму и его названию

In [130]:
# Удалить дату и id пользователя
movies_with_tags_groupby = movies_with_tags.drop(['userId', 'timestamp'], axis=1)

In [187]:
# Сгруппируем данные
movies_with_tags_groupby = movies_with_tags.groupby(['movieId', 'title', 'genres'], as_index=False)['tag'].apply(lambda x: ' '.join(x)).reset_index()

In [188]:
movies_with_tags_groupby.head()

Unnamed: 0,index,movieId,title,genres,tag
0,0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun
1,1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game Robin Williams game
2,2,3,Grumpier Old Men (1995),Comedy Romance,moldy old
3,3,5,Father of the Bride Part II (1995),Comedy,pregnancy remake
4,4,7,Sabrina (1995),Comedy Romance,remake


#### Посчитаем статистики для каждого фильма (среднее значение, медиана, дисперсия)

In [190]:
movie_rating = ratings.drop(['userId', 'timestamp'], axis=1)
movie_rating.head()

Unnamed: 0,movieId,rating
0,1,4.0
1,3,4.0
2,6,4.0
3,47,5.0
4,50,5.0


In [191]:
ratings_groupby = movie_rating.groupby(['movieId'], as_index=False).agg([np.mean, np.median, np.var])
ratings_groupby.head()

Unnamed: 0_level_0,rating,rating,rating
Unnamed: 0_level_1,mean,median,var
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,3.92093,4.0,0.69699
2,3.431818,3.5,0.777419
3,3.259615,3.0,1.112651
4,2.357143,3.0,0.72619
5,3.071429,3.0,0.822917


In [192]:
ratings_groupby = ratings_groupby.reset_index()
ratings_groupby.columns = [' '.join(col).strip() for col in ratings_groupby.columns.values]

In [193]:
ratings_groupby.head()

Unnamed: 0,movieId,rating mean,rating median,rating var
0,1,3.92093,4.0,0.69699
1,2,3.431818,3.5,0.777419
2,3,3.259615,3.0,1.112651
3,4,2.357143,3.0,0.72619
4,5,3.071429,3.0,0.822917


#### Соединим данные по фильмам и их оценкам от пользователей

In [194]:
movies_tags_est = movies_with_tags_groupby.join(ratings_groupby.set_index('movieId'), on='movieId')

In [195]:
movies_tags_est.head()

Unnamed: 0,index,movieId,title,genres,tag,rating mean,rating median,rating var
0,0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun,3.92093,4.0,0.69699
1,1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game Robin Williams game,3.431818,3.5,0.777419
2,2,3,Grumpier Old Men (1995),Comedy Romance,moldy old,3.259615,3.0,1.112651
3,3,5,Father of the Bride Part II (1995),Comedy,pregnancy remake,3.071429,3.0,0.822917
4,4,7,Sabrina (1995),Comedy Romance,remake,3.185185,3.0,0.955625


In [196]:
movies_tags_est.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1572 entries, 0 to 1571
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          1572 non-null   int64  
 1   movieId        1572 non-null   int64  
 2   title          1572 non-null   object 
 3   genres         1572 non-null   object 
 4   tag            1572 non-null   object 
 5   rating mean    1554 non-null   float64
 6   rating median  1554 non-null   float64
 7   rating var     1395 non-null   float64
dtypes: float64(3), int64(2), object(3)
memory usage: 98.4+ KB


In [197]:
# Создаем датафрейм без пустых значений
data = movies_tags_est.dropna()

In [198]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1395 entries, 0 to 1570
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          1395 non-null   int64  
 1   movieId        1395 non-null   int64  
 2   title          1395 non-null   object 
 3   genres         1395 non-null   object 
 4   tag            1395 non-null   object 
 5   rating mean    1395 non-null   float64
 6   rating median  1395 non-null   float64
 7   rating var     1395 non-null   float64
dtypes: float64(3), int64(2), object(3)
memory usage: 98.1+ KB


#### Построим предсказательную модель

In [297]:
# Рассчитаем tfidf для текстовых столбцов
tfidf = TfidfVectorizer()
tag = tfidf.fit_transform((data['tag']))
genres = tfidf.fit_transform((data['genres']))

In [299]:
# Объединим столбцы
x = hstack([tag, genres, data[['movieId', 'rating median', 'rating var']]])

In [300]:
y = data['rating mean']

In [301]:
# Разделим выборку на тестовую и обучающую
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [302]:
# Обучаем модель
model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression()

In [309]:
# Делаем предсказание и выводим результат
pred = model.predict(x_test)
print(mean_squared_error(pred, y_test))

0.05445214949807785


In [312]:
model.score(x_test,y_test)

0.7694746286067153