### Домашняя работа по теме "Рекомендации на основе содержания"

1. Использовать dataset MovieLens https://grouplens.org/datasets/movielens/latest/


2. Построить рекомендации ( предсказываем оценку)
на фичах:
* TF IDF на тегах и жанрах
* Средние оценки median, variance, etc.) пользователя и фильма

3. Оценить RMSE на тестовой выборке

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')                 
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [4]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
tags['dt'] = tags['timestamp'].apply(lambda t: datetime.fromtimestamp(t))
tags['year'] = tags['dt'].dt.year
tags['month'] = tags['dt'].dt.month
tags['year_month'] = tags['year'].astype(str) + '-' + tags['month'].astype(str)

In [8]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

Рассмотрим средние оценки по фильмам

In [9]:
user_rating = ratings
user_rating = user_rating[['movieId', 'rating']]
movie_table = pd.merge(user_rating, movies, on='movieId', how='outer')
movie_table = movie_table[['movieId', 'rating', 'genres']]
movie_table = movie_table[movie_table['rating'].notna()]
movie_table['genres'] = movie_table['genres'].apply(lambda x: change_string(x))
movie_rating = movie_table.groupby('movieId')['rating'].median()
movie_table = pd.merge(movie_rating, movies, on='movieId', how='outer')
movie_table = movie_table[movie_table['rating'].notna()]
movie_table

Unnamed: 0,movieId,rating,title,genres
0,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,3.5,Jumanji (1995),Adventure|Children|Fantasy
2,3,3.0,Grumpier Old Men (1995),Comedy|Romance
3,4,3.0,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,3.0,Father of the Bride Part II (1995),Comedy
...,...,...,...,...
9719,193581,4.0,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9720,193583,3.5,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9721,193585,3.5,Flint (2017),Drama
9722,193587,3.5,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


Рассмотрим отдельно взятого пользователя (его оценки, тэги и т.д.)

In [10]:
def user_table(i):
    user_rating = ratings.loc[ratings['userId'] == i]
    user_rating = user_rating[['userId', 'movieId', 'rating']]
    user_tags = tags.loc[tags['userId'] == i]
    user_tags = user_tags[['movieId', 'tag']]
    user = pd.merge(user_rating, user_tags, on='movieId', how='outer')
    fin = pd.merge(user, movies, on='movieId', how='outer')
    fin = fin[['userId', 'movieId', 'rating', 'tag', 'genres']]
    fin = fin[fin['rating'].notna()]
    fin['genres'] = fin['genres'].apply(lambda x: change_string(x))
    fin['userId'] = fin['userId'].apply(lambda x: int(x))
    return fin

In [11]:
features = movie_table.iloc[:, [-1]]
results = movie_table.iloc[:, 1]

In [12]:
vec = TfidfVectorizer()

In [13]:
mat = vec.fit_transform(movie_table['genres'])

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
cos_sim = cosine_similarity(mat)

Мы получили матрицу "похожести" фильмов, далее мы можем рассмотреть какой-то фильм/набор жанров и найти похожий (получить рекомендацию)

Для простоты рассмотрим 1 фильм в списке. Можно заметить, что мы никак не учитываем медианную оценку, для простоты будем рассматривать произведение коэффициента на оценку фильма

In [15]:
cos_sim = pd.DataFrame(np.row_stack(cos_sim))

In [16]:
rating = movie_table.iloc[:, 1].tolist()

In [17]:
cos_sim_mod = cos_sim.mul(rating, axis=0)
cos_sim_mod

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9714,9715,9716,9717,9718,9719,9720,9721,9722,9723
0,4.000000,3.254110,0.610509,0.539995,1.070069,0.000000,0.610509,2.618372,0.000000,1.049490,...,1.442148,1.862705,0.785730,2.065362,0.0,2.721633,3.023926,0.000000,1.684882,1.070069
1,2.847346,3.500000,0.000000,0.000000,0.000000,0.000000,0.000000,2.816224,0.000000,1.128792,...,0.000000,0.000000,0.000000,0.000000,0.0,1.195151,1.327897,0.000000,0.000000,0.000000
2,0.457882,0.000000,3.000000,2.653501,1.711596,0.000000,3.000000,0.000000,0.000000,0.000000,...,0.488162,0.000000,1.256791,0.000000,0.0,0.545131,0.605678,0.000000,0.000000,1.711596
3,0.404997,0.000000,2.653501,3.000000,1.513908,0.000000,2.653501,0.000000,0.000000,0.000000,...,0.431780,0.604621,2.061758,0.000000,0.0,0.482168,0.535723,1.399618,0.000000,1.513908
4,0.802552,0.000000,1.711596,1.513908,3.000000,0.000000,1.711596,0.000000,0.000000,0.000000,...,0.855626,0.000000,2.202840,0.000000,0.0,0.955477,1.061603,0.000000,0.000000,3.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9719,2.721633,1.365887,0.726841,0.642891,1.273970,0.957008,0.726841,0.000000,1.743288,0.963735,...,2.397312,2.217642,0.935451,2.458914,0.0,4.000000,3.600132,0.000000,3.014185,1.273970
9720,2.645935,1.327897,0.706625,0.625010,1.238537,0.000000,0.706625,0.000000,0.000000,0.000000,...,1.669193,2.155962,0.909433,2.390524,0.0,3.150115,3.500000,0.000000,1.950143,1.238537
9721,0.000000,0.000000,0.000000,1.632887,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.511965,2.375964,0.000000,0.0,0.000000,0.000000,3.500000,0.000000,0.000000
9722,1.474272,0.000000,0.000000,0.000000,0.000000,1.111255,0.000000,0.000000,2.024264,1.119067,...,2.361791,2.575073,0.000000,2.855233,0.0,2.637412,1.950143,0.000000,3.500000,0.000000


Теперь мы можем рассмотреть несколько (в данном случае 5) самых больших значений в колонке, соответствующей 1 фильму

In [18]:
best5 = cos_sim_mod.nlargest(6, [0])[1:]
best5

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9714,9715,9716,9717,9718,9719,9720,9721,9722,9723
9526,4.817765,4.221498,0.0,0.0,0.0,0.0,0.0,3.396767,0.0,1.361484,...,1.474951,2.416454,0.0,2.679357,0.0,3.088599,3.431652,0.0,2.185768,0.0
7899,4.544952,3.300189,0.839543,0.742576,1.471508,0.0,0.839543,2.140735,0.0,0.0,...,1.983172,2.561503,1.080499,2.840187,0.0,3.742659,4.158358,0.0,2.31697,1.471508
8656,4.544952,3.300189,0.839543,0.742576,1.471508,0.0,0.839543,2.140735,0.0,0.0,...,1.983172,2.561503,1.080499,2.840187,0.0,3.742659,4.158358,0.0,2.31697,1.471508
9518,4.544952,3.300189,0.839543,0.742576,1.471508,0.0,0.839543,2.140735,0.0,0.0,...,1.983172,2.561503,1.080499,2.840187,0.0,3.742659,4.158358,0.0,2.31697,1.471508
9542,4.544952,3.300189,0.839543,0.742576,1.471508,0.0,0.839543,2.140735,0.0,0.0,...,1.983172,2.561503,1.080499,2.840187,0.0,3.742659,4.158358,0.0,2.31697,1.471508


После чего вывести их названия

In [19]:
best5_ind = best5.index.values.tolist()
movie_table.loc[best5_ind]

Unnamed: 0,movieId,rating,title,genres
9526,172793,5.0,Vovka in the Kingdom of Far Far Away (1965),Adventure|Animation|Children|Fantasy
7899,95311,5.0,Presto (2008),Animation|Children|Comedy|Fantasy
8656,121781,5.0,Stuart Little 3: Call of the Wild (2005),Animation|Children|Comedy|Fantasy
9518,172577,5.0,Last Year's Snow Was Falling (1983),Animation|Children|Comedy|Fantasy
9542,173351,5.0,Wow! A Talking Fish! (1983),Animation|Children|Comedy|Fantasy


Это все можно записать в отдельную функцию

In [20]:
def sim_movie(n, k):
    best = cos_sim_mod.nlargest(k+1, [0])[1:]
    best_ind = best.index.values.tolist()
    best_table = movie_table.loc[best_ind]
    return best_table.iloc[:,2]

In [21]:
sim_movie(1, 5)

9526    Vovka in the Kingdom of Far Far Away (1965)
7899                                  Presto (2008)
8656       Stuart Little 3: Call of the Wild (2005)
9518            Last Year's Snow Was Falling (1983)
9542                    Wow! A Talking Fish! (1983)
Name: title, dtype: object

TF-idf

In [22]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(movie_table, test_size=0.2)

In [23]:
train_genres = [change_string(g) for g in train.genres.values]
test_genres = [change_string(g) for g in test.genres.values]

In [24]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train.genres.tolist())

In [25]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [26]:
from sklearn.naive_bayes import MultinomialNB

In [27]:
y_train = train.rating
y_test = test.rating
y_train = y_train.apply(lambda x: 1*x)
y_test = y_test.apply(lambda x: 1*x)

In [28]:
mnb = MultinomialNB()
mnb.fit(X_train_tfidf, y_train.astype(int))

MultinomialNB()

In [29]:
X_test_counts = count_vect.fit_transform(test.genres.tolist())
X_test_tfidf = tfidf_transformer.fit_transform(X_test_counts)

In [30]:
y_pred = mnb.predict(X_test_tfidf)

In [31]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report

In [32]:
y_train = y_train.tolist()
y_train = list(map(int, y_train))
y_test = y_test.tolist()
y_test = list(map(int, y_test))

In [33]:
rms = mean_squared_error(y_test, y_pred, squared=False)
rms

0.9633382404704482

In [34]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.93      0.44      0.59      1779
           4       0.14      0.48      0.21       166
           5       0.00      0.00      0.00         0

    accuracy                           0.44      1945
   macro avg       0.18      0.15      0.13      1945
weighted avg       0.86      0.44      0.56      1945



  _warn_prf(average, modifier, msg_start, len(result))
