Использовать dataset MovieLens
Построить рекомендации (регрессия, предсказываем оценку) на фичах:
TF-IDF на тегах и жанрах
Средние оценки (+ median, variance, etc.) пользователя и фильма
Оценить RMSE на тестовой выборке

In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline


In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')


In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [6]:
# соберем общий датасет из полученных данных
df = pd.merge(left=movies, right=ratings[['userId', 'movieId','rating']], how='inner', on='movieId')
df = pd.merge(left=df, right=tags[['movieId', 'tag']], how='inner', on='movieId')

In [7]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,fun
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,pixar
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,pixar


In [8]:
# Отберем топ 10 пользователей, поставивших оценки

df[['userId','rating']].groupby('userId').count().sort_values('rating', ascending=False)[:10]

Unnamed: 0_level_0,rating
userId,Unnamed: 1_level_1
474,2455
414,2343
599,2100
68,1791
610,1701
274,1665
249,1635
448,1575
380,1536
305,1460


In [9]:
# посмотрим на пользователя, поставившего максимальное число оценок

df[df.userId == 474].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2455 entries, 492 to 212440
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  2455 non-null   int64  
 1   title    2455 non-null   object 
 2   genres   2455 non-null   object 
 3   userId   2455 non-null   int64  
 4   rating   2455 non-null   float64
 5   tag      2455 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 134.3+ KB


In [10]:
user_474_index = df[df.userId == 474].index

Получим значения tfidf для жанров и тегов

In [11]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [12]:
movie_genres = [change_string(g) for g in df.genres.values]
tag_strings = [change_string(g) for g in df.tag.values]

In [13]:
genres_vectorizer, tag_vectorizer = TfidfVectorizer(), TfidfVectorizer()
X_movies_tfidf = genres_vectorizer.fit_transform(movie_genres)
X_tag_tfidf = tag_vectorizer.fit_transform(tag_strings)

In [14]:
def get_df_from_csr(csr, prefix):
    cols = [ f'{prefix}_{i}' for i in range(csr.shape[1])] 
    return pd.DataFrame( csr.todense(), columns=cols )

In [15]:
X_tag_tfidf[user_474_index]

<2455x1469 sparse matrix of type '<class 'numpy.float64'>'
	with 2481 stored elements in Compressed Sparse Row format>

In [16]:
df.iloc[user_474_index, :].userId.unique()[0] == 474

True

In [17]:
df_474 = pd.concat([
    df[['movieId', 'userId', 'rating']].iloc[user_474_index, :], 
    get_df_from_csr(X_movies_tfidf[user_474_index], prefix='movie_tfidf').set_index(user_474_index), 
    get_df_from_csr(X_tag_tfidf[user_474_index], prefix='tag_tfidf').set_index(user_474_index), 
    ], axis=1)

In [18]:
sum(df_474.isna().sum())

0

In [19]:
df_474.drop_duplicates(subset=['movieId', 'userId', 'rating'], inplace=True)
df_474.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1198 entries, 492 to 212437
Columns: 1492 entries, movieId to tag_tfidf_1468
dtypes: float64(1490), int64(2)
memory usage: 13.6 MB


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

In [21]:
df_cols = list( set(df_474.columns) - set(['rating']) )
X_train, X_test, y_train, y_test = train_test_split(df_474[df_cols], df_474['rating'])

In [22]:
model = CatBoostRegressor(verbose=False)
model.fit(X_train, y_train);

In [23]:
print(f'train rmse : {np.sqrt(mean_squared_error(model.predict(X_train), y_train))}')
y_pred_first_iter = model.predict(X_test)
print(f'test rmse : {np.sqrt(mean_squared_error(y_pred_first_iter, y_test))}')


train rmse : 0.5345330605278764
test rmse : 0.7859633495048656


Добавим средние оценки для пользователя и фильма

In [24]:
df_movie_mean_rating = pd.DataFrame(df[['movieId', 'rating']].groupby('movieId').mean())
df_movie_mean_rating.columns = ['movie_mean_rating']

In [25]:
df_user_mean_rating = pd.DataFrame(df[['userId', 'rating']].groupby('userId').mean())
df_user_mean_rating.columns = ['user_mean_rating']

In [26]:
df = pd.merge(left=df, right=df_movie_mean_rating, how='inner', on='movieId')
df = pd.merge(left=df, right=df_user_mean_rating, how='inner', on='userId')
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,tag,movie_mean_rating,user_mean_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,pixar,3.92093,4.040472
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,pixar,3.92093,4.040472
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,fun,3.92093,4.040472
3,3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,moldy,3.259615,4.040472
4,3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,old,3.259615,4.040472


In [27]:
# соберем финальный датасет, используя полученные с помощью tf-idf оценки 
user_474_index = df[df.userId == 474].index
df_474 = pd.concat([
    df[['movieId', 'userId', 'rating', 'movie_mean_rating', 'user_mean_rating']].iloc[user_474_index, :], 
    get_df_from_csr(X_movies_tfidf[user_474_index], prefix='movie_tfidf').set_index(user_474_index), 
    get_df_from_csr(X_tag_tfidf[user_474_index], prefix='tag_tfidf').set_index(user_474_index), 
    ], axis=1)

In [28]:
df_474.head()

Unnamed: 0,movieId,userId,rating,movie_mean_rating,user_mean_rating,movie_tfidf_0,movie_tfidf_1,movie_tfidf_2,movie_tfidf_3,movie_tfidf_4,...,tag_tfidf_1459,tag_tfidf_1460,tag_tfidf_1461,tag_tfidf_1462,tag_tfidf_1463,tag_tfidf_1464,tag_tfidf_1465,tag_tfidf_1466,tag_tfidf_1467,tag_tfidf_1468
88596,1,474,4.0,3.92093,3.778004,0.0,0.0,0.0,0.0,0.791136,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88597,1,474,4.0,3.92093,3.778004,0.0,0.0,0.0,0.0,0.791136,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88598,1,474,4.0,3.92093,3.778004,0.0,0.0,0.0,0.0,0.791136,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88599,2,474,3.0,3.431818,3.778004,0.0,0.0,0.0,0.0,0.791136,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88600,2,474,3.0,3.431818,3.778004,0.0,0.0,0.0,0.0,0.791136,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
df_cols = list( set(df_474.columns) - set(['rating']) )
X_train, X_test, y_train, y_test = train_test_split(df_474[df_cols], df_474['rating'])

In [30]:
model = CatBoostRegressor(verbose=False)
model.fit(X_train, y_train);

In [31]:
print(f'train rmse : {np.sqrt(mean_squared_error(model.predict(X_train), y_train))}')
y_pred = model.predict(X_test)
print(f'test rmse : {np.sqrt(mean_squared_error(y_pred, y_test))}')

train rmse : 0.34157128189756814
test rmse : 0.5312216338295165
