## Загружаем необходимые библиотеки

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

## Загружаем данные

Собраны данные:

ratings
- rating: рейтинг фильма

movies
- title: название фильма
- genres: жанры

tags
- tag: теги


In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

Построим рекомендации (регрессию, предскажем оценку) на фичах:
- TF-IDF на тегах и жанрах;
- средние оценки пользователя и фильма.

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [8]:
def change_string(s):
    return s.replace(' ', '').replace('-', '')

In [9]:
movie_genres = [change_string(g) for g in movies.genres.values]
movie_genres[:10]

['Adventure|Animation|Children|Comedy|Fantasy',
 'Adventure|Children|Fantasy',
 'Comedy|Romance',
 'Comedy|Drama|Romance',
 'Comedy',
 'Action|Crime|Thriller',
 'Comedy|Romance',
 'Adventure|Children',
 'Action',
 'Action|Adventure|Thriller']

In [10]:
tfidf_genres = TfidfVectorizer()
X_genres = tfidf_genres.fit_transform(movie_genres)
X_genres

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 22084 stored elements and shape (9742, 20)>

In [11]:
tfidf_genres.get_feature_names_out()

array(['action', 'adventure', 'animation', 'children', 'comedy', 'crime',
       'documentary', 'drama', 'fantasy', 'filmnoir', 'horror', 'imax',
       'musical', 'mystery', 'nogenreslisted', 'romance', 'scifi',
       'thriller', 'war', 'western'], dtype=object)

In [12]:
pd.DataFrame(X_genres.toarray(), columns=tfidf_genres.get_feature_names_out())

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.000000,0.482990,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,0.000000,0.593662,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,0.466405,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.000000,0.575034,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9738,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.000000,0.638968,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9739,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9740,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [13]:
def change_string_tags(s):
    return s.replace('-', '')

In [14]:
movies_with_tags = movies.merge(tags, on='movieId')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932


In [15]:
movies_with_tags.tag.unique()

array(['pixar', 'fun', 'fantasy', ..., 'star wars', 'gintama', 'remaster'],
      shape=(1589,), dtype=object)

In [16]:
movies_with_tags.dropna(inplace=True)

In [17]:
movies_with_tags.title.unique().shape

(1572,)

In [18]:
tag_strings = []
movies_strings = []

for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([change_string_tags(s) for s in group.tag.values]))
    movies_strings.append(movie)

  0%|          | 0/1572 [00:00<?, ?it/s]

In [19]:
tfidf_tags = TfidfVectorizer()
X_tags = tfidf_tags.fit_transform(tag_strings)

In [20]:
movie_means = ratings.groupby('movieId')['rating'].mean().reset_index()
movie_means.rename(columns={'rating': 'movie_mean'}, inplace=True)

In [21]:
user_means = ratings.groupby('userId')['rating'].mean().reset_index()
user_means.rename(columns={'rating': 'user_mean'}, inplace=True)

In [22]:
data = movies_with_tags.merge(movie_means, on='movieId')
data.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,movie_mean
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764,3.92093
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825,3.92093
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013,3.92093
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929,3.431818
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932,3.431818


In [23]:
data = data.merge(user_means, on='userId')
data.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,movie_mean,user_mean
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764,3.92093,4.321429
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825,3.92093,3.398956
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013,3.92093,2.245455
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929,3.431818,4.081967
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932,3.431818,4.081967


In [24]:
X_genres_df = pd.DataFrame(X_genres.toarray(), index=movies['movieId'])
X_genres_df.head()


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
movies_tags_grouped = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join([change_string_tags(t) for t in x])).reset_index()

In [26]:
tfidf_tags = TfidfVectorizer()
X_tags = tfidf_tags.fit_transform(movies_tags_grouped['tag'])

In [27]:
X_tags_df = pd.DataFrame(X_tags.toarray(), index=movies_tags_grouped['movieId'].values)
X_tags_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1738,1739,1740,1741,1742,1743,1744,1745,1746,1747
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
data = data.merge(X_genres_df, left_on='movieId', right_index=True)
data = data.merge(X_tags_df, left_on='movieId', right_index=True)
data.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,movie_mean,user_mean,0_x,1_x,...,1738,1739,1740,1741,1742,1743,1744,1745,1746,1747
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764,3.92093,4.321429,0.0,0.416846,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825,3.92093,3.398956,0.0,0.416846,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013,3.92093,2.245455,0.0,0.416846,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929,3.431818,4.081967,0.0,0.512361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932,3.431818,4.081967,0.0,0.512361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
data_for_model = ratings.merge(data, on=['userId','movieId'], how='left')

X = data_for_model.drop(columns=['rating', 'timestamp', 'title', 'genres', 'tag'], errors='ignore')

y = data_for_model['rating']

data_for_model.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,title,genres,tag,timestamp_y,movie_mean,user_mean,...,1738,1739,1740,1741,1742,1743,1744,1745,1746,1747
0,1,1,4.0,964982703,,,,,,,...,,,,,,,,,,
1,1,3,4.0,964981247,,,,,,,...,,,,,,,,,,
2,1,6,4.0,964982224,,,,,,,...,,,,,,,,,,
3,1,47,5.0,964983815,,,,,,,...,,,,,,,,,,
4,1,50,5.0,964982931,,,,,,,...,,,,,,,,,,


In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [32]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import HistGradientBoostingRegressor #работает с Nan

model = HistGradientBoostingRegressor(max_iter=50, max_depth=5, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

0,1,2
,loss,'squared_error'
,quantile,
,learning_rate,0.1
,max_iter,50
,max_leaf_nodes,31
,max_depth,5
,min_samples_leaf,20
,l2_regularization,0.0
,max_features,1.0
,max_bins,255


In [38]:
#RMSE на тестовой выборке
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print('RMSE модели: ', rmse)
print("Вывод: модель предсказывает рейтинги фильмов с ошибкой примерно ±1 балл на шкале от 0.5 до 5.0. Это приемлемый результат.\nДля реального использования рекомендаций этого часто достаточно, потому что пользователю важен ранжированный список фильмов, а не точность конкретной цифры рейтинга.")


RMSE модели:  0.9586811896375981
Вывод: модель предсказывает рейтинги фильмов с ошибкой примерно ±1 балл на шкале от 0.5 до 5.0. Это приемлемый результат.
Для реального использования рекомендаций этого часто достаточно, потому что пользователю важен ранжированный список фильмов, а не точность конкретной цифры рейтинга.


In [34]:
#Функция для рекомендации похожего фильма - content-based рекомендация
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(X_genres, X_genres)


def get_recommendations(title, cosine_sim, movies_df, top_n=5):
    # Индекс фильма
    idx = movies_df[movies_df['title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    movie_indices = [i[0] for i in sim_scores]
    
    return movies_df['title'].iloc[movie_indices]

print("Рекомендованные фильмы (похожие) для тех кто смотрел фильм Matrix, The (1999):\n")
print(get_recommendations("Matrix, The (1999)", cosine_sim, movies))


Рекомендованные фильмы (похожие) для тех кто смотрел фильм Matrix, The (1999):

68           Screamers (1995)
144    Johnny Mnemonic (1995)
296         Virtuosity (1995)
336            Timecop (1994)
474       Blade Runner (1982)
Name: title, dtype: object
