## Загружаем необходимые библиотеки

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

## Загружаем данные

Собраны данные:

ratings
- rating: рейтинг фильма

movies
- title: название фильма
- genres: жанры

tags
- tag: теги


In [None]:
# Load the four source tables from CSV files in the current working directory.
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

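Построим рекомендательную модель как задачу регрессии — будем предсказывать оценку, которую пользователь поставит фильму, — на следующих признаках:
- TF-IDF по тегам и жанрам фильма;
- средняя оценка пользователя и средняя оценка фильма.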

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

In [None]:
# Preview the first rows of the links table.
links.head()

In [None]:
# Preview the first rows of the movies table.
movies.head()

In [None]:
# Preview the first rows of the tags table.
tags.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [None]:
def change_string(s):
    """Return *s* with every space and hyphen removed.

    Used on the genre strings so that a multi-part name such as
    ``Sci-Fi`` or ``Film Noir`` stays a single token when the string is
    tokenised later.
    """
    # One pass over the string: delete both characters via a translation table.
    return s.translate(str.maketrans('', '', ' -'))

In [None]:
# One cleaned genre string per row of `movies`, in the same order.
movie_genres = list(map(change_string, movies.genres.values))
movie_genres[:10]

In [None]:
# Fit a TF-IDF vectorizer (default settings) on the cleaned genre strings.
# X_genres has one row per entry of movie_genres, i.e. per row of `movies`.
tfidf_genres = TfidfVectorizer()
X_genres = tfidf_genres.fit_transform(movie_genres)
X_genres

In [None]:
# Terms learned from the genre strings, one per column of X_genres.
tfidf_genres.get_feature_names_out()

In [None]:
# Show the genre TF-IDF matrix as a dense table with the learned terms as column names.
pd.DataFrame(X_genres.toarray(), columns=tfidf_genres.get_feature_names_out())

In [None]:
def change_string_tags(s):
    """Return *s* with every hyphen removed; spaces are left untouched."""
    # Splitting on the hyphen and gluing the pieces back drops each occurrence.
    return ''.join(s.split('-'))

In [None]:
# Pair every tag row with the metadata of its movie (default inner join on movieId).
movies_with_tags = pd.merge(movies, tags, on='movieId')
movies_with_tags.head()

In [None]:
# Distinct tag values present in the merged frame.
movies_with_tags.tag.unique()

In [None]:
# Discard rows that have a missing value in any column.
movies_with_tags = movies_with_tags.dropna()

In [None]:
# Shape of the array of distinct titles left in movies_with_tags (its single entry is their count).
movies_with_tags.title.unique().shape

In [None]:
tag_strings, movies_strings = [], []

# For each title, build one space-separated "document" out of its cleaned tags
# and remember the title at the same list position.
for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join(map(change_string_tags, group.tag.values)))
    movies_strings.append(movie)

In [None]:
# Fit a TF-IDF vectorizer on the per-title tag documents built above.
# NOTE(review): tfidf_tags and X_tags are reassigned further down (fit on tags
# grouped by movieId) before anything reads these values — this fit looks redundant.
tfidf_tags = TfidfVectorizer()
X_tags = tfidf_tags.fit_transform(tag_strings)

In [None]:
# Average rating received by each movie, as a two-column frame: movieId, movie_mean.
movie_means = (
    ratings.groupby('movieId')['rating']
    .mean()
    .reset_index()
    .rename(columns={'rating': 'movie_mean'})
)

In [None]:
# Average rating given by each user, as a two-column frame: userId, user_mean.
user_means = (
    ratings.groupby('userId')['rating']
    .mean()
    .reset_index()
    .rename(columns={'rating': 'user_mean'})
)

In [None]:
# Add each movie's mean rating to the tagged-movie rows (inner join on movieId).
data = pd.merge(movies_with_tags, movie_means, on='movieId')
data.head()

In [None]:
# Add the mean rating of the user found in each row (inner join on userId).
data = pd.merge(data, user_means, on='userId')
data.head()

In [None]:
# Dense genre TF-IDF matrix, one row per movie, indexed by movieId so it can
# be joined on that key.
# Columns are named after the vectorizer's terms with a `genre_` prefix. The
# default integer labels (0, 1, 2, ...) are unreadable and collide with the
# integer labels of the tag matrix when both are merged into one frame,
# which forces pandas to invent `_x` / `_y` suffixes.
X_genres_df = pd.DataFrame(
    X_genres.toarray(),
    index=movies['movieId'],
    columns=[f'genre_{term}' for term in tfidf_genres.get_feature_names_out()],
)
X_genres_df.head()


In [None]:
# One "document" per movieId: all of its tags, hyphens removed, joined by spaces.
# Rows with a missing tag are dropped first: NaN is a float and has no
# .replace(), so a single empty tag would otherwise abort the whole groupby.
# str() likewise guards against tags that were parsed as numbers.
movies_tags_grouped = (
    tags.dropna(subset=['tag'])
    .groupby('movieId')['tag']
    .apply(lambda movie_tags: ' '.join(change_string_tags(str(t)) for t in movie_tags))
    .reset_index()
)

In [None]:
# Fit the tag TF-IDF on one document per movieId; this rebinds tfidf_tags and
# X_tags, so X_tags rows now follow the row order of movies_tags_grouped.
tfidf_tags = TfidfVectorizer()
X_tags = tfidf_tags.fit_transform(movies_tags_grouped['tag'])

In [None]:
# Dense tag TF-IDF matrix, one row per tagged movie, indexed by movieId.
# Columns are named after the vectorizer's terms with a `tag_` prefix so they
# stay readable and cannot collide with the genre feature columns (or with
# each other's default integer labels) when merged into one frame.
X_tags_df = pd.DataFrame(
    X_tags.toarray(),
    index=movies_tags_grouped['movieId'].values,
    columns=[f'tag_{term}' for term in tfidf_tags.get_feature_names_out()],
)
X_tags_df.head()

In [None]:
# Append the genre and then the tag TF-IDF columns, matching the row's movieId
# against the movieId index of each feature frame (inner joins).
data = pd.merge(data, X_genres_df, left_on='movieId', right_index=True)
data = pd.merge(data, X_tags_df, left_on='movieId', right_index=True)
data.head()

In [None]:
# Assemble the modelling table: exactly one row per rating, with per-movie and
# per-user features attached through their own keys.
#
# The tag-level `data` frame has one row per (user, movie, tag). Joining
# ratings to it on (userId, movieId) matched only the ratings whose user had
# also tagged that movie, so every feature was NaN for all other ratings, and
# a rating was repeated once per tag. Any column present in both frames (e.g.
# a `timestamp` on each side) also came back suffixed and slipped past the
# drop list below.
#
# Left joins keep every rating; movies without tags simply get NaN tag
# features. The explicit suffixes only matter when the genre and tag frames
# share column labels.
# NOTE(review): movie_mean / user_mean are computed from all ratings,
# including rows that later land in the test split; compute them on the
# training part only if an unbiased RMSE is needed.
data_for_model = (
    ratings
    .merge(movie_means, on='movieId', how='left')
    .merge(user_means, on='userId', how='left')
    .merge(X_genres_df, left_on='movieId', right_index=True, how='left')
    .merge(X_tags_df, left_on='movieId', right_index=True, how='left',
           suffixes=('_genre', '_tag'))
)

# Everything except the target and non-feature columns is a model input.
X = data_for_model.drop(columns=['rating', 'timestamp', 'title', 'genres', 'tag'], errors='ignore')
y = data_for_model['rating']

data_for_model.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Cast every column label to str so both frames carry uniformly typed feature names.
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import HistGradientBoostingRegressor  # accepts NaN in the features

# Small gradient-boosting regressor: 50 boosting iterations, trees up to depth 5.
model = HistGradientBoostingRegressor(
    max_iter=50,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# RMSE on the held-out test set
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))

print('RMSE модели:', rmse)
# NOTE(review): the conclusion below is a fixed string and is printed whatever
# value rmse actually has — re-check it whenever the data or model changes.
print("Вывод: модель предсказывает рейтинги фильмов с ошибкой примерно ±1 балл на шкале от 0.5 до 5.0. Это приемлемый результат.\nДля реального использования рекомендаций этого часто достаточно, потому что пользователю важен ранжированный список фильмов, а не точность конкретной цифры рейтинга.")


In [None]:
# Content-based recommendation: find movies similar to a given one
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the genre TF-IDF rows; entry [i, j]
# compares the i-th and j-th rows of X_genres.
cosine_sim = cosine_similarity(X_genres, X_genres)

def get_recommendations(title, cosine_sim, movies_df, top_n=5):
    """Return the titles of the movies most similar to *title*.

    Args:
        title: Exact title to look up in ``movies_df['title']``. If several
            rows share the title, the first one is used.
        cosine_sim: Square similarity matrix whose row/column ``i`` refers
            to the ``i``-th row (by position) of ``movies_df``.
        movies_df: DataFrame with a ``title`` column.
        top_n: Maximum number of recommendations to return.

    Returns:
        A Series with up to ``top_n`` titles, most similar first. Movies
        with equal similarity keep their original row order. The query
        movie itself is never included.

    Raises:
        IndexError: If *title* does not occur in ``movies_df``.
    """
    # Resolve the movie by row position, not by index label: the similarity
    # matrix is positional, so a filtered or re-indexed frame must still
    # line up with it.
    positions = (movies_df['title'] == title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f'title not found in movies_df: {title!r}')
    idx = int(positions[0])

    scores = cosine_sim[idx]
    # Exclude the query movie explicitly. Slicing off the first sorted entry
    # is not enough: other movies can tie with it at similarity 1.0 (e.g.
    # identical genre sets), and an earlier tied row would take slot 0,
    # letting the query movie be recommended to itself.
    candidates = (i for i in range(len(scores)) if i != idx)
    # sorted() is stable, so ties stay in ascending row order.
    ranked = sorted(candidates, key=lambda i: scores[i], reverse=True)
    movie_indices = ranked[:max(top_n, 0)]

    return movies_df['title'].iloc[movie_indices]

# Demo: print the recommendations for one hard-coded title using the genre-based similarity.
print("Рекомендованные фильмы (похожие) для тех кто смотрел фильм Matrix, The (1999):\n")
print(get_recommendations("Matrix, The (1999)", cosine_sim, movies))
