In [6]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [27]:
from scipy.sparse.linalg import svds

In [29]:
import pickle
import numpy as np

## Предобработка данных

In [2]:
# Load the movie catalogue; the preview below shows movieId, title and
# pipe-delimited genres columns.
movies = pd.read_csv("src/data/movies.csv")

In [3]:
# Bare expression so the notebook renders the frame (pandas truncates the middle rows).
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
87580,292731,The Monroy Affaire (2022),Drama
87581,292737,Shelter in Solitude (2023),Comedy|Drama
87582,292753,Orca (2023),Drama
87583,292755,The Angry Breed (1968),Drama


In [4]:
# Split the pipe-delimited genre string into a list of genre names.
# Values that are already lists are kept unchanged so that re-running this cell
# does not wipe the column (previously a second run turned every row into []);
# anything else (e.g. NaN) becomes an empty list.
movies["genres"] = movies["genres"].apply(
    lambda x: x.split("|") if isinstance(x, str) else (x if isinstance(x, list) else [])
)

In [5]:
# One-hot encode the genres: one 0/1 column per distinct genre label found in
# movies["genres"] (the original note says 19 — TODO confirm against len(mlb.classes_)).
mlb = MultiLabelBinarizer()
# The resulting frame gets a fresh default index, one row per row of `movies`.
genre_vectors = pd.DataFrame(mlb.fit_transform(movies["genres"]), columns=mlb.classes_)

In [6]:
# Put the movie identifiers back next to the genre indicator columns.
# concat(axis=1) aligns on the index, so this relies on `movies` and
# `genre_vectors` sharing the same index labels.
movies_encoded = pd.concat([movies[["movieId", "title"]], genre_vectors], axis=1)

In [8]:
# Bare expression so the notebook renders the encoded frame.
movies_encoded

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87580,292731,The Monroy Affaire (2022),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87581,292737,Shelter in Solitude (2023),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
87582,292753,Orca (2023),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87583,292755,The Angry Breed (1968),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Persist the encoded catalogue without the index column.
# NOTE(review): the input was read from "src/data/" but this writes to "data/" —
# confirm the output directory is intended and exists.
movies_encoded.to_csv("data/movies_encoded.csv", index=False)

## Расчет косинусного сходства

In [10]:
# Pairwise cosine similarity between every pair of genre vectors; row/column i
# corresponds to row i of genre_vectors.
# NOTE(review): the result is a dense N x N array. With the ~87.6k rows shown in
# the preview above that is on the order of tens of GB in float64 — confirm this
# fits in memory, or prefer the NearestNeighbors approach below.
similarity_matrix = cosine_similarity(genre_vectors)

In [18]:
def get_similar_movies_c(movie_id, top_n=10):
    """Return the movies whose genre vectors are most cosine-similar to ``movie_id``.

    Reads the notebook-level ``movies_encoded`` frame and the precomputed
    ``similarity_matrix``; row ``i`` of the matrix is used as the similarity
    profile of row ``i`` (by position) of ``movies_encoded``.

    Args:
        movie_id: value looked up in ``movies_encoded["movieId"]``.
        top_n: maximum number of similar movies to return.

    Returns:
        A list of ``(movieId, title)`` tuples ordered by decreasing similarity;
        equally similar movies keep their catalogue order. The queried movie is
        never part of the result. Returns ``[]`` when ``movie_id`` is unknown.
    """
    # Positional row of the first matching movie (independent of index labels).
    hits = (movies_encoded["movieId"] == movie_id).to_numpy().nonzero()[0]
    if len(hits) == 0:
        return []
    pos = int(hits[0])

    scores = similarity_matrix[pos]

    # Rank every *other* row. The query row is removed explicitly instead of
    # dropping the first sorted element: movies with identical genres also score
    # 1.0, and the stable sort would place any of them that appear earlier in the
    # catalogue ahead of the query movie itself.
    ranked = sorted(
        (i for i in range(len(scores)) if i != pos),
        key=lambda i: scores[i],
        reverse=True,
    )[: max(top_n, 0)]

    return [
        (movies_encoded.iloc[i]["movieId"], movies_encoded.iloc[i]["title"])
        for i in ranked
    ]

## Инициализация модели Nearest Neighbors

In [19]:
# Brute-force nearest-neighbour index over the genre vectors using cosine distance.
nn_model = NearestNeighbors(metric="cosine", algorithm="brute")
# Fit on the DataFrame itself; as the last expression of the cell, the fitted
# estimator is also what the notebook displays.
nn_model.fit(genre_vectors)

In [26]:
def get_similar_movies_nn(movie_id, top_n=10):
    """Return the nearest neighbours of ``movie_id`` according to ``nn_model``.

    Reads the notebook-level ``movies_encoded``, ``genre_vectors`` and the fitted
    ``nn_model``; rows of ``genre_vectors`` and ``movies_encoded`` are matched by
    position.

    Args:
        movie_id: value looked up in ``movies_encoded["movieId"]``.
        top_n: maximum number of similar movies to return.

    Returns:
        A list of ``(movieId, title)`` tuples in the order returned by
        ``nn_model.kneighbors``, with the queried movie removed. Returns ``[]``
        when ``movie_id`` is unknown.
    """
    # Positional row of the first matching movie (independent of index labels).
    hits = (movies_encoded["movieId"] == movie_id).to_numpy().nonzero()[0]
    if len(hits) == 0:
        return []
    pos = int(hits[0])
    wanted = max(top_n, 0)

    # Query with a one-row DataFrame so the column names seen at fit time are kept.
    query = genre_vectors.iloc[[pos]]

    # Ask for one extra neighbour to leave room for the query movie itself, but
    # never more than the number of fitted rows (kneighbors raises otherwise).
    k = min(wanted + 1, len(genre_vectors))
    _, indices = nn_model.kneighbors(query, n_neighbors=k)

    # Drop the query row by identity rather than by position: with duplicate
    # genre vectors (distance 0) the query is not guaranteed to come back first.
    neighbours = [i for i in indices[0] if i != pos][:wanted]

    return [
        (movies_encoded.iloc[i]["movieId"], movies_encoded.iloc[i]["title"])
        for i in neighbours
    ]

## Тесты

In [27]:
get_similar_movies_nn(1) # movieId 1 is "Toy Story (1995)"



[(np.int64(103755), 'Turbo (2013)'),
 (np.int64(215229), 'Trolls World Tour (2020)'),
 (np.int64(247988), 'Luca (2021)'),
 (np.int64(131248), 'Brother Bear 2 (2006)'),
 (np.int64(91355), 'Asterix and the Vikings (Astérix et les Vikings) (2006)'),
 (np.int64(270294), 'Riverdance: The Animated Adventure (2021)'),
 (np.int64(206959), 'Frozen II (2019)'),
 (np.int64(175625), 'The Dragon Spell (2016)'),
 (np.int64(197743), 'Here Comes the Grump (2018)'),
 (np.int64(284739), 'Mavka: The Forest Song (2023)')]

In [23]:
get_similar_movies_c(1) # movieId 1 is "Toy Story (1995)"

[(np.int64(2294), 'Antz (1998)'),
 (np.int64(3114), 'Toy Story 2 (1999)'),
 (np.int64(3754), 'Adventures of Rocky and Bullwinkle, The (2000)'),
 (np.int64(4016), "Emperor's New Groove, The (2000)"),
 (np.int64(4886), 'Monsters, Inc. (2001)'),
 (np.int64(33463), 'DuckTales: The Movie - Treasure of the Lost Lamp (1990)'),
 (np.int64(45074), 'Wild, The (2006)'),
 (np.int64(53121), 'Shrek the Third (2007)'),
 (np.int64(65577), 'Tale of Despereaux, The (2008)'),
 (np.int64(91355), 'Asterix and the Vikings (Astérix et les Vikings) (2006)')]

## SVD

In [35]:
# Load the training ratings and the movie catalogue ("::"-separated .dat files).
ratings = pd.read_csv(
    "./src/data/ratings_train.dat",
    sep="::",
    names=["userId", "movieId", "rating", "timestamp"],
    engine="python",
)
movies = pd.read_csv(
    "./src/data/movies.dat",
    sep="::",
    names=["movieId", "title", "genres"],
    engine="python",
    encoding="latin-1",
)

# User x movie rating table; user/movie pairs without a rating are filled with 0.
user_movie_matrix = ratings.pivot(index="userId", columns="movieId", values="rating").fillna(0)

# Plain numpy array of the same table.
R = user_movie_matrix.values

# Truncated SVD keeping k=50 singular values; sigma is then expanded from a
# vector into a diagonal matrix.
U, sigma, Vt = svds(R, k=50)
sigma = np.diag(sigma)

# Rank-50 reconstruction of the rating table: (U . sigma) . Vt
predicted_ratings = U.dot(sigma).dot(Vt)

# Re-attach the user and movie labels.
predicted_ratings_df = pd.DataFrame(
    predicted_ratings,
    index=user_movie_matrix.index,
    columns=user_movie_matrix.columns,
)

# Persist the predicted-rating table as the "model".
with open("svd_model.pkl", "wb") as f:
    pickle.dump(predicted_ratings_df, f)

print("Обучение завершено! Модель сохранена в svd_model.pkl")


Обучение завершено! Модель сохранена в svd_model.pkl


In [36]:
# Load the pickled prediction table from svd_model.pkl.
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself.
with open("svd_model.pkl", "rb") as f:
    predicted_ratings_df = pickle.load(f)

# Recommendation helper
def recommend_movies(user_id, N=10):
    """Recommend up to ``N`` movies for ``user_id`` from the predicted-rating table.

    Reads the notebook-level ``predicted_ratings_df``, ``ratings`` and ``movies``.

    Args:
        user_id: label looked up in ``predicted_ratings_df.index``.
        N: number of top-scoring candidates taken before titles are attached.

    Returns:
        A DataFrame with columns ``movieId`` and ``title``, highest predicted
        rating first, or the string "Нет данных для пользователя" when the user
        is not in the prediction table.
    """
    known_user = user_id in predicted_ratings_df.index
    if not known_user:
        return "Нет данных для пользователя"

    # Movies this user already has a rating for are excluded from the candidates.
    seen_ids = ratings.loc[ratings["userId"] == user_id, "movieId"].to_numpy()
    candidates = predicted_ratings_df.loc[user_id].drop(index=seen_ids, errors="ignore")

    # Highest predicted ratings first, keep the first N.
    best = candidates.sort_values(ascending=False).head(N).reset_index()

    # Attach titles (inner join on movieId) and keep only the id/title columns.
    return best.merge(movies, on="movieId")[["movieId", "title"]]


In [38]:

# Example: recommendations for user 3
print(recommend_movies(3, N=10))


   movieId                                             title
0     2791                                  Airplane! (1980)
1     2918                   Ferris Bueller's Day Off (1986)
2      919                          Wizard of Oz, The (1939)
3     2628  Star Wars: Episode I - The Phantom Menace (1999)
4     2804                         Christmas Story, A (1983)
5        1                                  Toy Story (1995)
6      110                                 Braveheart (1995)
7     1580                               Men in Black (1997)
8     2716                               Ghostbusters (1984)
9     1307                    When Harry Met Sally... (1989)
