In [1]:
import numpy as np

import pandas as pd

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

from polara import get_movielens_data # https://github.com/evfro/polara


# Подготовка данных
Используется датасет MovieLens-25M (для меньшей нагрузки используйте MovieLens-1M).

In [2]:
# Load the ratings and the movie table (with unsplit genre strings) from a
# local MovieLens archive.
# The path is a raw string: in a regular literal the backslashes start invalid
# escape sequences, which newer Python versions warn about. The resulting path
# is character-for-character the same.
# NOTE(review): this is an absolute, machine-specific path; adjust it locally.
ratings, movies = get_movielens_data(
    r'D:\Пользователь\Downloads\ml-25m.zip',
    get_genres=True, split_genres=False
)

Имеются описания фильмов:

In [3]:
# Index the movie table by `movieid` so that rows can be selected by id with
# `.loc`. The guard keeps the cell re-runnable: calling `set_index('movieid')`
# a second time would raise KeyError, because the column has already been
# moved into the index.
if movies.index.name != 'movieid':
    movies = movies.set_index('movieid')
movies.head()

Unnamed: 0_level_0,movienm,genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


И рейтинги:

In [5]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userid,movieid,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


Количество оценок:

In [6]:
# Total number of rating records (one row per rating).
len(ratings)

25000095

Количество пользователей и фильмов:

In [7]:
# Count the distinct values in each of the two id columns.
ratings[['userid', 'movieid']].apply(lambda id_column: id_column.nunique())

userid     162541
movieid     59047
dtype: int64

In [8]:
# Share of the (distinct users) x (distinct movies) grid that holds a rating,
# printed as a percentage with two decimals.
print(
    "Fraction of known elements: "
    f"{len(ratings) / np.prod(ratings[['userid', 'movieid']].apply(pd.Series.nunique)) * 100:.2f}%"
)

Fraction of known elements: 0.26%


Выбор любимых фильмов:

In [9]:
def find_movie(name: str):
    """Return the rows of `movies` whose title matches `name`, ignoring case.

    `name` is interpreted as a regular expression (the pandas default), so
    characters such as `(` or `.` must be escaped to be matched literally.

    Parameters
    ----------
    name : str
        Pattern searched for anywhere in the `movienm` column.

    Returns
    -------
    pandas.DataFrame
        The matching subset of the global `movies` frame (possibly empty).
    """
    # `case=False` replaces the magic number `flags=2` (== re.IGNORECASE).
    # `na=False` treats missing titles as non-matching, so the mask stays
    # purely boolean and `.loc` does not fail on NA values.
    title_matches = movies.movienm.str.contains(name, case=False, na=False)
    return movies.loc[title_matches]

# Example lookup: every title containing "Ghost" (the match ignores case).
find_movie('Ghost')

Unnamed: 0_level_0,movienm,genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1
587,Ghost (1990),Comedy|Drama|Fantasy|Romance|Thriller
741,Ghost in the Shell (KÃ´kaku kidÃ´tai) (1995),Animation|Sci-Fi
943,"Ghost and Mrs. Muir, The (1947)",Drama|Fantasy|Romance
1049,"Ghost and the Darkness, The (1996)",Action|Adventure
1401,Ghosts of Mississippi (1996),Drama
...,...,...
204162,Little Ghost (1997),Adventure|Children|Fantasy
204936,Pirates of Ghost Island (2007),Horror
206180,Warren Ellis: Captured Ghosts (2011),Documentary
207654,Ghost Ship (2015),Horror


In [10]:
# Example lookup: every title containing "Star Wars".
find_movie('Star Wars')

Unnamed: 0_level_0,movienm,genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi
2628,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Sci-Fi
5378,Star Wars: Episode II - Attack of the Clones (...,Action|Adventure|Sci-Fi|IMAX
33493,Star Wars: Episode III - Revenge of the Sith (...,Action|Adventure|Sci-Fi
61160,Star Wars: The Clone Wars (2008),Action|Adventure|Animation|Sci-Fi
79006,Empire of Dreams: The Story of the 'Star Wars'...,Documentary
109713,Star Wars: Threads of Destiny (2014),Action|Adventure|Sci-Fi
122886,Star Wars: Episode VII - The Force Awakens (2015),Action|Adventure|Fantasy|Sci-Fi|IMAX


In [11]:
# Lower-case query: the search ignores case, so "Pretty ..." titles match too.
find_movie('pretty')

Unnamed: 0_level_0,movienm,genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1
597,Pretty Woman (1990),Comedy|Romance
2145,Pretty in Pink (1986),Comedy|Drama|Romance
4031,All the Pretty Horses (2000),Drama|Romance|Western
6552,Dirty Pretty Things (2002),Crime|Drama|Thriller
8499,Pretty Baby (1978),Drama
26941,"Pretty Village, Pretty Flame (Lepa sela lepo g...",Drama|War
32783,Pretty Maids All in a Row (1971),Comedy|Crime|Thriller
34540,Pretty Persuasion (2005),Comedy|Drama
48035,Pretty Poison (1968),Comedy|Crime|Romance|Thriller
49464,Sitting Pretty (1948),Comedy


In [13]:
# Hand-picked seed movies, given by their `movieid`.
favorite_movies_ids = [
    587,   # Ghost (1990)
    1196,  # Star Wars: Episode V - The Empire Strikes Back
    597,   # Pretty Woman (1990)
    2716,  # Ghostbusters (a.k.a. Ghost Busters) (1984)
]

# Show the selected rows to double-check the ids.
movies.loc[favorite_movies_ids]

Unnamed: 0_level_0,movienm,genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1
587,Ghost (1990),Comedy|Drama|Fantasy|Romance|Thriller
1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
597,Pretty Woman (1990),Comedy|Romance
2716,Ghostbusters (a.k.a. Ghost Busters) (1984),Action|Comedy|Sci-Fi


# RecSys в 3 шага

### 1. Строим разреженную матрицу из данных об оценках

In [14]:
# Build the sparse user-item matrix from (value, (row, column)) triplets.
# Rows are addressed by the raw `userid` and columns by the raw `movieid`,
# so a column index of the matrix can be used directly as a movie id.
# No explicit shape is passed to the constructor.
data_matrix = csr_matrix(
    (
        ratings['rating'].values.astype('f8'),
        (ratings['userid'].values, ratings['movieid'].values),
    )
)

### 2. Вычисляем разреженное SVD

In [15]:
RANK = 50  # number of singular values/vectors to compute

# Slow for a large dataset or a high rank.
# `svds` returns a 3-tuple; only the singular values and the right singular
# vectors are kept. Slicing off the first element instead of unpacking it
# into `_` avoids shadowing IPython's "last output" variable `_`.
S, Vt = svds(data_matrix, k=RANK, return_singular_vectors='vh')[1:]

### 3. Генерируем top-k рекомендаций, основанных на известных предпочтениях пользователя

In [16]:
top_k = 15  # number of recommendations to return


def recommend_top_k(item_factors, liked_ids, k, catalog, exclude_liked=False):
    """Rank catalog items by their score against a set of liked items.

    Parameters
    ----------
    item_factors : numpy.ndarray
        2-D array whose column index is the item id (here: `Vt`).
    liked_ids : sequence of int
        Column indices (movie ids) of the items the user likes.
    k : int
        Maximum number of rows to return.
    catalog : pandas.DataFrame
        Item descriptions indexed by item id (here: `movies`).
    exclude_liked : bool, default False
        If True, the liked items themselves are dropped from the result.
        The default keeps them, as the original one-line expression did.

    Returns
    -------
    pandas.DataFrame
        Up to `k` rows of `catalog`, highest score first.
    """
    liked_ids = list(liked_ids)
    # Sum the factor columns of the liked items, then score every column by
    # its dot product with that sum.
    profile = item_factors[:, liked_ids].sum(axis=1)
    ranked_ids = np.argsort(-(item_factors.T @ profile))
    # Keep only column indices that are actual catalog ids: `.loc` raises
    # KeyError when asked for a label that is not in the index.
    is_candidate = np.isin(ranked_ids, catalog.index.to_numpy())
    if exclude_liked:
        is_candidate &= ~np.isin(ranked_ids, liked_ids)
    return catalog.loc[ranked_ids[is_candidate][:k]]


recommend_top_k(Vt, favorite_movies_ids, top_k, movies)

Unnamed: 0_level_0,movienm,genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1
1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi
597,Pretty Woman (1990),Comedy|Romance
539,Sleepless in Seattle (1993),Comedy|Drama|Romance
587,Ghost (1990),Comedy|Drama|Fantasy|Romance|Thriller
500,Mrs. Doubtfire (1993),Comedy|Drama
357,Four Weddings and a Funeral (1994),Comedy|Romance
1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure
527,Schindler's List (1993),Drama|War


Ура!


Ещё эксперимент:

In [23]:
# Look up the id of "The Golden Calf"; the slice keeps at most five matches.
find_movie('The Golden Calf')[:5]

Unnamed: 0_level_0,movienm,genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1
133361,The Golden Calf (1968),Comedy|Drama


In [19]:
# Look up the id of "Peaceful Warrior"; the slice keeps at most five matches.
find_movie('Peaceful Warrior')[:5]

Unnamed: 0_level_0,movienm,genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1
45503,Peaceful Warrior (2006),Drama


In [22]:
# Second set of seed movies, given by their `movieid`.
favorite_movies_ids_2 = [
    133361,  # The Golden Calf (1968)
    45503,   # Peaceful Warrior (2006)
]

# Sum the factor columns of the seed movies, score every column of `Vt`
# against that sum, and show the ten-longer top list.
seed_profile_2 = Vt[:, favorite_movies_ids_2].sum(axis=1)
ranked_ids_2 = np.argsort(-Vt.T @ seed_profile_2)
movies.loc[ranked_ids_2[:top_k + 10]]

Unnamed: 0_level_0,movienm,genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1
1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi
1265,Groundhog Day (1993),Comedy|Fantasy|Romance
356,Forrest Gump (1994),Comedy|Drama|Romance|War
3147,"Green Mile, The (1999)",Crime|Drama
92259,Intouchables (2011),Comedy|Drama
4995,"Beautiful Mind, A (2001)",Drama|Romance
32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
47610,"Illusionist, The (2006)",Drama|Fantasy|Mystery|Romance
64957,"Curious Case of Benjamin Button, The (2008)",Drama|Fantasy|Mystery|Romance
2011,Back to the Future Part II (1989),Adventure|Comedy|Sci-Fi
