In [1]:
import os

import numpy as np
import pandas as pd
import scipy.sparse as sp

from itertools import islice, cycle, product

from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')

In [2]:
del interactions
del movies_metadata
del interactions_filtered
del users_inv_mapping
del movies_inv_mapping
del movie_name_mapper
del recs

NameError: name 'interactions' is not defined

# 0. Загрузка усеченных данных (для теста функционала модели)

In [4]:
%%time
interactions = pd.read_csv(
    "test_sparse_matrix_dataset_ratings.dat",
    sep="::",
    engine="python",
    header=None,
    names=['userId','movieId','rating','timestamp'],
)
print(interactions.shape)
interactions.head(3)

(1000209, 4)
CPU times: total: 3.11 s
Wall time: 3.11 s


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968


In [5]:
%%time
movies_metadata = pd.read_csv(
    "test_sparse_matrix_dataset_movies.dat",
    sep="::",
    engine="python",
    header=None,
    names=['movieId','title','genres'],
    encoding='latin-1'
)
print(movies_metadata.shape)
movies_metadata.head(3)

(3883, 3)
CPU times: total: 0 ns
Wall time: 7 ms


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


# 1. Загрузка БОЕВЫХ данных
(это долго и больно, поэтому запускать только если уверены)

%%time
interactions = pd.read_csv(
    "dataset/ratings.csv",
    sep=",",
    engine="python"
)
interactions.head(3)

%%time
movies_metadata = pd.read_csv(
    "dataset/movies.csv",
    sep=",",
    engine="python",
    encoding='latin-1'
)

movies_metadata.head(3)

In [6]:
print(f'''interactions: {interactions.shape}
movies_metadata: {movies_metadata.shape}''')

interactions: (1000209, 4)
movies_metadata: (3883, 3)


# ПРЕПРОЦЕССИНГ

In [7]:
movies_metadata.columns = ['id', 'title', 'genres']

In [8]:
movies_metadata['id'] = movies_metadata['id'].astype(str)
interactions['movieId'] = interactions['movieId'].astype(str)

In [9]:
interactions_filtered = interactions.loc[interactions['movieId'].isin(movies_metadata['id'])]
print(interactions.shape, interactions_filtered.shape)

(1000209, 4) (1000209, 4)


In [10]:
movies_metadata.head(3)

Unnamed: 0,id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


#### Функция спарс-матрицы

In [11]:
def get_coo_matrix(
        df: pd.DataFrame, 
        user_col: str,
        item_col: str, 
        users_mapping: dict, 
        movies_mapping: dict,
        weight_col: str = None
        ):
    if weight_col is None:
        weights = np.ones(len(df), dtype=np.float32)
    else:
        weights = df[weight_col].astype(np.float32)
    interaction_matrix = sp.coo_matrix((
        weights, 
        (
            df[user_col].map(users_mapping.get), 
            df[item_col].map(movies_mapping.get)
        )
    ))
    return interaction_matrix

In [12]:
# маппинг пользователей
users_inv_mapping = dict(enumerate(interactions_filtered['userId'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}
len(users_mapping)

6040

In [13]:
# маппинг фильмов
movies_inv_mapping = dict(enumerate(interactions_filtered['movieId'].unique()))
movies_mapping = {v: k for k, v in movies_inv_mapping.items()}
len(movies_mapping)

3706

In [14]:
%%time
train_mat = get_coo_matrix(
    interactions_filtered,
    user_col = 'userId',
    item_col = 'movieId',
    users_mapping = users_mapping,
    movies_mapping = movies_mapping
    ).tocsr()

CPU times: total: 328 ms
Wall time: 328 ms


In [15]:
train_mat.size

1000209

# Обучение модельки - KNN - CosineRecommender

In [16]:
from implicit.nearest_neighbours import (
    CosineRecommender,
    BM25Recommender,
    TFIDFRecommender
    )

In [17]:
cosine_model = CosineRecommender(K = 20)
cosine_model.fit(train_mat.T)

  0%|          | 0/6040 [00:00<?, ?it/s]

#### Оценка модели

In [18]:
top_N = 10
user_id = interactions_filtered['userId'].iloc[0]
row_id = users_mapping[user_id]
print(f'Rekko for user {user_id}, row number in matrix - {row_id}')

Rekko for user 1, row number in matrix - 0


In [19]:
# маппинг для movieId and title
movie_name_mapper = dict(zip(movies_metadata['id'], movies_metadata['title']))

In [20]:
recs = cosine_model.recommend(
    row_id,
    train_mat,
    N = top_N,
    filter_already_liked_items = True
    )
recs = pd.DataFrame(recs).T.rename(columns = {0: 'col_id', 1: 'similarity'})
recs['inv_movie_id'] = recs['col_id'].astype(int)
recs['movieId'] = recs['inv_movie_id'].map(movies_inv_mapping.get)
recs['title'] = recs['movieId'].map(movie_name_mapper)

recs

Unnamed: 0,col_id,similarity,inv_movie_id,movieId,title
0,2261.0,1.513339,2261,2582.0,Twin Dragons (Shuang long hui) (1992)
1,3649.0,1.512462,3649,396.0,Fall Time (1995)
2,3028.0,1.468497,3028,3133.0,Go West (1925)
3,2528.0,1.438797,2528,3371.0,Bound for Glory (1976)
4,4385.0,1.41865,4385,,
5,1263.0,1.404489,1263,3556.0,"Virgin Suicides, The (1999)"
6,3362.0,1.321948,3362,2197.0,Firelight (1997)
7,1319.0,1.305901,1319,3422.0,She's Gotta Have It (1986)
8,1245.0,1.303988,1245,2148.0,House (1986)
9,1883.0,1.254805,1883,23.0,Assassins (1995)
