In [1]:
import pandas as pd
from pandarallel import pandarallel
import numpy as np
from tqdm import tqdm
import datetime
from pprint import pprint
from time import time

from rectools.dataset.interactions import Interactions
from rectools.dataset import Dataset
from rectools import Columns


from implicit.nearest_neighbours import TFIDFRecommender, BM25Recommender

from rectools.models import ImplicitItemKNNWrapperModel, RandomModel, PopularModel
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.model_selection.time_split import TimeRangeSplitter




In [2]:
tqdm.pandas()
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Загрузка, знакомство, подготовка

**Взаимодействия пользователей с фильмами**

In [3]:
data = pd.read_csv('ml-latest/ratings.csv')

In [4]:
data.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
32929398,321972,109487,4.0,1581583188
11189845,110292,348,3.0,835532205
33358911,326492,4878,4.5,1478892765
14223118,139666,5218,3.5,1442040791
31592259,309143,2396,4.0,1119559190


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [6]:
print(f'Испльзовано памяти: {data.memory_usage(deep=True).sum() / 1024 / 1024:.2f}mb')

Испльзовано памяти: 1032.48mb


In [7]:
#оптимизация хранения данных
data['userId'] = data['userId'].astype('int32')
data['movieId'] = data['movieId'].astype('int32')
data['rating'] = data['rating'].astype('float16')
#изменение хранения дат 
data['timestamp'] = pd.to_datetime(data['timestamp'].parallel_apply(lambda x: pd.Timestamp(x, unit='s').date()))

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype         
---  ------     -----         
 0   userId     int32         
 1   movieId    int32         
 2   rating     float16       
 3   timestamp  datetime64[ns]
dtypes: datetime64[ns](1), float16(1), int32(2)
memory usage: 580.8 MB


In [9]:
print(f'Испльзовано памяти: {data.memory_usage(deep=True).sum() / 1024 / 1024:.2f}mb')

Испльзовано памяти: 580.77mb


**Названия фильмов и imbdID**

In [10]:
movies_ml = pd.read_csv('ml-latest/movies.csv')

In [11]:
movies_ml.sample(5)

Unnamed: 0,movieId,title,genres
86060,287353,Patterns of Evidence: Journey to Mount Sinai (...,Documentary
58795,197313,A Geisha (1953),Drama
76934,254867,Heir (2021),Horror
15738,82854,Gulliver's Travels (2010),Adventure|Comedy|Fantasy
53953,186975,Love Is a Gun (1994),Crime|Drama|Mystery


In [12]:
links_ml = pd.read_csv('ml-latest/links.csv')

In [13]:
links_ml.sample(5)

Unnamed: 0,movieId,imdbId,tmdbId
72394,233989,8360266,572292.0
71121,228745,1708459,78059.0
44964,168174,6090102,432959.0
81105,272055,921766,159457.0
1438,1483,115964,884.0


## Постановка задачи и baseline

Используя историю взаимодействий пользователей с объектами создать двухэтапную рекомендательную модель, которая значительно превзойдёт базовую по совокупности метрик Serendipity и MeanInvUserFreq (способность удивлять непопулярными релевантными объектами) на валидации, а также удовлетворит меня в ходе тестирования (выборочный визуальный анализ).

Валидировать будем на данных за последние 40 дней, разбив их на 5 фолдов по 8 дней. Реализовывать кросс-валидацию будем средствами библиотеки RecTools (https://github.com/MobileTeleSystems/RecTools). 

In [14]:
# Для более эффективного использования сохраним данные о взаимодействиях в классе Interactions 
data.columns = [Columns.User, Columns.Item, Columns.Weight, Columns.Datetime]
interactions = Interactions(data)
interactions.df.sample(2)

Unnamed: 0,user_id,item_id,weight,datetime
27683288,270232,508,5.0,2006-04-25
19080700,187039,122924,3.0,2021-01-16


**Перекрёстная валидация**

Мы будем использовать последние 5 периодов по восемь дней в каждом. 

In [15]:
n_splits=5
cv = TimeRangeSplitter(test_size='8D', n_splits=n_splits, 
                      filter_cold_users=False,
                      filter_cold_items=True)
cv.get_test_fold_borders(interactions)

[(Timestamp('2023-06-11 00:00:00', freq='8D'),
  Timestamp('2023-06-19 00:00:00', freq='8D')),
 (Timestamp('2023-06-19 00:00:00', freq='8D'),
  Timestamp('2023-06-27 00:00:00', freq='8D')),
 (Timestamp('2023-06-27 00:00:00', freq='8D'),
  Timestamp('2023-07-05 00:00:00', freq='8D')),
 (Timestamp('2023-07-05 00:00:00', freq='8D'),
  Timestamp('2023-07-13 00:00:00', freq='8D')),
 (Timestamp('2023-07-13 00:00:00', freq='8D'),
  Timestamp('2023-07-21 00:00:00', freq='8D'))]

**Метрики**

In [16]:
K_RECOS = 20
metrics = {
    "serendipity": Serendipity(k=K_RECOS),
    "MeanInvUserFreq": MeanInvUserFreq(k=K_RECOS),
    "prec@20": Precision(k=K_RECOS),
    "recall": Recall(k=K_RECOS)
}
results_df = pd.DataFrame({key:[] for key in metrics.keys()})
results_df

Unnamed: 0,serendipity,MeanInvUserFreq,prec@20,recall


**Бейзлайн (рекомендуем популярное)**

`PopularRecommender` будет рекомендовать 20 самых популярных фильмов за последний месяц:

In [17]:
class PopularRecommender():
    def __init__(self, max_K=100, days=30, item_column='item_id', dt_column='date', user_column='user_id', parallel=False):
        self.max_K = max_K
        self.days = days
        self.item_column = item_column
        self.user_column = user_column
        self.dt_column = dt_column
        self.parallel = parallel
        self.recommendations = []
        
    def fit(self, df, ):
        self.df = df
        self.users = df[self.user_column].unique()
        
        min_date = df[self.dt_column].max().normalize() - pd.DateOffset(days=self.days)
        
        self.recommendations = df.loc[df[self.dt_column] > min_date, self.item_column].value_counts().head(self.max_K).index.values 
    
    def recommend(self, users=None, N=10, one_person=False, person_movie_list=None):
        if users is None:
            return self.recommendations[:N]
            
        if isinstance(users, np.ndarray):
            users = pd.Series(users)
            if self.parallel:
                recs = users.parallel_apply(lambda x: np.setdiff1d(self.recommendations,self.df[self.df[self.user_column] == x][self.item_column].values)[:N] if x in self.users else self.recommendations[:N].tolist())
            else:
                recs = users.progress_apply(lambda x: np.setdiff1d(self.recommendations,self.df[self.df[self.user_column] == x][self.item_column].values)[:N] if x in self.users else self.recommendations[:N].tolist())
            
            recs = pd.DataFrame(recs)
            recs[Columns.User] = users
            recs.columns = [Columns.Item, Columns.User]
            recs = recs.explode(column='item_id', ignore_index=True)
            
            ranks = []
            for i in recs['user_id'].unique():
                n = len(recs[recs['user_id']==i]['item_id'])
                ranks += [j for j in range(1, n +1)]
            recs[Columns.Rank] = ranks
            
            return recs

In [48]:
def validation_func(interactions=None,cv=None , model=None, metrics=None, k_recs=None):
    split_results = pd.DataFrame({key:[] for key in metrics.keys()})
    
    for train_ind, test_ind, _ in tqdm(cv.split(interactions)):
        train_df = interactions.df.loc[train_ind, :]
        df_test = interactions.df.loc[test_ind, :]
        users_test = df_test[Columns.User].unique()

        model.fit(train_df)

        recommendations = model.recommend(users=users_test, N=k_recs)

        results = calc_metrics(metrics,
                               reco=recs,
                               interactions=df_test,
                               prev_interactions=train_df,
                               catalog=train_df[Columns.Item].unique()
                               )
        results = pd.DataFrame(results, index=[0])

        split_results = pd.concat([split_results, res], ignore_index=True)

    split_results['model'] ='PopularModel'
    del model 
    
    return split_results.groupby('model').agg([np.mean, np.std])

In [49]:
params = {'days':30,
         'item_column':Columns.Item,
         'dt_column':Columns.Datetime,
         'user_column':Columns.User, 
         'parallel':True}

In [50]:
model = PopularRecommender(**params)

In [51]:
validation_func(interactions=interactions,
                cv=cv,
                model=model,
                k_recs=K_RECOS,
                metrics=metrics)

5it [01:31, 18.21s/it]


Unnamed: 0_level_0,serendipity,serendipity,MeanInvUserFreq,MeanInvUserFreq,prec@20,prec@20,recall,recall
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
PopularModel,0.000333,0.0,3.664814,0.0,0.053558,0.0,0.074606,0.0
