In [17]:
from rectools.dataset import Dataset
from rectools import Columns

In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
tqdm.pandas()

In [3]:
!ls -lah ml-latest

итого 1,5G
drwxrwxr-x. 1 micoff micoff  160 ноя  8 17:02 .
drwxr-xr-x. 1 micoff micoff  156 ноя  8 23:18 ..
-rw-rw-r--. 1 micoff micoff 498M июл 20 22:44 genome-scores.csv
-rw-rw-r--. 1 micoff micoff  18K июл 20 22:44 genome-tags.csv
-rw-rw-r--. 1 micoff micoff 1,9M июл 20 22:59 links.csv
-rw-rw-r--. 1 micoff micoff 4,0M июл 20 22:52 movies.csv
-rw-rw-r--. 1 micoff micoff 891M июл 20 22:14 ratings.csv
-rw-rw-r--. 1 micoff micoff 9,7K июл 20 23:01 README.txt
-rw-rw-r--. 1 micoff micoff  82M июл 20 22:39 tags.csv


### Загрузка, знакомство, подготовка

**Взаимодействия пользователей с фильмами**

In [42]:
data = pd.read_csv('ml-latest/ratings.csv')

In [43]:
data.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
14246715,139919,90522,0.5,1434781713
24931531,243002,6743,4.0,1143406226
4469634,43628,5388,4.0,1598974977
20506809,200130,586,3.0,977931874
30101401,294212,586,3.0,1466825075


In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [7]:
print(f'Испльзовано памяти: {data.memory_usage(deep=True).sum() / 1024 / 1024:.2f}mb')

Испльзовано памяти: 1032.48mb


In [45]:
#оптимизация хранения данных
data['userId'] = data['userId'].astype('int32')
data['movieId'] = data['movieId'].astype('int32')
data['rating'] = data['rating'].astype('float16')
#изменение хранения дат 
data['timestamp'] = pd.to_datetime(data['timestamp'].progress_apply(lambda x: pd.Timestamp(x, unit='s').date()))

100%|████████████████████████████| 33832162/33832162 [00:57<00:00, 584670.54it/s]


In [46]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype         
---  ------     -----         
 0   userId     int32         
 1   movieId    int32         
 2   rating     float16       
 3   timestamp  datetime64[ns]
dtypes: datetime64[ns](1), float16(1), int32(2)
memory usage: 580.8 MB


In [47]:
print(f'Испльзовано памяти: {data.memory_usage(deep=True).sum() / 1024 / 1024:.2f}mb')

Испльзовано памяти: 580.77mb


**Названия фильмов и imbdID**

In [11]:
movies_ml = pd.read_csv('ml-latest/movies.csv')

In [12]:
movies_ml.sample(5)

Unnamed: 0,movieId,title,genres
71030,228383,The Last Exorcist (2020),Thriller
15553,81930,Toi et Moi (2006),Comedy|Drama|Romance
54980,189129,What We Started (2018),Documentary
83699,280198,Shtemp (1991),Action
46409,171229,Re-Animated (2006),Children|Comedy


In [13]:
links_ml = pd.read_csv('ml-latest/links.csv')

In [14]:
links_ml.sample(5)

Unnamed: 0,movieId,imdbId,tmdbId
57860,195327,2974404,266182.0
74455,243406,40751,37504.0
70466,226474,2190316,180779.0
84403,281936,5529576,999850.0
18231,95309,1307068,88005.0


## Постановка задачи 

In [55]:
data.columns = [Columns.User, Columns.Item, Columns.Weight, Columns.Datetime]

In [56]:
data

Unnamed: 0,user_id,item_id,weight,datetime
0,1,1,4.0,2008-11-03
1,1,110,4.0,2008-11-05
2,1,158,4.0,2008-11-03
3,1,260,4.5,2008-11-03
4,1,356,5.0,2008-11-03
...,...,...,...,...
33832157,330975,8340,2.0,2004-08-04
33832158,330975,8493,2.5,2004-08-04
33832159,330975,8622,4.0,2004-08-04
33832160,330975,8665,3.0,2004-08-04


In [57]:
dataset = Dataset.construct(data)

In [58]:
dataset

Dataset(user_id_map=IdMap(external_ids=array([     1,      2,      3, ..., 330973, 330974, 330975], dtype=int32)), item_id_map=IdMap(external_ids=array([     1,    110,    158, ..., 228179, 261553, 269782], dtype=int32)), interactions=Interactions(df=          user_id  item_id  weight   datetime
0               0        0     4.0 2008-11-03
1               0        1     4.0 2008-11-05
2               0        2     4.0 2008-11-03
3               0        3     4.5 2008-11-03
4               0        4     5.0 2008-11-03
...           ...      ...     ...        ...
33832157   330974     1599     2.0 2004-08-04
33832158   330974     5298     2.5 2004-08-04
33832159   330974      854     4.0 2004-08-04
33832160   330974      495     3.0 2004-08-04
33832161   330974     1600     2.5 2004-08-04

[33832162 rows x 4 columns]), user_features=None, item_features=None)

In [50]:
from rectools.metrics import calc_metrics, Accuracy, NDCG


In [53]:
reco = pd.DataFrame(
    {
        Columns.User: [1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4],
        Columns.Item: [7, 8, 1, 2, 1, 2, 3, 4, 1, 2, 3],
        Columns.Rank: [1, 2, 1, 2, 1, 2, 3, 4, 1, 2, 3],
    }
)
interactions = pd.DataFrame(
    {
        Columns.User: [1, 1, 2, 3, 3, 3, 4, 4, 4],
        Columns.Item: [1, 2, 1, 1, 3, 4, 1, 2, 3],
    }
)


metrics = {
    'ndcg@1': NDCG(k=1),
    'accuracy@1': Accuracy(k=1)
}
calc_metrics(
    metrics,
    reco=reco,
    interactions=interactions,
    prev_interactions=interactions,
    catalog=interactions[Columns.Item].unique()
)


{'accuracy@1': 0.5625, 'ndcg@1': 0.75}