In [83]:
import typing as tp
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from rectools import Columns
from rectools.models.base import ModelBase
from scipy import sparse

def leave_one_out_mask_for_users(
    interactions: pd.DataFrame, val_users: tp.Optional[np.ndarray] = None
) -> np.ndarray:
    """Number every user's interactions from newest to oldest.

    The frame is sorted by ``Columns.Datetime`` in descending order with a
    stable sort, then rows are counted within each ``Columns.User`` group, so
    the value 0 marks the first row of each user in that sorted order.

    Notes:
        * Despite the name and the return annotation, the value returned is
          the pandas Series of integer ranks (indexed in the sorted order),
          not a boolean mask; callers compare it with 0 themselves.
        * ``val_users`` is accepted but not used by this implementation.
        * Because the sort is stable, rows of one user that share the same
          timestamp keep their original relative order, so the row that
          appears first in ``interactions`` receives the lower rank.
    """
    newest_first = interactions.sort_values(
        by=Columns.Datetime, ascending=False, kind="stable"
    )
    per_user = newest_first.groupby(Columns.User, sort=False)
    recency_rank = per_user.cumcount()
    return recency_rank

In [85]:
def interactions():
    """Return the small hand-written interactions frame used in this notebook.

    Columns are ``Columns.User``, ``Columns.Item``, ``Columns.Weight`` and
    ``Columns.Datetime`` (converted to ``datetime64[ns]``). The trailing
    comment on each row is its positional index label in the resulting frame.
    """
    column_names = [Columns.User, Columns.Item, Columns.Weight, Columns.Datetime]
    rows = [
        (1, 1, 1, "2021-09-01"),  # 0
        (1, 2, 1, "2021-09-02"),  # 1
        (1, 6, 1, "2021-09-03"),  # 2
        (1, 2, 1, "2021-09-04"),  # 3
        (1, 4, 1, "2021-09-05"),  # 4
        (2, 3, 1, "2021-09-05"),  # 5
        (2, 2, 1, "2021-08-20"),  # 6
        (2, 2, 1, "2021-09-06"),  # 7
        (3, 1, 1, "2021-09-05"),  # 8
        (1, 3, 1, "2021-09-05"),  # 9
    ]
    frame = pd.DataFrame(rows, columns=column_names)
    return frame.astype({Columns.Datetime: "datetime64[ns]"})

In [86]:
# Build the toy interactions frame once and display it for reference.
df = interactions()
df

Unnamed: 0,user_id,item_id,weight,datetime
0,1,1,1,2021-09-01
1,1,2,1,2021-09-02
2,1,6,1,2021-09-03
3,1,2,1,2021-09-04
4,1,4,1,2021-09-05
5,2,3,1,2021-09-05
6,2,2,1,2021-08-20
7,2,2,1,2021-09-06
8,3,1,1,2021-09-05
9,1,3,1,2021-09-05


In [87]:
# Per-user recency rank of every row (0 = first row of the user after the
# newest-first stable sort performed inside the function).
rank = leave_one_out_mask_for_users(df)

In [88]:
# Rows with rank 0 are the ones this approach picks as each user's held-out
# interaction. Note that `mask` keeps the sorted index order of `rank`, not
# the row order of `df`.
mask = rank==0
mask

7     True
4     True
5    False
8     True
9    False
3    False
2    False
1    False
0    False
6    False
dtype: bool

In [89]:
# `mask` is indexed in the sorted order produced by the ranking step, not in
# df's row order. Align it to df's index explicitly so the selection does not
# rely on pandas' implicit reindexing of a misaligned boolean Series key.
df[mask.reindex(df.index)]

  df[mask]


Unnamed: 0,user_id,item_id,weight,datetime
4,1,4,1,2021-09-05
7,2,2,1,2021-09-06
8,3,1,1,2021-09-05


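#### Функция `val_mask` взята из бенчмарков без изменений: для пользователя 1 она возвращает взаимодействие с индексом 4, а не 9 (обе записи датированы 2021-09-05, и при совпадении дат выбирается строка, стоящая раньше).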

In [111]:
# Chronological position of each interaction within its user: 1 is the
# earliest. method="first" breaks timestamp ties by row order, so of two rows
# with the same timestamp the later row gets the higher position.
time_order = (
    df.groupby(Columns.User)[Columns.Datetime]
    .rank(method="first", ascending=True)
    .astype(int)
)
# New frame with the position attached as "order"; `df` itself is untouched.
df_copy = df.assign(order=time_order)
df_copy

Unnamed: 0,user_id,item_id,weight,datetime,order
0,1,1,1,2021-09-01,1
1,1,2,1,2021-09-02,2
2,1,6,1,2021-09-03,3
3,1,2,1,2021-09-04,4
4,1,4,1,2021-09-05,5
5,2,3,1,2021-09-05,2
6,2,2,1,2021-08-20,1
7,2,2,1,2021-09-06,3
8,3,1,1,2021-09-05,1
9,1,3,1,2021-09-05,6


In [97]:
# For each user, the index label of the row with the largest "order" value,
# i.e. that user's chronologically last interaction.
last_interactions_index = df_copy.groupby([Columns.User])["order"].idxmax()
last_interactions_index

user_id
1    9
2    7
3    8
Name: order, dtype: int64

In [99]:
# Boolean array over df_copy's rows: True where the row's index label is one
# of the per-user last-interaction labels found above.
mask = df_copy.index.isin(last_interactions_index)
mask

array([False, False, False, False, False, False, False,  True,  True,
        True])

In [102]:
# Show the rows flagged as each user's last interaction.
df.loc[mask]

Unnamed: 0,user_id,item_id,weight,datetime
7,2,2,1,2021-09-06
8,3,1,1,2021-09-05
9,1,3,1,2021-09-05


In [119]:
def correct_loo_mask(
    interactions: pd.DataFrame, val_users: tp.Optional[np.ndarray] = None
) -> np.ndarray:
    """Build a leave-one-out mask selecting each user's last interaction.

    "Last" means the largest ``Columns.Datetime`` within the user; when several
    rows of a user share that timestamp, the one appearing latest in
    ``interactions`` is chosen.

    Args:
        interactions: Frame with at least ``Columns.User`` and
            ``Columns.Datetime`` columns. It is not modified.
        val_users: Optional collection of user ids. When given, only the last
            interactions of these users are flagged.

    Returns:
        Boolean ``np.ndarray`` of length ``len(interactions)``, positionally
        aligned with the rows of ``interactions``.
    """
    # Work on positional (0..n-1) labels and only on the passed-in frame, so
    # the result depends neither on notebook-global state nor on the caller's
    # index being unique or ordered.
    users = interactions[Columns.User].reset_index(drop=True)
    timestamps = interactions[Columns.Datetime].reset_index(drop=True)

    # method="first" breaks timestamp ties by row order, so among equal
    # timestamps the row that appears last receives the highest rank.
    time_order = timestamps.groupby(users).rank(method="first", ascending=True)
    last_positions = time_order.groupby(users).idxmax()

    last_interact_mask = users.index.isin(last_positions)
    if val_users is not None:
        # Combine as plain arrays so the return type is an ndarray either way.
        last_interact_mask = last_interact_mask & users.isin(val_users).to_numpy()
    return last_interact_mask

In [120]:
# Recompute the leave-one-out mask with the corrected function.
mask = correct_loo_mask(df)

In [121]:
# Rows selected by the corrected mask.
df[mask]

Unnamed: 0,user_id,item_id,weight,datetime
7,2,2,1,2021-09-06
8,3,1,1,2021-09-05
9,1,3,1,2021-09-05
