In [1]:
from datetime import timedelta
import polars as pl
import pandas as pd
import implicit

from tools import load_data_actions, generate_lightfm_recs_mapper
from tqdm import tqdm

from lightfm.data import Dataset
from lightfm import LightFM

from rectools import Columns
from rectools.dataset import Dataset as RTDataset
from rectools.models import (
    ImplicitALSWrapperModel,
    ImplicitBPRWrapperModel,
    LightFMWrapperModel,
    PureSVDModel,
    ImplicitItemKNNWrapperModel,
    EASEModel
)

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle



# First Stage

In [2]:
SEED = 42
top_N = 40
DATA_DIR = 'data/'

df_test_users = pl.read_parquet(f'{DATA_DIR}/test_users.pq')
df_clickstream = pl.read_parquet(f'{DATA_DIR}/clickstream.pq')

df_cat_features = pl.read_parquet(f'{DATA_DIR}/cat_features_preproc_20.pq')
df_text_features = pl.read_parquet(f'{DATA_DIR}/text_features.pq')
df_event = pl.read_parquet(f'{DATA_DIR}/events.pq')

In [3]:
df_train, df_eval = load_data_actions(df_clickstream, df_event)

In [4]:
import numpy as np

mean_by_cat = (
    df_text_features
    .join(df_cat_features["item", "node"], on="item", how="left")
    .to_pandas()
    .groupby("node")["title_projection"]
    .apply(lambda vs: np.mean(np.stack(vs.values), axis=0).tolist())
    .reset_index(name="mean_title_projection")
)

mean_by_cat

Unnamed: 0,node,mean_title_projection
0,1,"[-128.0, 127.0, 127.0, -40.0, -92.5, -109.5, 3..."
1,2,"[-128.0, 91.0, 127.0, -19.0, -72.0, -128.0, 11..."
2,3,"[-128.0, -30.0, 127.0, -119.0, -117.0, -128.0,..."
3,4,"[-128.0, 122.0, 127.0, -128.0, -86.0, -128.0, ..."
4,5,"[-128.0, 42.0, 127.0, -128.0, -80.0, -128.0, -..."
...,...,...
408469,424063,"[-128.0, -128.0, 127.0, -128.0, -111.0, 1.0, -..."
408470,424064,"[-128.0, -128.0, 127.0, -128.0, -126.0, -128.0..."
408471,424065,"[-105.0, -128.0, 15.0, -28.0, 68.0, -128.0, 63..."
408472,424067,"[-128.0, -128.0, 111.0, -35.0, 50.0, -89.0, -1..."


In [25]:
item_features = pl.DataFrame(mean_by_cat)
item_features = item_features.with_columns(pl.col("mean_title_projection").list.to_struct()).unnest("mean_title_projection")
item_features = item_features.rename({"node": "id"})
unmelted_item_features = (
    item_features.unpivot(index="id", on=[x for x in item_features.columns if x != 'id'])
)
unmelted_item_features = unmelted_item_features.rename({"variable": "feature"})
item_features.head()

  item_features = item_features.with_columns(pl.col("mean_title_projection").list.to_struct()).unnest("mean_title_projection")


id,field_0,field_1,field_2,field_3,field_4,field_5,field_6,field_7,field_8,field_9,field_10,field_11,field_12,field_13,field_14,field_15,field_16,field_17,field_18,field_19,field_20,field_21,field_22,field_23,field_24,field_25,field_26,field_27,field_28,field_29,field_30,field_31,field_32,field_33,field_34,field_35,field_36,field_37,field_38,field_39,field_40,field_41,field_42,field_43,field_44,field_45,field_46,field_47,field_48,field_49,field_50,field_51,field_52,field_53,field_54,field_55,field_56,field_57,field_58,field_59,field_60,field_61,field_62,field_63
u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,-128.0,127.0,127.0,-40.0,-92.5,-109.5,3.5,-44.0,106.0,90.0,33.0,7.5,-72.5,-49.5,111.0,27.0,23.5,-98.5,98.5,-26.0,-94.0,-11.5,-57.5,127.0,-34.0,5.5,-103.0,-30.0,75.5,38.0,3.5,17.5,16.5,-19.5,-40.0,9.0,127.0,16.0,-85.5,17.0,97.5,-107.5,45.0,-65.0,3.0,28.5,-122.0,-77.5,-5.0,-89.0,44.0,-60.0,74.0,-63.0,50.0,-125.5,55.0,-23.0,-41.0,5.0,114.5,122.0,66.0,23.5
2,-128.0,91.0,127.0,-19.0,-72.0,-128.0,11.0,-128.0,127.0,127.0,9.0,21.0,-128.0,-123.0,127.0,-3.0,9.0,-86.0,78.0,-79.0,-89.0,-59.0,-8.0,127.0,17.0,-51.0,-128.0,-108.0,127.0,-33.0,-23.0,24.0,-31.0,21.0,-123.0,-97.0,127.0,66.0,-128.0,101.0,127.0,-109.0,43.0,-68.0,30.0,82.0,-128.0,-28.0,-93.0,-53.0,44.0,-20.0,84.0,-45.0,97.0,-128.0,116.0,-4.0,-74.0,60.0,127.0,127.0,18.0,-5.0
3,-128.0,-30.0,127.0,-119.0,-117.0,-128.0,-73.0,-16.0,127.0,100.0,81.0,117.0,-15.0,-128.0,112.0,9.0,36.0,-128.0,127.0,-128.0,-97.0,80.0,-38.0,127.0,120.0,38.0,-128.0,-74.0,124.0,-40.0,59.0,-25.0,-39.0,66.0,-128.0,-114.0,113.0,34.0,-128.0,29.0,127.0,-128.0,-37.0,34.0,76.0,43.0,-106.0,-59.0,71.0,-10.0,48.0,41.0,89.0,-6.0,69.0,-56.0,127.0,-89.0,-97.0,7.0,92.0,25.0,-14.0,36.0
4,-128.0,122.0,127.0,-128.0,-86.0,-128.0,-128.0,-36.0,127.0,-13.0,112.0,-21.0,77.0,-54.0,75.0,115.0,39.0,-61.0,36.0,-98.0,-25.0,87.0,7.0,127.0,-6.0,50.0,-128.0,-128.0,104.0,-128.0,-43.0,-96.0,-43.0,44.0,-128.0,-128.0,13.0,127.0,-128.0,6.0,127.0,-128.0,-69.0,-15.0,48.0,127.0,-128.0,-18.0,115.0,23.0,123.0,-45.0,127.0,-36.0,127.0,15.0,127.0,-34.0,-95.0,36.0,117.0,41.0,49.0,9.0
5,-128.0,42.0,127.0,-128.0,-80.0,-128.0,-128.0,-18.0,112.0,-17.0,127.0,-65.0,120.0,-68.0,52.0,79.0,52.0,-76.0,59.0,-114.0,-64.0,82.0,73.0,127.0,-3.0,85.0,-128.0,-105.0,26.0,-128.0,-35.0,45.0,35.0,31.0,-65.0,-107.0,-13.0,127.0,-128.0,-1.0,127.0,-45.0,-17.0,0.0,24.0,127.0,-76.0,-16.0,127.0,-36.0,66.0,-48.0,127.0,-47.0,106.0,-14.0,127.0,-63.0,48.0,32.0,81.0,-15.0,80.0,-31.0


In [6]:
def dataframe2rectools(df):
    return (
        df[["cookie", "node", "event_date"]]
        .with_columns(pl.lit(1).alias('weight'))
        .rename({
            "cookie": Columns.User,
            "node": Columns.Item,
            "weight": Columns.Weight,
            "event_date": Columns.Datetime,
        })
    ).to_pandas()
df_train = dataframe2rectools(df_train)

In [7]:
dataset = RTDataset.construct(
    interactions_df=df_train,
    user_features_df=None,
    cat_user_features=None,
#     item_features_df=unmelted_item_features.to_pandas(),
#     cat_item_features=["category"],
)

In [8]:
%%time
from implicit.als import AlternatingLeastSquares
model = ImplicitALSWrapperModel(
    AlternatingLeastSquares(
        factors=256,  # latent embeddings size
        regularization=0.1,
        iterations=30,
        alpha=1,
        random_state=SEED,
    ),
    fit_features_together=False,  # way to fit paired features
    verbose=True,
)
model.fit(dataset)
df_pred = model.recommend(
    users=list(df_eval["cookie"].unique()),
    dataset=dataset,
    k=top_N,
    filter_viewed=True,
)
df_pred = pl.DataFrame(
    df_pred[["user_id", "item_id"]],
    schema={"user_id": pl.Int64, "item_id": pl.Int64}
    ).rename({"user_id": "cookie", "item_id": "node"})


  0%|          | 0/30 [00:00<?, ?it/s]

CPU times: user 10min 10s, sys: 4.42 s, total: 10min 15s
Wall time: 10min 17s


In [9]:
from tools import recall_at
recall_at(df_eval, df_pred, k=40)

0.14485603180719295

In [10]:
candidates = model.recommend(
    users=list(df_eval["cookie"].unique()),
    dataset=dataset,
    k=200,
    filter_viewed=True,
)
candidates = pl.DataFrame(
    candidates[["user_id", "item_id"]],
    schema={"user_id": pl.Int64, "item_id": pl.Int64}
    ).rename({"user_id": "cookie", "item_id": "node"})

In [11]:
candidates["cookie","node"].write_csv('data/first_stage_candidates_ALS_200.csv')

In [12]:
%%time
dataset = RTDataset.construct(
    interactions_df=dataframe2rectools(df_clickstream[["cookie", "node", "event_date"]]),
    user_features_df=None,
    cat_user_features=None,
#     item_features_df=unmelted_item_features.to_pandas(),
#     cat_item_features=["category"],
)

from implicit.als import AlternatingLeastSquares
model = ImplicitALSWrapperModel(
    AlternatingLeastSquares(
        factors=256,  # latent embeddings size
        regularization=0.1,
        iterations=30,
        alpha=1,
        random_state=SEED,
    ),
    fit_features_together=False,  # way to fit paired features
    verbose=True,
)
model.fit(dataset)
candidates = model.recommend(
    users=list(df_test_users["cookie"].unique()),
    dataset=dataset,
    k=200,
    filter_viewed=True,  
    on_unsupported_targets="ignore" 
)
candidates = pl.DataFrame(
    candidates[["user_id", "item_id"]],
    schema={"user_id": pl.Int64, "item_id": pl.Int64}
    ).rename({"user_id": "cookie", "item_id": "node"})

  0%|          | 0/30 [00:00<?, ?it/s]

CPU times: user 12min 36s, sys: 8.73 s, total: 12min 45s
Wall time: 12min 48s


In [13]:
candidates["cookie","node"].write_csv('data/test_hybrid_stage_candidates_ALS_200.csv')

In [14]:
candidates["cookie"].unique()

cookie
i64
1
4
7
10
13
…
149995
149996
149997
149998


# Second stage

In [1]:
from datetime import timedelta
import polars as pl
import pandas as pd
import implicit

from tools import load_data_actions, generate_lightfm_recs_mapper
from tqdm import tqdm

from lightfm.data import Dataset
from lightfm import LightFM

from rectools import Columns
from rectools.dataset import Dataset as RTDataset
from rectools.models import (
    ImplicitALSWrapperModel,
    ImplicitBPRWrapperModel,
    LightFMWrapperModel,
    PureSVDModel,
    ImplicitItemKNNWrapperModel,
    EASEModel
)

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle



In [2]:
SEED = 42
top_N = 40
DATA_DIR = 'data/'

df_test_users = pl.read_parquet(f'{DATA_DIR}/test_users.pq')
df_clickstream = pl.read_parquet(f'{DATA_DIR}/clickstream.pq')

df_cat_features = pl.read_parquet(f'{DATA_DIR}/cat_features_preproc_20.pq')
df_text_features = pl.read_parquet(f'{DATA_DIR}/text_features.pq')
df_event = pl.read_parquet(f'{DATA_DIR}/events.pq')

In [3]:
df_train, df_eval = load_data_actions(df_clickstream, df_event)

In [4]:
import numpy as np

mean_by_cat = (
    df_text_features
    .join(df_cat_features["item", "node"], on="item", how="left")
    .to_pandas()
    .groupby("node")["title_projection"]
    .apply(lambda vs: np.mean(np.stack(vs.values), axis=0).tolist())
    .reset_index(name="mean_title_projection")
)

mean_by_cat

Unnamed: 0,node,mean_title_projection
0,1,"[-128.0, 127.0, 127.0, -40.0, -92.5, -109.5, 3..."
1,2,"[-128.0, 91.0, 127.0, -19.0, -72.0, -128.0, 11..."
2,3,"[-128.0, -30.0, 127.0, -119.0, -117.0, -128.0,..."
3,4,"[-128.0, 122.0, 127.0, -128.0, -86.0, -128.0, ..."
4,5,"[-128.0, 42.0, 127.0, -128.0, -80.0, -128.0, -..."
...,...,...
408469,424063,"[-128.0, -128.0, 127.0, -128.0, -111.0, 1.0, -..."
408470,424064,"[-128.0, -128.0, 127.0, -128.0, -126.0, -128.0..."
408471,424065,"[-105.0, -128.0, 15.0, -28.0, 68.0, -128.0, 63..."
408472,424067,"[-128.0, -128.0, 111.0, -35.0, 50.0, -89.0, -1..."


In [5]:
addit_features = pl.read_parquet(DATA_DIR+"cat_features_preproc_20.pq")
atr_cols = [i for i in addit_features.columns if "attr" in i]
addit_features = (
    addit_features.group_by("node")
    .agg([pl.col(atr).mean() for atr in atr_cols])
)

In [6]:
item_features = pl.DataFrame(mean_by_cat)
item_features = item_features.with_columns(pl.col("mean_title_projection").list.to_struct()).unnest("mean_title_projection")

item_features = item_features.join(addit_features, how="left", on="node")

item_features = item_features.rename({"node": "id"})
unmelted_item_features = (
    item_features.unpivot(index="id", on=[x for x in item_features.columns if x != 'id'])
)
unmelted_item_features = unmelted_item_features.rename({"variable": "feature"})
item_features = item_features.rename({"id": "node"})
item_features.head()

  item_features = item_features.with_columns(pl.col("mean_title_projection").list.to_struct()).unnest("mean_title_projection")


node,field_0,field_1,field_2,field_3,field_4,field_5,field_6,field_7,field_8,field_9,field_10,field_11,field_12,field_13,field_14,field_15,field_16,field_17,field_18,field_19,field_20,field_21,field_22,field_23,field_24,field_25,field_26,field_27,field_28,field_29,field_30,field_31,field_32,field_33,field_34,field_35,…,field_47,field_48,field_49,field_50,field_51,field_52,field_53,field_54,field_55,field_56,field_57,field_58,field_59,field_60,field_61,field_62,field_63,attr_704,attr_2278,attr_1433,attr_1897,attr_1550,attr_567,attr_3031,attr_3392,attr_3025,attr_4622,attr_3911,attr_2140,attr_1971,attr_1092,attr_3509,attr_124,attr_3154,attr_1598,attr_3939,attr_3640
u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,-128.0,127.0,127.0,-40.0,-92.5,-109.5,3.5,-44.0,106.0,90.0,33.0,7.5,-72.5,-49.5,111.0,27.0,23.5,-98.5,98.5,-26.0,-94.0,-11.5,-57.5,127.0,-34.0,5.5,-103.0,-30.0,75.5,38.0,3.5,17.5,16.5,-19.5,-40.0,9.0,…,-77.5,-5.0,-89.0,44.0,-60.0,74.0,-63.0,50.0,-125.5,55.0,-23.0,-41.0,5.0,114.5,122.0,66.0,23.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-128.0,91.0,127.0,-19.0,-72.0,-128.0,11.0,-128.0,127.0,127.0,9.0,21.0,-128.0,-123.0,127.0,-3.0,9.0,-86.0,78.0,-79.0,-89.0,-59.0,-8.0,127.0,17.0,-51.0,-128.0,-108.0,127.0,-33.0,-23.0,24.0,-31.0,21.0,-123.0,-97.0,…,-28.0,-93.0,-53.0,44.0,-20.0,84.0,-45.0,97.0,-128.0,116.0,-4.0,-74.0,60.0,127.0,127.0,18.0,-5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-128.0,-30.0,127.0,-119.0,-117.0,-128.0,-73.0,-16.0,127.0,100.0,81.0,117.0,-15.0,-128.0,112.0,9.0,36.0,-128.0,127.0,-128.0,-97.0,80.0,-38.0,127.0,120.0,38.0,-128.0,-74.0,124.0,-40.0,59.0,-25.0,-39.0,66.0,-128.0,-114.0,…,-59.0,71.0,-10.0,48.0,41.0,89.0,-6.0,69.0,-56.0,127.0,-89.0,-97.0,7.0,92.0,25.0,-14.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-128.0,122.0,127.0,-128.0,-86.0,-128.0,-128.0,-36.0,127.0,-13.0,112.0,-21.0,77.0,-54.0,75.0,115.0,39.0,-61.0,36.0,-98.0,-25.0,87.0,7.0,127.0,-6.0,50.0,-128.0,-128.0,104.0,-128.0,-43.0,-96.0,-43.0,44.0,-128.0,-128.0,…,-18.0,115.0,23.0,123.0,-45.0,127.0,-36.0,127.0,15.0,127.0,-34.0,-95.0,36.0,117.0,41.0,49.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,-128.0,42.0,127.0,-128.0,-80.0,-128.0,-128.0,-18.0,112.0,-17.0,127.0,-65.0,120.0,-68.0,52.0,79.0,52.0,-76.0,59.0,-114.0,-64.0,82.0,73.0,127.0,-3.0,85.0,-128.0,-105.0,26.0,-128.0,-35.0,45.0,35.0,31.0,-65.0,-107.0,…,-16.0,127.0,-36.0,66.0,-48.0,127.0,-47.0,106.0,-14.0,127.0,-63.0,48.0,32.0,81.0,-15.0,80.0,-31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
df_cat_features

item,location,category,node,attr_704,attr_2278,attr_1433,attr_1897,attr_1550,attr_567,attr_3031,attr_3392,attr_3025,attr_4622,attr_3911,attr_2140,attr_1971,attr_1092,attr_3509,attr_124,attr_3154,attr_1598,attr_3939,attr_3640
i64,i64,i64,u32,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
9,8385,57,194747,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
17,2707,35,352905,270613,533341,0,582135,498267,173798,0,0,0,0,725581,501466,0,0,0,0,0,0,0,0
144,8383,8,17188,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
202,5397,57,194766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
236,2105,64,153951,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
28804461,24,35,326792,249640,737256,498743,582135,498267,318315,0,0,0,0,725581,364348,0,0,0,0,0,0,0,0
28804502,2305,51,401208,0,0,0,0,0,0,0,488527,770753,171723,0,0,0,0,0,0,0,247180,0,0
28804563,2348,0,13974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
28804609,2348,51,258971,0,0,0,0,0,0,0,32665,770753,618809,0,0,457655,0,0,0,0,0,486468,0


In [41]:
df_clickstream

cookie,item,event,event_date,platform,surface,node
i64,i64,i64,datetime[ns],i64,i64,u32
0,19915558,17,2025-02-05 02:30:59,3,2,115659
0,2680232,17,2025-01-24 21:16:57,3,2,115829
1,4247649,17,2025-01-29 23:00:58,2,2,7
1,4247649,17,2025-02-17 14:55:17,2,2,7
1,2171135,17,2025-01-17 19:23:29,2,2,214458
…,…,…,…,…,…,…
149999,4999183,17,2025-01-20 12:23:47,2,2,71511
149999,25999164,17,2025-01-24 14:26:57,2,2,71514
149999,12138732,17,2025-02-12 13:11:42,2,2,51162
149999,28207042,17,2025-02-16 12:35:35,2,2,71511


In [7]:
def dataframe2rectools(df):
    return (
        df[["cookie", "node", "event_date"]]
        .with_columns(pl.lit(1).alias('weight'))
        .rename({
            "cookie": Columns.User,
            "node": Columns.Item,
            "weight": Columns.Weight,
            "event_date": Columns.Datetime,
        })
    ).to_pandas()
df_train = dataframe2rectools(df_train)

In [8]:
candidates = pl.read_csv('data/first_stage_candidates_ALS_200.csv')

In [9]:
candidates = candidates.to_pandas()
candidates['rank'] = candidates.groupby('cookie').cumcount() + 1 
df_eval = df_eval.to_pandas()
df_eval = df_eval.astype({"node": int})

In [10]:
candidates[candidates.cookie == 1]

Unnamed: 0,cookie,node,rank
200,1,151478,1
201,1,230746,2
202,1,239954,3
203,1,153004,4
204,1,229316,5
...,...,...,...
395,1,236421,196
396,1,214312,197
397,1,233339,198
398,1,255629,199


In [11]:
df_eval[df_eval.cookie == 1].node.isin(candidates[candidates.cookie == 1])

89440     False
111137    False
140543    False
Name: node, dtype: bool

In [12]:
# positive interactions
pos = candidates.merge(
    df_eval,
    on=['cookie', 'node'],
    how='inner'
    )

pos.loc[:, 'target'] = 1
print(pos.shape)
pos.head()

(44339, 5)


Unnamed: 0,cookie,node,rank,event,target
0,0,130589,103,19,1
1,2,151577,1,19,1
2,3,214377,2,5,1
3,19,196680,20,4,1
4,22,48366,177,5,1


In [13]:
# negative interactions
neg = candidates.set_index(['cookie', 'node'])\
        .join(df_eval.set_index(['cookie', 'node']))

neg = neg[neg['event'].isnull()].reset_index()     
neg = neg.sample(frac=0.03)
neg.loc[:, 'target'] = 0

neg.shape
neg.head()

Unnamed: 0,cookie,node,rank,event,target
424900,5900,188666,8,,0
1831246,25229,10893,68,,0
9692741,132541,152022,147,,0
2007325,27602,159207,64,,0
7526593,102933,151455,11,,0


In [14]:
# train test
ctb_train_users, ctb_test_users = train_test_split(
    df_eval['cookie'].unique(),
    random_state=SEED,
    test_size=0.2
    )
# train eval
ctb_train_users, ctb_eval_users = train_test_split(
    ctb_train_users,
    random_state=SEED,
    test_size=0.1
    )

In [15]:
select_col = ["cookie", "node", "rank", 'target']

# Catboost train
ctb_train = shuffle(
    pd.concat([
        pos[pos['cookie'].isin(ctb_train_users)],
        neg[neg['cookie'].isin(ctb_train_users)]
])[select_col]
)

# Catboost test
ctb_test = shuffle(
    pd.concat([
        pos[pos['cookie'].isin(ctb_test_users)],
        neg[neg['cookie'].isin(ctb_test_users)]
])[select_col]
)

# for early stopping
ctb_eval = shuffle(
    pd.concat([
        pos[pos['cookie'].isin(ctb_eval_users)],
        neg[neg['cookie'].isin(ctb_eval_users)]
])[select_col]
)

In [16]:
ctb_train['target'].value_counts(normalize=True)

target
0    0.881192
1    0.118808
Name: proportion, dtype: float64

In [17]:
ctb_test['target'].value_counts(normalize=True)

target
0    0.881045
1    0.118955
Name: proportion, dtype: float64

In [18]:
item_features = item_features.to_pandas()

In [19]:
user_col = ['cookie']
item_col = item_features.columns

In [20]:
item_features

Unnamed: 0,node,field_0,field_1,field_2,field_3,field_4,field_5,field_6,field_7,field_8,...,attr_3911,attr_2140,attr_1971,attr_1092,attr_3509,attr_124,attr_3154,attr_1598,attr_3939,attr_3640
0,1,-128.0,127.0,127.0,-40.0,-92.5,-109.5,3.5,-44.0,106.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,-128.0,91.0,127.0,-19.0,-72.0,-128.0,11.0,-128.0,127.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,-128.0,-30.0,127.0,-119.0,-117.0,-128.0,-73.0,-16.0,127.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,-128.0,122.0,127.0,-128.0,-86.0,-128.0,-128.0,-36.0,127.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,-128.0,42.0,127.0,-128.0,-80.0,-128.0,-128.0,-18.0,112.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408469,424063,-128.0,-128.0,127.0,-128.0,-111.0,1.0,-36.0,98.0,103.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,247180.0,0.0,0.0
408470,424064,-128.0,-128.0,127.0,-128.0,-126.0,-128.0,47.0,127.0,84.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,247180.0,0.0,0.0
408471,424065,-105.0,-128.0,15.0,-28.0,68.0,-128.0,63.0,123.0,-3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,247180.0,0.0,0.0
408472,424067,-128.0,-128.0,111.0,-35.0,50.0,-89.0,-11.0,127.0,123.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,247180.0,0.0,0.0


In [21]:
train_feat = (
    ctb_train
    # .merge(
    #     users[user_col],
    #     on=['user_id'],
    #     how='left')
    .merge(
        item_features[item_col],
        on=['node'],
        how='left')
)
eval_feat = (
    ctb_eval
    # .merge(
    #     users[user_col],
    #     on=['user_id'],
    #     how='left')
    .merge(
        item_features[item_col],
        on=['node'],
        how='left')
)
test_feat = (
    ctb_test
    # .merge(
    #     users[user_col],
    #     on=['user_id'],
    #     how='left')
    .merge(
        item_features[item_col],
        on=['node'],
        how='left'
        )
)

In [22]:
train_feat.isna().sum().sum(), eval_feat.isna().sum().sum()

(np.int64(0), np.int64(0))

In [23]:
drop_col = ['cookie', 'node']
target_col = ['target']
cat_col = [f"field_{i}" for i in range(64)]

In [24]:
X_train, y_train = train_feat.drop(drop_col + target_col, axis=1), train_feat[target_col]
X_val, y_val = eval_feat.drop(drop_col + target_col, axis=1), eval_feat[target_col]
X_test, y_test = test_feat.drop(drop_col + target_col, axis=1), test_feat['target']
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((268450, 85), (268450, 1), (29941, 85), (29941, 1), (75104, 85), (75104,))

In [25]:
from catboost import CatBoostClassifier, CatBoostRanker

# параметры для обучения
est_params = {
  'subsample': 0.9,
  'max_depth': 4,
  'n_estimators': 5000,
  'learning_rate': 0.03,
  'thread_count': 20,
  'random_state': SEED,
  'verbose': 200,
#   "loss_function":'YetiRankPairwise',
#  "eval_metric":'NDCG'
}

ctb_model = CatBoostClassifier(**est_params)

In [26]:
ctb_model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=100,
    plot=True
    )

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6684745	test: 0.6684920	best: 0.6684920 (0)	total: 73.4ms	remaining: 6m 7s
200:	learn: 0.3294011	test: 0.3290590	best: 0.3290590 (200)	total: 2.38s	remaining: 56.9s
400:	learn: 0.3264658	test: 0.3267844	best: 0.3267844 (400)	total: 4.72s	remaining: 54.1s
600:	learn: 0.3243602	test: 0.3254592	best: 0.3254592 (600)	total: 7.23s	remaining: 52.9s
800:	learn: 0.3228270	test: 0.3246175	best: 0.3246169 (799)	total: 9.74s	remaining: 51s
1000:	learn: 0.3216285	test: 0.3241129	best: 0.3241129 (1000)	total: 12.5s	remaining: 49.9s
1200:	learn: 0.3206042	test: 0.3238102	best: 0.3238102 (1200)	total: 14.6s	remaining: 46.3s
1400:	learn: 0.3196920	test: 0.3236313	best: 0.3236313 (1400)	total: 16.9s	remaining: 43.3s
1600:	learn: 0.3188973	test: 0.3235026	best: 0.3234992 (1577)	total: 19s	remaining: 40.4s
1800:	learn: 0.3181737	test: 0.3234599	best: 0.3234519 (1790)	total: 21.4s	remaining: 38.1s
2000:	learn: 0.3174990	test: 0.3233994	best: 0.3233930 (1991)	total: 23.6s	remaining: 35.4s
2200:

<catboost.core.CatBoostClassifier at 0x792935910>

In [None]:
# 0.3257109057

In [27]:
# # save model 
import dill 
with open(f"ctb_model_baseline.dill", 'wb') as f:
    dill.dump(ctb_model, f)

In [28]:
y_pred = ctb_model.predict_proba(X_test)

In [29]:
from sklearn.metrics import roc_auc_score

f"ROC AUC score = {roc_auc_score(y_test, y_pred[:, 1]):.2f}"

'ROC AUC score = 0.73'

In [31]:
ctb_test

Unnamed: 0,cookie,node,rank,target
3252067,44547,214298,167,0
135566,1969,264372,93,0
4543376,61974,161681,84,0
1011340,13851,153038,167,0
7209435,98605,163834,39,0
...,...,...,...,...
3239630,44389,230739,85,0
131729,1902,5746,41,0
6008601,82211,152472,141,0
6214593,84955,188737,161,0


In [13]:
item_features = item_features.to_pandas()

In [19]:
np.arange(0.001, 0.2, 0.02)

array([0.001, 0.021, 0.041, 0.061, 0.081, 0.101, 0.121, 0.141, 0.161,
       0.181])

In [21]:
# for frac in np.arange(0.001, 0.2, 0.02):
#     # negative interactions
#     neg = candidates.set_index(['cookie', 'node'])\
#             .join(df_eval.set_index(['cookie', 'node']))

#     neg = neg[neg['event'].isnull()].reset_index()     
#     neg = neg.sample(frac=frac)
#     neg.loc[:, 'target'] = 0

#     print("neg.shape", neg.shape)
#     print("neg.head()", neg.head())
#     # train test
#     ctb_train_users, ctb_test_users = train_test_split(
#         df_eval['cookie'].unique(),
#         random_state=SEED,
#         test_size=0.2
#         )
#     # train eval
#     ctb_train_users, ctb_eval_users = train_test_split(
#         ctb_train_users,
#         random_state=SEED,
#         test_size=0.1
#         )
#     select_col = ["cookie", "node", "rank", 'target']

#     # Catboost train
#     ctb_train = shuffle(
#         pd.concat([
#             pos[pos['cookie'].isin(ctb_train_users)],
#             neg[neg['cookie'].isin(ctb_train_users)]
#     ])[select_col]
#     )

#     # Catboost test
#     ctb_test = shuffle(
#         pd.concat([
#             pos[pos['cookie'].isin(ctb_test_users)],
#             neg[neg['cookie'].isin(ctb_test_users)]
#     ])[select_col]
#     )

#     # for early stopping
#     ctb_eval = shuffle(
#         pd.concat([
#             pos[pos['cookie'].isin(ctb_eval_users)],
#             neg[neg['cookie'].isin(ctb_eval_users)]
#     ])[select_col]
#     )
#     print(ctb_train['target'].value_counts(normalize=True))
#     print(ctb_test['target'].value_counts(normalize=True))
    
#     user_col = ['cookie']
#     item_col = item_features.columns
#     train_feat = (
#         ctb_train
#         # .merge(
#         #     users[user_col],
#         #     on=['user_id'],
#         #     how='left')
#         .merge(
#             item_features[item_col],
#             on=['node'],
#             how='left')
#     )
#     eval_feat = (
#         ctb_eval
#         # .merge(
#         #     users[user_col],
#         #     on=['user_id'],
#         #     how='left')
#         .merge(
#             item_features[item_col],
#             on=['node'],
#             how='left')
#     )
#     test_feat = (
#         ctb_test
#         # .merge(
#         #     users[user_col],
#         #     on=['user_id'],
#         #     how='left')
#         .merge(
#             item_features[item_col],
#             on=['node'],
#             how='left'
#             )
#     )
#     print(train_feat.isna().sum().sum(), eval_feat.isna().sum().sum())
#     drop_col = ['cookie', 'node']
#     target_col = ['target']
#     cat_col = [f"field_{i}" for i in range(64)]
#     X_train, y_train = train_feat.drop(drop_col + target_col, axis=1), train_feat[target_col]
#     X_val, y_val = eval_feat.drop(drop_col + target_col, axis=1), eval_feat[target_col]
#     X_test, y_test = test_feat.drop(drop_col + target_col, axis=1), test_feat['target']
#     X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape
#     from catboost import CatBoostClassifier, CatBoostRanker

#     # параметры для обучения
#     est_params = {
#         'subsample': 0.9,
#         'max_depth': 6,
#         'n_estimators': 5000,
#         'learning_rate': 0.01,
#         'thread_count': 20,
#         'random_state': SEED,
#         'verbose': 200,
#         #   "loss_function":'YetiRankPairwise',
#         #  "eval_metric":'NDCG'
#     }

#     ctb_model = CatBoostClassifier(**est_params)
#     ctb_model.fit(
#         X_train,
#         y_train,
#         eval_set=(X_val, y_val),
#         early_stopping_rounds=100,
#         plot=True
#         )
#     # save model 
#     # import dill 
#     # with open(f"ctb_model_baseline.dill", 'wb') as f:
#     #     dill.dump(ctb_model, f)
#     y_pred = ctb_model.predict_proba(X_test)
#     from sklearn.metrics import roc_auc_score
    
#     print(f"{frac}")
#     print(f"ROC AUC score = {roc_auc_score(y_test, y_pred[:, 1]):.4f}\n")

neg.shape (10972, 5)
neg.head()          cookie    node  rank  event  target
9424240  128905  272423   127    NaN       0
9148630  125141  152024   141    NaN       0
3237112   44342  214244   155    NaN       0
6851230   93745  153938   172    NaN       0
2628452   36097  214395   101    NaN       0
target
1    0.800823
0    0.199177
Name: proportion, dtype: float64
target
1    0.805751
0    0.194249
Name: proportion, dtype: float64
0 0


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6882433	test: 0.6882561	best: 0.6882561 (0)	total: 5.81ms	remaining: 29s
200:	learn: 0.4596158	test: 0.4620266	best: 0.4620266 (200)	total: 1.12s	remaining: 26.7s
400:	learn: 0.4468319	test: 0.4528083	best: 0.4528083 (400)	total: 2.2s	remaining: 25.2s
600:	learn: 0.4404897	test: 0.4501887	best: 0.4501887 (600)	total: 3.13s	remaining: 22.9s
800:	learn: 0.4355458	test: 0.4487529	best: 0.4487460 (798)	total: 4.03s	remaining: 21.1s
1000:	learn: 0.4312209	test: 0.4479636	best: 0.4479636 (1000)	total: 4.92s	remaining: 19.6s
1200:	learn: 0.4268106	test: 0.4477448	best: 0.4477213 (1195)	total: 5.86s	remaining: 18.5s
1400:	learn: 0.4224046	test: 0.4476950	best: 0.4476009 (1324)	total: 6.83s	remaining: 17.6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.447600891
bestIteration = 1324

Shrink model to first 1325 iterations.
0.001
ROC AUC score = 0.7237

neg.shape (230409, 5)
neg.head()          cookie    node  rank  event  target
4632079   63129  120532   119   

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6866235	test: 0.6866048	best: 0.6866048 (0)	total: 10.5ms	remaining: 52.5s
200:	learn: 0.4053794	test: 0.4004656	best: 0.4004656 (200)	total: 2.32s	remaining: 55.4s
400:	learn: 0.3967021	test: 0.3915913	best: 0.3915913 (400)	total: 4.52s	remaining: 51.8s
600:	learn: 0.3936695	test: 0.3893444	best: 0.3893444 (600)	total: 6.71s	remaining: 49.1s
800:	learn: 0.3917261	test: 0.3881902	best: 0.3881902 (800)	total: 8.97s	remaining: 47.1s
1000:	learn: 0.3901760	test: 0.3873902	best: 0.3873902 (1000)	total: 11.1s	remaining: 44.3s
1200:	learn: 0.3888377	test: 0.3868131	best: 0.3868131 (1200)	total: 13.2s	remaining: 41.7s
1400:	learn: 0.3875372	test: 0.3863953	best: 0.3863953 (1400)	total: 15.4s	remaining: 39.5s
1600:	learn: 0.3862840	test: 0.3859989	best: 0.3859984 (1599)	total: 17.7s	remaining: 37.5s
1800:	learn: 0.3851364	test: 0.3857451	best: 0.3857449 (1799)	total: 20.1s	remaining: 35.7s
2000:	learn: 0.3841086	test: 0.3855279	best: 0.3855274 (1998)	total: 22.3s	remaining: 33.4s
2

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6829268	test: 0.6829219	best: 0.6829219 (0)	total: 16.9ms	remaining: 1m 24s
200:	learn: 0.2820836	test: 0.2781950	best: 0.2781950 (200)	total: 3.41s	remaining: 1m 21s
400:	learn: 0.2743273	test: 0.2698920	best: 0.2698920 (400)	total: 6.88s	remaining: 1m 18s
600:	learn: 0.2723786	test: 0.2681731	best: 0.2681731 (600)	total: 10.4s	remaining: 1m 15s
800:	learn: 0.2711432	test: 0.2673116	best: 0.2673116 (800)	total: 14s	remaining: 1m 13s
1000:	learn: 0.2701531	test: 0.2667118	best: 0.2667118 (1000)	total: 17.6s	remaining: 1m 10s
1200:	learn: 0.2693799	test: 0.2662784	best: 0.2662784 (1200)	total: 20.9s	remaining: 1m 6s
1400:	learn: 0.2686809	test: 0.2659753	best: 0.2659753 (1400)	total: 24.2s	remaining: 1m 2s
1600:	learn: 0.2680072	test: 0.2657614	best: 0.2657614 (1600)	total: 28s	remaining: 59.4s
1800:	learn: 0.2673775	test: 0.2655839	best: 0.2655839 (1800)	total: 32s	remaining: 56.9s
2000:	learn: 0.2667848	test: 0.2654497	best: 0.2654497 (2000)	total: 36.4s	remaining: 54.5s
2

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6805787	test: 0.6805869	best: 0.6805869 (0)	total: 24.4ms	remaining: 2m 1s
200:	learn: 0.2198654	test: 0.2167598	best: 0.2167598 (200)	total: 5.03s	remaining: 2m
400:	learn: 0.2129667	test: 0.2094122	best: 0.2094122 (400)	total: 9.87s	remaining: 1m 53s
600:	learn: 0.2115405	test: 0.2082083	best: 0.2082083 (600)	total: 14.6s	remaining: 1m 47s
800:	learn: 0.2106280	test: 0.2075942	best: 0.2075942 (800)	total: 19.4s	remaining: 1m 41s
1000:	learn: 0.2099399	test: 0.2072057	best: 0.2072057 (1000)	total: 24.5s	remaining: 1m 38s
1200:	learn: 0.2093522	test: 0.2069201	best: 0.2069201 (1200)	total: 29.3s	remaining: 1m 32s
1400:	learn: 0.2088589	test: 0.2067383	best: 0.2067383 (1400)	total: 34s	remaining: 1m 27s
1600:	learn: 0.2083926	test: 0.2065947	best: 0.2065947 (1600)	total: 38.7s	remaining: 1m 22s
1800:	learn: 0.2079410	test: 0.2065071	best: 0.2065055 (1796)	total: 43.4s	remaining: 1m 17s
2000:	learn: 0.2075419	test: 0.2064322	best: 0.2064322 (2000)	total: 48s	remaining: 1m 11s

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6789158	test: 0.6789281	best: 0.6789281 (0)	total: 32ms	remaining: 2m 39s
200:	learn: 0.1817272	test: 0.1798448	best: 0.1798448 (200)	total: 6.33s	remaining: 2m 31s
400:	learn: 0.1755409	test: 0.1733324	best: 0.1733324 (400)	total: 12.4s	remaining: 2m 21s
600:	learn: 0.1744049	test: 0.1723538	best: 0.1723538 (600)	total: 18.4s	remaining: 2m 14s
800:	learn: 0.1736864	test: 0.1718793	best: 0.1718793 (800)	total: 24.4s	remaining: 2m 7s
1000:	learn: 0.1731659	test: 0.1715919	best: 0.1715919 (1000)	total: 30.3s	remaining: 2m
1200:	learn: 0.1727332	test: 0.1714005	best: 0.1714005 (1200)	total: 36.2s	remaining: 1m 54s
1400:	learn: 0.1723541	test: 0.1712548	best: 0.1712548 (1400)	total: 42.2s	remaining: 1m 48s
1600:	learn: 0.1720309	test: 0.1711699	best: 0.1711699 (1600)	total: 48.1s	remaining: 1m 42s
1800:	learn: 0.1717147	test: 0.1710881	best: 0.1710879 (1799)	total: 53.9s	remaining: 1m 35s
2000:	learn: 0.1714224	test: 0.1710261	best: 0.1710261 (2000)	total: 59.8s	remaining: 1m 2

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6776520	test: 0.6776685	best: 0.6776685 (0)	total: 34.8ms	remaining: 2m 54s
200:	learn: 0.1560901	test: 0.1538480	best: 0.1538480 (200)	total: 7.3s	remaining: 2m 54s
400:	learn: 0.1504340	test: 0.1478112	best: 0.1478112 (400)	total: 14.4s	remaining: 2m 45s
600:	learn: 0.1494940	test: 0.1470003	best: 0.1470003 (600)	total: 21.9s	remaining: 2m 40s
800:	learn: 0.1488911	test: 0.1466212	best: 0.1466212 (800)	total: 29.1s	remaining: 2m 32s
1000:	learn: 0.1484622	test: 0.1463936	best: 0.1463936 (1000)	total: 36.3s	remaining: 2m 25s
1200:	learn: 0.1481091	test: 0.1462291	best: 0.1462285 (1199)	total: 43.6s	remaining: 2m 17s
1400:	learn: 0.1478135	test: 0.1461191	best: 0.1461191 (1400)	total: 50.6s	remaining: 2m 10s
1600:	learn: 0.1475463	test: 0.1460428	best: 0.1460428 (1600)	total: 57.6s	remaining: 2m 2s
1800:	learn: 0.1472875	test: 0.1459822	best: 0.1459822 (1800)	total: 1m 4s	remaining: 1m 55s
2000:	learn: 0.1470502	test: 0.1459499	best: 0.1459499 (2000)	total: 1m 11s	remaining

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6766227	test: 0.6766400	best: 0.6766400 (0)	total: 45.2ms	remaining: 3m 46s
200:	learn: 0.1373498	test: 0.1353791	best: 0.1353791 (200)	total: 8.64s	remaining: 3m 26s
400:	learn: 0.1320431	test: 0.1296735	best: 0.1296735 (400)	total: 17.1s	remaining: 3m 16s
600:	learn: 0.1312179	test: 0.1289287	best: 0.1289287 (600)	total: 25.8s	remaining: 3m 8s
800:	learn: 0.1307140	test: 0.1285931	best: 0.1285931 (800)	total: 34.2s	remaining: 2m 59s
1000:	learn: 0.1303507	test: 0.1283849	best: 0.1283849 (1000)	total: 42.6s	remaining: 2m 50s
1200:	learn: 0.1300487	test: 0.1282304	best: 0.1282304 (1200)	total: 51s	remaining: 2m 41s
1400:	learn: 0.1298051	test: 0.1281340	best: 0.1281339 (1399)	total: 59.2s	remaining: 2m 32s
1600:	learn: 0.1295840	test: 0.1280657	best: 0.1280657 (1600)	total: 1m 7s	remaining: 2m 23s
1800:	learn: 0.1293842	test: 0.1280017	best: 0.1280017 (1800)	total: 1m 15s	remaining: 2m 14s
2000:	learn: 0.1291959	test: 0.1279554	best: 0.1279553 (1999)	total: 1m 23s	remaining

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6763514	test: 0.6763693	best: 0.6763693 (0)	total: 52ms	remaining: 4m 20s
200:	learn: 0.1228768	test: 0.1211356	best: 0.1211356 (200)	total: 9.66s	remaining: 3m 50s
400:	learn: 0.1179750	test: 0.1159066	best: 0.1159066 (400)	total: 19.2s	remaining: 3m 39s
600:	learn: 0.1172642	test: 0.1152530	best: 0.1152530 (600)	total: 28.9s	remaining: 3m 31s
800:	learn: 0.1168250	test: 0.1149576	best: 0.1149576 (800)	total: 38.5s	remaining: 3m 21s
1000:	learn: 0.1165164	test: 0.1147698	best: 0.1147698 (1000)	total: 48s	remaining: 3m 11s
1200:	learn: 0.1162676	test: 0.1146464	best: 0.1146464 (1200)	total: 57.7s	remaining: 3m 2s
1400:	learn: 0.1160417	test: 0.1145495	best: 0.1145495 (1400)	total: 1m 7s	remaining: 2m 52s
1600:	learn: 0.1158545	test: 0.1144903	best: 0.1144903 (1600)	total: 1m 16s	remaining: 2m 42s
1800:	learn: 0.1156774	test: 0.1144433	best: 0.1144430 (1797)	total: 1m 26s	remaining: 2m 33s
2000:	learn: 0.1155134	test: 0.1144091	best: 0.1144091 (2000)	total: 1m 36s	remaining:

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6756265	test: 0.6756562	best: 0.6756562 (0)	total: 61.9ms	remaining: 5m 9s
200:	learn: 0.1114254	test: 0.1102026	best: 0.1102026 (200)	total: 11.5s	remaining: 4m 33s
400:	learn: 0.1068392	test: 0.1053043	best: 0.1053043 (400)	total: 22.3s	remaining: 4m 15s
600:	learn: 0.1061873	test: 0.1047007	best: 0.1047007 (600)	total: 33.3s	remaining: 4m 3s
800:	learn: 0.1058145	test: 0.1044339	best: 0.1044339 (800)	total: 44.3s	remaining: 3m 52s
1000:	learn: 0.1055348	test: 0.1042561	best: 0.1042561 (1000)	total: 55.2s	remaining: 3m 40s
1200:	learn: 0.1053129	test: 0.1041386	best: 0.1041386 (1200)	total: 1m 6s	remaining: 3m 29s
1400:	learn: 0.1051229	test: 0.1040549	best: 0.1040549 (1400)	total: 1m 16s	remaining: 3m 17s
1600:	learn: 0.1049500	test: 0.1039863	best: 0.1039863 (1600)	total: 1m 27s	remaining: 3m 6s
1800:	learn: 0.1047925	test: 0.1039405	best: 0.1039405 (1800)	total: 1m 38s	remaining: 2m 54s
2000:	learn: 0.1046489	test: 0.1039023	best: 0.1039023 (2000)	total: 1m 48s	remaini

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6749820	test: 0.6750004	best: 0.6750004 (0)	total: 64.8ms	remaining: 5m 23s
200:	learn: 0.1021167	test: 0.1004930	best: 0.1004930 (200)	total: 12.5s	remaining: 4m 58s
400:	learn: 0.0977630	test: 0.0958353	best: 0.0958353 (400)	total: 24.8s	remaining: 4m 44s
600:	learn: 0.0971916	test: 0.0952900	best: 0.0952900 (600)	total: 37.1s	remaining: 4m 31s
800:	learn: 0.0968525	test: 0.0950509	best: 0.0950509 (800)	total: 49.3s	remaining: 4m 18s
1000:	learn: 0.0965960	test: 0.0948989	best: 0.0948989 (1000)	total: 1m 1s	remaining: 4m 6s
1200:	learn: 0.0963903	test: 0.0947855	best: 0.0947855 (1200)	total: 1m 14s	remaining: 3m 54s
1400:	learn: 0.0962124	test: 0.0947028	best: 0.0947028 (1400)	total: 1m 26s	remaining: 3m 41s
1600:	learn: 0.0960661	test: 0.0946483	best: 0.0946483 (1600)	total: 1m 37s	remaining: 3m 28s
1800:	learn: 0.0959339	test: 0.0946000	best: 0.0946000 (1800)	total: 1m 50s	remaining: 3m 15s
2000:	learn: 0.0958095	test: 0.0945734	best: 0.0945734 (2000)	total: 2m 1s	remai

# Submission

In [32]:
from datetime import timedelta
import polars as pl
import pandas as pd
import implicit

from tools import load_data_actions, generate_lightfm_recs_mapper
from tqdm import tqdm

from lightfm.data import Dataset
from lightfm import LightFM

from rectools import Columns
from rectools.dataset import Dataset as RTDataset
from rectools.models import (
    ImplicitALSWrapperModel,
    ImplicitBPRWrapperModel,
    LightFMWrapperModel,
    PureSVDModel,
    ImplicitItemKNNWrapperModel,
    EASEModel
)

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [33]:
SEED = 42
top_N = 40
DATA_DIR = 'data/'

df_test_users = pl.read_parquet(f'{DATA_DIR}/test_users.pq')
df_clickstream = pl.read_parquet(f'{DATA_DIR}/clickstream.pq')

df_cat_features = pl.read_parquet(f'{DATA_DIR}/cat_features_preproc_20.pq')
df_text_features = pl.read_parquet(f'{DATA_DIR}/text_features.pq')
df_event = pl.read_parquet(f'{DATA_DIR}/events.pq')

In [34]:
df_train, df_eval = load_data_actions(df_clickstream, df_event)

In [35]:
import numpy as np

mean_by_cat = (
    df_text_features
    .join(df_cat_features["item", "node"], on="item", how="left")
    .to_pandas()
    .groupby("node")["title_projection"]
    .apply(lambda vs: np.mean(np.stack(vs.values), axis=0).tolist())
    .reset_index(name="mean_title_projection")
)

mean_by_cat

Unnamed: 0,node,mean_title_projection
0,1,"[-128.0, 127.0, 127.0, -40.0, -92.5, -109.5, 3..."
1,2,"[-128.0, 91.0, 127.0, -19.0, -72.0, -128.0, 11..."
2,3,"[-128.0, -30.0, 127.0, -119.0, -117.0, -128.0,..."
3,4,"[-128.0, 122.0, 127.0, -128.0, -86.0, -128.0, ..."
4,5,"[-128.0, 42.0, 127.0, -128.0, -80.0, -128.0, -..."
...,...,...
408469,424063,"[-128.0, -128.0, 127.0, -128.0, -111.0, 1.0, -..."
408470,424064,"[-128.0, -128.0, 127.0, -128.0, -126.0, -128.0..."
408471,424065,"[-105.0, -128.0, 15.0, -28.0, 68.0, -128.0, 63..."
408472,424067,"[-128.0, -128.0, 111.0, -35.0, 50.0, -89.0, -1..."


In [36]:
item_features = pl.DataFrame(mean_by_cat)
item_features = item_features.with_columns(pl.col("mean_title_projection").list.to_struct()).unnest("mean_title_projection")

item_features = item_features.join(addit_features, how="left", on="node")

item_features = item_features.rename({"node": "id"})
unmelted_item_features = (
    item_features.unpivot(index="id", on=[x for x in item_features.columns if x != 'id'])
)
unmelted_item_features = unmelted_item_features.rename({"variable": "feature"})
item_features = item_features.rename({"id": "node"})
item_features.head()

  item_features = item_features.with_columns(pl.col("mean_title_projection").list.to_struct()).unnest("mean_title_projection")


node,field_0,field_1,field_2,field_3,field_4,field_5,field_6,field_7,field_8,field_9,field_10,field_11,field_12,field_13,field_14,field_15,field_16,field_17,field_18,field_19,field_20,field_21,field_22,field_23,field_24,field_25,field_26,field_27,field_28,field_29,field_30,field_31,field_32,field_33,field_34,field_35,…,field_47,field_48,field_49,field_50,field_51,field_52,field_53,field_54,field_55,field_56,field_57,field_58,field_59,field_60,field_61,field_62,field_63,attr_704,attr_2278,attr_1433,attr_1897,attr_1550,attr_567,attr_3031,attr_3392,attr_3025,attr_4622,attr_3911,attr_2140,attr_1971,attr_1092,attr_3509,attr_124,attr_3154,attr_1598,attr_3939,attr_3640
u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,-128.0,127.0,127.0,-40.0,-92.5,-109.5,3.5,-44.0,106.0,90.0,33.0,7.5,-72.5,-49.5,111.0,27.0,23.5,-98.5,98.5,-26.0,-94.0,-11.5,-57.5,127.0,-34.0,5.5,-103.0,-30.0,75.5,38.0,3.5,17.5,16.5,-19.5,-40.0,9.0,…,-77.5,-5.0,-89.0,44.0,-60.0,74.0,-63.0,50.0,-125.5,55.0,-23.0,-41.0,5.0,114.5,122.0,66.0,23.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-128.0,91.0,127.0,-19.0,-72.0,-128.0,11.0,-128.0,127.0,127.0,9.0,21.0,-128.0,-123.0,127.0,-3.0,9.0,-86.0,78.0,-79.0,-89.0,-59.0,-8.0,127.0,17.0,-51.0,-128.0,-108.0,127.0,-33.0,-23.0,24.0,-31.0,21.0,-123.0,-97.0,…,-28.0,-93.0,-53.0,44.0,-20.0,84.0,-45.0,97.0,-128.0,116.0,-4.0,-74.0,60.0,127.0,127.0,18.0,-5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-128.0,-30.0,127.0,-119.0,-117.0,-128.0,-73.0,-16.0,127.0,100.0,81.0,117.0,-15.0,-128.0,112.0,9.0,36.0,-128.0,127.0,-128.0,-97.0,80.0,-38.0,127.0,120.0,38.0,-128.0,-74.0,124.0,-40.0,59.0,-25.0,-39.0,66.0,-128.0,-114.0,…,-59.0,71.0,-10.0,48.0,41.0,89.0,-6.0,69.0,-56.0,127.0,-89.0,-97.0,7.0,92.0,25.0,-14.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-128.0,122.0,127.0,-128.0,-86.0,-128.0,-128.0,-36.0,127.0,-13.0,112.0,-21.0,77.0,-54.0,75.0,115.0,39.0,-61.0,36.0,-98.0,-25.0,87.0,7.0,127.0,-6.0,50.0,-128.0,-128.0,104.0,-128.0,-43.0,-96.0,-43.0,44.0,-128.0,-128.0,…,-18.0,115.0,23.0,123.0,-45.0,127.0,-36.0,127.0,15.0,127.0,-34.0,-95.0,36.0,117.0,41.0,49.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,-128.0,42.0,127.0,-128.0,-80.0,-128.0,-128.0,-18.0,112.0,-17.0,127.0,-65.0,120.0,-68.0,52.0,79.0,52.0,-76.0,59.0,-114.0,-64.0,82.0,73.0,127.0,-3.0,85.0,-128.0,-105.0,26.0,-128.0,-35.0,45.0,35.0,31.0,-65.0,-107.0,…,-16.0,127.0,-36.0,66.0,-48.0,127.0,-47.0,106.0,-14.0,127.0,-63.0,48.0,32.0,81.0,-15.0,80.0,-31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
def dataframe2rectools(df):
    return (
        df[["cookie", "node", "event_date"]]
        .with_columns(pl.lit(1).alias('weight'))
        .rename({
            "cookie": Columns.User,
            "node": Columns.Item,
            "weight": Columns.Weight,
            "event_date": Columns.Datetime,
        })
    ).to_pandas()
df_train = dataframe2rectools(df_train)

KeyError: "None of [Index(['cookie', 'node', 'event_date'], dtype='object')] are in the [columns]"

In [38]:
df_final = dataframe2rectools(df_clickstream[["cookie", "node", "event_date"]])

# df_eval = df_eval.with_columns(pl.lit(1).alias("event_date"))
# df_eval = dataframe2rectools(df_eval)

In [39]:
df_final

Unnamed: 0,user_id,item_id,datetime,weight
0,0,115659,2025-02-05 02:30:59,1
1,0,115829,2025-01-24 21:16:57,1
2,1,7,2025-01-29 23:00:58,1
3,1,7,2025-02-17 14:55:17,1
4,1,214458,2025-01-17 19:23:29,1
...,...,...,...,...
68806147,149999,71511,2025-01-20 12:23:47,1
68806148,149999,71514,2025-01-24 14:26:57,1
68806149,149999,51162,2025-02-12 13:11:42,1
68806150,149999,71511,2025-02-16 12:35:35,1


In [40]:
candidates = pl.read_csv('data/test_hybrid_stage_candidates_ALS_200.csv')
candidates = candidates.to_pandas()

In [41]:
# candidates = candidates.to_pandas()
candidates['rank'] = candidates.groupby('cookie').cumcount() + 1 

item_features = item_features.to_pandas()
user_col = ['cookie']
item_col = item_features.columns
final_feat = (
    candidates
    # .merge(
    #     users[user_col],
    #     on=['user_id'],
    #     how='left')
    .merge(
        item_features[item_col],
        on=['node'],
        how='left')
)
X_final = final_feat.drop(drop_col, axis=1)

In [42]:
y_pred = ctb_model.predict_proba(X_final)

In [43]:
df_test_users

cookie
i64
52564
105000
57152
87303
37755
…
78910
64750
118889
131


In [45]:
final_feat["score"] = y_pred[:, 1]

In [None]:
del X_final
del candidates
del item_features

In [46]:
res = final_feat.sort_values(by=["cookie", "score"], ascending=[True, False]).groupby("cookie").head(40)

In [47]:
res = res[["cookie", "node"]].drop_duplicates()

In [48]:
res = pl.DataFrame(res)

In [49]:
addit = df_test_users.filter(pl.col("cookie").is_in(res["cookie"]) == False).with_columns(pl.lit(1).cast(pl.Int64).alias('node'))

In [51]:
addit

cookie,node
i64,i64


In [52]:
pl.concat([res, addit]).group_by("cookie").head(40)["cookie","node"]

cookie,node
i64,i64
96479,199971
96479,201038
96479,1922
96479,200005
96479,214241
…,…
145744,196156
145744,279316
145744,199422
145744,195969


In [53]:
pl.concat([res, addit]).group_by("cookie").head(40)["cookie","node"].write_csv('results/prediction_hybrid_good.csv')