In [1]:
from datetime import timedelta
import polars as pl
import pandas as pd
import implicit

from tools import load_data_actions, generate_lightfm_recs_mapper
from tqdm import tqdm

from lightfm.data import Dataset
from lightfm import LightFM

from rectools import Columns
from rectools.dataset import Dataset as RTDataset
from rectools.models import (
    ImplicitALSWrapperModel,
    ImplicitBPRWrapperModel,
    LightFMWrapperModel,
    PureSVDModel,
    ImplicitItemKNNWrapperModel,
    EASEModel
)

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle



# First Stage

In [2]:
SEED = 42
top_N = 40
DATA_DIR = 'data/'

df_test_users = pl.read_parquet(f'{DATA_DIR}/test_users.pq')
df_clickstream = pl.read_parquet(f'{DATA_DIR}/clickstream.pq')

df_cat_features = pl.read_parquet(f'{DATA_DIR}/cat_features_preproc_20.pq')
df_text_features = pl.read_parquet(f'{DATA_DIR}/text_features.pq')
df_event = pl.read_parquet(f'{DATA_DIR}/events.pq')

In [3]:
df_train, df_eval = load_data_actions(df_clickstream, df_event)

In [4]:
import numpy as np

mean_by_cat = (
    df_text_features
    .join(df_cat_features["item", "node"], on="item", how="left")
    .to_pandas()
    .groupby("node")["title_projection"]
    .apply(lambda vs: np.mean(np.stack(vs.values), axis=0).tolist())
    .reset_index(name="mean_title_projection")
)

mean_by_cat

Unnamed: 0,node,mean_title_projection
0,1,"[-128.0, 127.0, 127.0, -40.0, -92.5, -109.5, 3..."
1,2,"[-128.0, 91.0, 127.0, -19.0, -72.0, -128.0, 11..."
2,3,"[-128.0, -30.0, 127.0, -119.0, -117.0, -128.0,..."
3,4,"[-128.0, 122.0, 127.0, -128.0, -86.0, -128.0, ..."
4,5,"[-128.0, 42.0, 127.0, -128.0, -80.0, -128.0, -..."
...,...,...
408469,424063,"[-128.0, -128.0, 127.0, -128.0, -111.0, 1.0, -..."
408470,424064,"[-128.0, -128.0, 127.0, -128.0, -126.0, -128.0..."
408471,424065,"[-105.0, -128.0, 15.0, -28.0, 68.0, -128.0, 63..."
408472,424067,"[-128.0, -128.0, 111.0, -35.0, 50.0, -89.0, -1..."


In [5]:
item_features = pl.DataFrame(mean_by_cat)
item_features = item_features.with_columns(pl.col("mean_title_projection").list.to_struct()).unnest("mean_title_projection")
item_features = item_features.rename({"node": "id"})
# unmelted_item_features = (
#     item_features.unpivot(index="id", on=[x for x in item_features.columns if x != 'id'])
# )
# unmelted_item_features = unmelted_item_features.rename({"variable": "feature"})
item_features.head()

  item_features = item_features.with_columns(pl.col("mean_title_projection").list.to_struct()).unnest("mean_title_projection")


id,field_0,field_1,field_2,field_3,field_4,field_5,field_6,field_7,field_8,field_9,field_10,field_11,field_12,field_13,field_14,field_15,field_16,field_17,field_18,field_19,field_20,field_21,field_22,field_23,field_24,field_25,field_26,field_27,field_28,field_29,field_30,field_31,field_32,field_33,field_34,field_35,field_36,field_37,field_38,field_39,field_40,field_41,field_42,field_43,field_44,field_45,field_46,field_47,field_48,field_49,field_50,field_51,field_52,field_53,field_54,field_55,field_56,field_57,field_58,field_59,field_60,field_61,field_62,field_63
u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,-128.0,127.0,127.0,-40.0,-92.5,-109.5,3.5,-44.0,106.0,90.0,33.0,7.5,-72.5,-49.5,111.0,27.0,23.5,-98.5,98.5,-26.0,-94.0,-11.5,-57.5,127.0,-34.0,5.5,-103.0,-30.0,75.5,38.0,3.5,17.5,16.5,-19.5,-40.0,9.0,127.0,16.0,-85.5,17.0,97.5,-107.5,45.0,-65.0,3.0,28.5,-122.0,-77.5,-5.0,-89.0,44.0,-60.0,74.0,-63.0,50.0,-125.5,55.0,-23.0,-41.0,5.0,114.5,122.0,66.0,23.5
2,-128.0,91.0,127.0,-19.0,-72.0,-128.0,11.0,-128.0,127.0,127.0,9.0,21.0,-128.0,-123.0,127.0,-3.0,9.0,-86.0,78.0,-79.0,-89.0,-59.0,-8.0,127.0,17.0,-51.0,-128.0,-108.0,127.0,-33.0,-23.0,24.0,-31.0,21.0,-123.0,-97.0,127.0,66.0,-128.0,101.0,127.0,-109.0,43.0,-68.0,30.0,82.0,-128.0,-28.0,-93.0,-53.0,44.0,-20.0,84.0,-45.0,97.0,-128.0,116.0,-4.0,-74.0,60.0,127.0,127.0,18.0,-5.0
3,-128.0,-30.0,127.0,-119.0,-117.0,-128.0,-73.0,-16.0,127.0,100.0,81.0,117.0,-15.0,-128.0,112.0,9.0,36.0,-128.0,127.0,-128.0,-97.0,80.0,-38.0,127.0,120.0,38.0,-128.0,-74.0,124.0,-40.0,59.0,-25.0,-39.0,66.0,-128.0,-114.0,113.0,34.0,-128.0,29.0,127.0,-128.0,-37.0,34.0,76.0,43.0,-106.0,-59.0,71.0,-10.0,48.0,41.0,89.0,-6.0,69.0,-56.0,127.0,-89.0,-97.0,7.0,92.0,25.0,-14.0,36.0
4,-128.0,122.0,127.0,-128.0,-86.0,-128.0,-128.0,-36.0,127.0,-13.0,112.0,-21.0,77.0,-54.0,75.0,115.0,39.0,-61.0,36.0,-98.0,-25.0,87.0,7.0,127.0,-6.0,50.0,-128.0,-128.0,104.0,-128.0,-43.0,-96.0,-43.0,44.0,-128.0,-128.0,13.0,127.0,-128.0,6.0,127.0,-128.0,-69.0,-15.0,48.0,127.0,-128.0,-18.0,115.0,23.0,123.0,-45.0,127.0,-36.0,127.0,15.0,127.0,-34.0,-95.0,36.0,117.0,41.0,49.0,9.0
5,-128.0,42.0,127.0,-128.0,-80.0,-128.0,-128.0,-18.0,112.0,-17.0,127.0,-65.0,120.0,-68.0,52.0,79.0,52.0,-76.0,59.0,-114.0,-64.0,82.0,73.0,127.0,-3.0,85.0,-128.0,-105.0,26.0,-128.0,-35.0,45.0,35.0,31.0,-65.0,-107.0,-13.0,127.0,-128.0,-1.0,127.0,-45.0,-17.0,0.0,24.0,127.0,-76.0,-16.0,127.0,-36.0,66.0,-48.0,127.0,-47.0,106.0,-14.0,127.0,-63.0,48.0,32.0,81.0,-15.0,80.0,-31.0


In [6]:
def dataframe2rectools(df):
    return (
        df[["cookie", "node", "event_date"]]
        .with_columns(pl.lit(1).alias('weight'))
        .rename({
            "cookie": Columns.User,
            "node": Columns.Item,
            "weight": Columns.Weight,
            "event_date": Columns.Datetime,
        })
    ).to_pandas()
df_train = dataframe2rectools(df_train)

In [35]:
import polars as pl
import numpy as np
import faiss

class MeanEmbeddingRecommender:
    def __init__(self):
        self.user_embeddings = None  # user_id, mean_field_0 ... mean_field_63
        self.seen_items = None       # user_id, list[item_id]
        self.item_ids = None          # item_id array aligned with FAISS index
        self.faiss_index = None
        self.embedding_fields = None # Names of embedding fields

    def fit(self, df_clickstream: pl.DataFrame, df_text_vec: pl.DataFrame):
        # Merge to get user-item embeddings
        merged = df_clickstream.join(df_text_vec, on="item_id", how="inner")
        
        # Collect seen items per user
        self.seen_items = df_clickstream.group_by("user_id").agg(
            pl.col("item_id").alias("seen_items")
        )

        # Get embedding column names (field_0 ... field_63)
        self.embedding_fields = [col for col in df_text_vec.columns if col.startswith("field_")]
        
        # Compute mean embeddings per user
        self.user_embeddings = merged.group_by("user_id").agg(
            [pl.col(field).mean().alias(f"mean_{field}") for field in self.embedding_fields]
        )

        # Prepare FAISS index
        items_embeddings = df_text_vec.select(self.embedding_fields).to_numpy()
        items_embeddings = np.ascontiguousarray(items_embeddings, dtype='float32')
        
        # Normalize and create index
        self.item_ids = df_text_vec["item_id"].to_numpy()
        d = items_embeddings.shape[1]
        self.faiss_index = faiss.IndexFlatIP(d)
        faiss.normalize_L2(items_embeddings)
        self.faiss_index.add(items_embeddings)

    def recommend(self, user_id: str, k: int = 10) -> pl.DataFrame:
        # Get user embedding
        user_data = self.user_embeddings.filter(pl.col("user_id") == user_id)
        if user_data.is_empty():
            return pl.DataFrame({"item_id": [], "score": []})
        
        # Extract and normalize user embedding
        user_embedding = user_data.select(self.embedding_fields).to_numpy().astype('float32')
        faiss.normalize_L2(user_embedding)
        
        # Get seen items
        seen_data = self.seen_items.filter(pl.col("user_id") == user_id)
        seen_items = set() if seen_data.is_empty() else set(seen_data["seen_items"].item())
        
        # Determine retrieval size
        seen_count = len(seen_items)
        retrieve_k = min(k + seen_count, self.faiss_index.ntotal)
        if retrieve_k <= 0:
            return pl.DataFrame({"item_id": [], "score": []})
        
        # Search FAISS index
        scores, indices = self.faiss_index.search(user_embedding, retrieve_k)
        scores = scores[0]
        indices = indices[0]
        
        # Map to item IDs and filter seen items
        candidates = [(self.item_ids[i], scores[i]) for i in indices if self.item_ids[i] not in seen_items]
        
        # Select top-k remaining
        top_k = sorted(candidates, key=lambda x: -x[1])[:k]
        if not top_k:
            return pl.DataFrame({"item_id": [], "score": []})
        
        return pl.DataFrame({
            "item_id": [item[0] for item in top_k],
            "score": [item[1] for item in top_k]
        })

import polars as pl
import numpy as np
from annoy import AnnoyIndex

import polars as pl
import numpy as np
from annoy import AnnoyIndex

class LightweightRecommender:
    def __init__(self, embedding_dim=64, n_trees=20):
        self.embedding_dim = embedding_dim
        self.n_trees = n_trees
        self.user_means = None      # user_id, mean_embedding (list)
        self.seen_items = None      # user_id -> list[item_id]
        self.item_mapping = None    # item_id -> embedding
        self.annoy_index = None
        self.embedding_fields = None

    def fit(self, df_clickstream: pl.LazyFrame, df_text_vec: pl.LazyFrame):
        # Process embeddings
        self.embedding_fields = sorted(
            [col for col in df_text_vec.columns if col.startswith("field_")],
            key=lambda x: int(x.split('_')[1])
        )
        
        # Join data and process in streaming mode
        merged = df_clickstream.lazy().join(
            df_text_vec.lazy(),
            on="item_id",
            how="inner"
        )

        # Compute user means (correct aggregation)
        self.user_means = merged.group_by("user_id").agg(
            pl.col("item_id").alias("seen_items"),
            pl.concat_list([pl.col(f).mean() for f in self.embedding_fields]).alias("mean_embedding")
        ).collect(streaming=True)

        # Build ANNOY index
        self.item_mapping = df_text_vec.lazy().collect(streaming=True)
        self.annoy_index = AnnoyIndex(self.embedding_dim, 'dot')
        
        # Add items to index
        for i, row in enumerate(self.item_mapping.iter_rows(named=True)):
            embedding = [row[f] for f in self.embedding_fields]
            self.annoy_index.add_item(i, embedding)
        
        self.annoy_index.build(self.n_trees)

    def recommend(self, user_id: str, k: int = 10) -> pl.DataFrame:
        # Handle missing users
        user_data = self.user_means.filter(pl.col("user_id") == user_id)
        if user_data.is_empty():
            return pl.DataFrame({"item_id": [], "score": []})

        # Get user embedding and seen items
        mean_embedding = np.array(
            user_data["mean_embedding"].item(), 
            dtype=np.float32
        )
        seen_items = set(user_data["seen_items"].item())

        # Find approximate neighbors
        item_ids, scores = self.annoy_index.get_nns_by_vector(
            mean_embedding,
            n=k + len(seen_items),
            include_distances=True
        )

        # Convert to cosine similarity (1 - angular distance^2/2)
        # cosine_scores = 1 - (np.array(scores)**2)/2

        # Filter seen items and format results
        results = []
        all_items = self.item_mapping["item_id"].to_list()
        for i, score in zip(item_ids, scores):
            item_id = all_items[i]
            if item_id not in seen_items:
                results.append({"item_id": item_id, "score": score})
                if len(results) >= k:
                    break

        return pl.DataFrame(results)

In [8]:
item_features = item_features.rename({"id": "item_id"})

In [9]:
df_train = df_train[["user_id", "item_id"]]

In [10]:
item_features

item_id,field_0,field_1,field_2,field_3,field_4,field_5,field_6,field_7,field_8,field_9,field_10,field_11,field_12,field_13,field_14,field_15,field_16,field_17,field_18,field_19,field_20,field_21,field_22,field_23,field_24,field_25,field_26,field_27,field_28,field_29,field_30,field_31,field_32,field_33,field_34,field_35,field_36,field_37,field_38,field_39,field_40,field_41,field_42,field_43,field_44,field_45,field_46,field_47,field_48,field_49,field_50,field_51,field_52,field_53,field_54,field_55,field_56,field_57,field_58,field_59,field_60,field_61,field_62,field_63
u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,-128.0,127.0,127.0,-40.0,-92.5,-109.5,3.5,-44.0,106.0,90.0,33.0,7.5,-72.5,-49.5,111.0,27.0,23.5,-98.5,98.5,-26.0,-94.0,-11.5,-57.5,127.0,-34.0,5.5,-103.0,-30.0,75.5,38.0,3.5,17.5,16.5,-19.5,-40.0,9.0,127.0,16.0,-85.5,17.0,97.5,-107.5,45.0,-65.0,3.0,28.5,-122.0,-77.5,-5.0,-89.0,44.0,-60.0,74.0,-63.0,50.0,-125.5,55.0,-23.0,-41.0,5.0,114.5,122.0,66.0,23.5
2,-128.0,91.0,127.0,-19.0,-72.0,-128.0,11.0,-128.0,127.0,127.0,9.0,21.0,-128.0,-123.0,127.0,-3.0,9.0,-86.0,78.0,-79.0,-89.0,-59.0,-8.0,127.0,17.0,-51.0,-128.0,-108.0,127.0,-33.0,-23.0,24.0,-31.0,21.0,-123.0,-97.0,127.0,66.0,-128.0,101.0,127.0,-109.0,43.0,-68.0,30.0,82.0,-128.0,-28.0,-93.0,-53.0,44.0,-20.0,84.0,-45.0,97.0,-128.0,116.0,-4.0,-74.0,60.0,127.0,127.0,18.0,-5.0
3,-128.0,-30.0,127.0,-119.0,-117.0,-128.0,-73.0,-16.0,127.0,100.0,81.0,117.0,-15.0,-128.0,112.0,9.0,36.0,-128.0,127.0,-128.0,-97.0,80.0,-38.0,127.0,120.0,38.0,-128.0,-74.0,124.0,-40.0,59.0,-25.0,-39.0,66.0,-128.0,-114.0,113.0,34.0,-128.0,29.0,127.0,-128.0,-37.0,34.0,76.0,43.0,-106.0,-59.0,71.0,-10.0,48.0,41.0,89.0,-6.0,69.0,-56.0,127.0,-89.0,-97.0,7.0,92.0,25.0,-14.0,36.0
4,-128.0,122.0,127.0,-128.0,-86.0,-128.0,-128.0,-36.0,127.0,-13.0,112.0,-21.0,77.0,-54.0,75.0,115.0,39.0,-61.0,36.0,-98.0,-25.0,87.0,7.0,127.0,-6.0,50.0,-128.0,-128.0,104.0,-128.0,-43.0,-96.0,-43.0,44.0,-128.0,-128.0,13.0,127.0,-128.0,6.0,127.0,-128.0,-69.0,-15.0,48.0,127.0,-128.0,-18.0,115.0,23.0,123.0,-45.0,127.0,-36.0,127.0,15.0,127.0,-34.0,-95.0,36.0,117.0,41.0,49.0,9.0
5,-128.0,42.0,127.0,-128.0,-80.0,-128.0,-128.0,-18.0,112.0,-17.0,127.0,-65.0,120.0,-68.0,52.0,79.0,52.0,-76.0,59.0,-114.0,-64.0,82.0,73.0,127.0,-3.0,85.0,-128.0,-105.0,26.0,-128.0,-35.0,45.0,35.0,31.0,-65.0,-107.0,-13.0,127.0,-128.0,-1.0,127.0,-45.0,-17.0,0.0,24.0,127.0,-76.0,-16.0,127.0,-36.0,66.0,-48.0,127.0,-47.0,106.0,-14.0,127.0,-63.0,48.0,32.0,81.0,-15.0,80.0,-31.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
424063,-128.0,-128.0,127.0,-128.0,-111.0,1.0,-36.0,98.0,103.0,-44.0,127.0,-67.0,38.0,-128.0,75.0,69.0,127.0,-103.0,-21.0,43.0,-32.0,56.0,-34.0,127.0,54.0,-19.0,-10.0,-4.0,34.0,-36.0,-86.0,-93.0,-67.0,-1.0,-79.0,-128.0,81.0,42.0,-102.0,-126.0,-22.0,-128.0,-128.0,6.0,23.0,20.0,1.0,-39.0,127.0,-128.0,-38.0,-19.0,127.0,-31.0,-87.0,-26.0,127.0,122.0,-14.0,-18.0,52.0,127.0,-3.0,-69.0
424064,-128.0,-128.0,127.0,-128.0,-126.0,-128.0,47.0,127.0,84.0,-79.0,127.0,44.0,127.0,-128.0,127.0,54.0,-41.0,-128.0,66.0,-103.0,89.0,-44.0,29.0,117.0,-100.0,97.0,-128.0,-5.0,118.0,8.0,33.0,-34.0,57.0,-84.0,-128.0,-94.0,-57.0,101.0,-128.0,-99.0,30.0,-128.0,13.0,-79.0,59.0,63.0,-128.0,17.0,67.0,101.0,42.0,-115.0,18.0,-116.0,-100.0,-45.0,127.0,-73.0,43.0,38.0,-55.0,127.0,44.0,14.0
424065,-105.0,-128.0,15.0,-28.0,68.0,-128.0,63.0,123.0,-3.0,-113.0,127.0,-128.0,127.0,-128.0,127.0,112.0,-104.0,-128.0,54.0,-82.0,-100.0,81.0,-11.0,127.0,1.0,64.0,-128.0,-128.0,93.0,-128.0,25.0,-77.0,127.0,-128.0,-78.0,-24.0,97.0,-7.0,-80.0,2.0,35.0,-128.0,-128.0,-29.0,-23.0,47.0,-69.0,24.0,49.0,0.0,-30.0,17.0,22.0,127.0,127.0,-128.0,-8.0,-33.0,-67.0,-36.0,-15.0,127.0,-19.0,-57.0
424067,-128.0,-128.0,111.0,-35.0,50.0,-89.0,-11.0,127.0,123.0,-111.0,127.0,-15.0,127.0,-110.0,51.0,91.0,-106.0,-73.0,81.0,-128.0,19.0,-72.0,-128.0,102.0,-14.0,72.0,-128.0,-128.0,122.0,-44.0,-128.0,99.0,96.0,22.0,-128.0,-128.0,113.0,67.0,-128.0,63.0,118.0,-121.0,-93.0,4.0,23.0,63.0,19.0,7.0,39.0,125.0,4.0,22.0,48.0,-128.0,43.0,-128.0,64.0,-102.0,-87.0,-51.0,53.0,104.0,-128.0,-9.0


In [36]:
recommender = LightweightRecommender()
recommender.fit(pl.DataFrame(df_train), item_features)

  ).collect(streaming=True)

More information on the new streaming engine: https://github.com/pola-rs/polars/issues/20947
  ).collect(streaming=True)
  self.item_mapping = df_text_vec.lazy().collect(streaming=True)

More information on the new streaming engine: https://github.com/pola-rs/polars/issues/20947
  self.item_mapping = df_text_vec.lazy().collect(streaming=True)


In [37]:
df_eval

cookie,node,event
i64,u32,i64
140658,232301,10
3456,85601,10
88368,26351,15
104498,214466,10
133830,51168,19
…,…,…
111036,234699,10
28511,51164,5
85224,298594,15
68851,165675,10


In [38]:
users = df_eval["cookie"].unique().to_numpy()

In [39]:
res = pl.DataFrame()
for u_id in tqdm(users, total=users.shape[0]):
    r = recommender.recommend(u_id, 40)
    r = r.with_columns(pl.lit(u_id).alias("cookie"))
    res = pl.concat([res, r])

100%|██████████| 55081/55081 [08:12<00:00, 111.92it/s]


In [40]:
res.with_columns(pl.col("score").mul(-1))

item_id,score,cookie
i64,f64,i32
227851,-254203.703125,0
306200,-249524.65625,0
118828,-248641.03125,0
318696,-247911.0625,0
140689,-245918.046875,0
…,…,…
152334,-277845.28125,149998
151741,-277104.0,149998
152538,-274664.03125,149998
152057,-274187.21875,149998


In [41]:
res.sort(by=["item_id", "score"])

item_id,score,cookie
i64,f64,i32
2,208919.03125,101820
2,211957.828125,66515
2,213673.5625,113190
2,214136.828125,90146
2,214841.296875,107525
…,…,…
424032,296849.46875,130783
424032,305396.125,73217
424042,353179.625,77237
424067,258446.4375,11163


In [42]:
res.with_columns(pl.col("score").mul(-1))

item_id,score,cookie
i64,f64,i32
227851,-254203.703125,0
306200,-249524.65625,0
118828,-248641.03125,0
318696,-247911.0625,0
140689,-245918.046875,0
…,…,…
152334,-277845.28125,149998
151741,-277104.0,149998
152538,-274664.03125,149998
152057,-274187.21875,149998


In [43]:
from tools import recall_at
df_pred = pl.DataFrame(
    res.rename({"cookie": "user_id"})[["user_id", "item_id"]],
    schema={"user_id": pl.Int64, "item_id": pl.Int64}
    ).rename({"user_id": "cookie", "item_id": "node"})
recall_at(df_eval, df_pred, k=40)

0.0012325348781361566

In [22]:
df_pred.group_by("cookie").head(40)["cookie","node"].write_csv('data/first_stage_prediction_ANN_40.csv')

# Submission

In [8]:
SEED = 42
top_N = 40
DATA_DIR = 'data/'

df_test_users = pl.read_parquet(f'{DATA_DIR}/test_users.pq')
df_clickstream = pl.read_parquet(f'{DATA_DIR}/clickstream.pq')

df_cat_features = pl.read_parquet(f'{DATA_DIR}/cat_features_preproc_20.pq')
df_text_features = pl.read_parquet(f'{DATA_DIR}/text_features.pq')
df_event = pl.read_parquet(f'{DATA_DIR}/events.pq')

In [9]:
df_train = dataframe2rectools(df_clickstream)

In [10]:
del df_clickstream
del df_eval
del df_cat_features
del df_text_features
del df_event

In [11]:
from typing import Iterable, List, Any
import pandas as pd

def chunker(seq: List[Any], batch_size: int) -> Iterable[List[Any]]:
    """
    Yield successive batch_size‐sized chunks from seq.
    """
    for i in range(0, len(seq), batch_size):
        yield seq[i : i + batch_size]

In [12]:
df_train

Unnamed: 0,user_id,item_id,datetime,weight
0,0,115659,2025-02-05 02:30:59,1
1,0,115829,2025-01-24 21:16:57,1
2,1,7,2025-01-29 23:00:58,1
3,1,7,2025-02-17 14:55:17,1
4,1,214458,2025-01-17 19:23:29,1
...,...,...,...,...
68806147,149999,71511,2025-01-20 12:23:47,1
68806148,149999,71514,2025-01-24 14:26:57,1
68806149,149999,51162,2025-02-12 13:11:42,1
68806150,149999,71511,2025-02-16 12:35:35,1


In [30]:
all_results = []
batch_size = df_test_users.shape[0] // 4
print(batch_size)
for batch_users in tqdm(chunker(df_test_users, batch_size)):
    batch_users = batch_users.to_numpy().flatten()
    batch_df = df_train[df_train["user_id"].isin(batch_users)]
    recommender = LightweightRecommender()
    recommender.fit(pl.DataFrame(batch_df), item_features.rename({"id": "item_id"}))

    res = pl.DataFrame()
    for u_id in tqdm(batch_users, total=batch_users.shape[0]):
        r = recommender.recommend(u_id, 40)
        r = r.with_columns(pl.lit(u_id).alias("cookie"))
        res = pl.concat([res, r])

    all_results.append(res)

23079


  ).collect(streaming=True)

More information on the new streaming engine: https://github.com/pola-rs/polars/issues/20947
  ).collect(streaming=True)
  self.item_mapping = df_text_vec.lazy().collect(streaming=True)

More information on the new streaming engine: https://github.com/pola-rs/polars/issues/20947
  self.item_mapping = df_text_vec.lazy().collect(streaming=True)
100%|██████████| 23079/23079 [02:42<00:00, 141.62it/s]
100%|██████████| 23079/23079 [02:40<00:00, 143.86it/s]
100%|██████████| 23079/23079 [02:38<00:00, 145.17it/s]
100%|██████████| 23079/23079 [02:51<00:00, 134.72it/s]
100%|██████████| 3/3 [00:00<00:00, 157.09it/s]
5it [11:15, 135.11s/it]


In [33]:
final_rec = pl.concat(all_results)

In [34]:
final_rec.rename({"item_id": "node"})

item_id,score,cookie
i64,f64,i32
259245,-4252.691298,52564
259288,-4621.843477,52564
166430,-4844.847181,52564
115669,-5668.684475,52564
170542,-5773.388956,52564
…,…,…
189742,-26028.694748,37487
231965,-26292.010587,37487
231962,-26386.805769,37487
120739,-26399.500323,37487


In [36]:
final_rec.rename({"item_id": "node"}).group_by("cookie").head(40)["cookie","node"].write_csv('data/test_prediction_ANN_40.csv')