# Feature Pipeline: Computing features

In [3]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Resolve the parent of the current working directory and append it to the import
# path, presumably so the `recsys` package imported below can be found.
# NOTE(review): assumes the notebook is launched from a direct subfolder of the repo root — confirm.
repo_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(repo_path)

import torch
import warnings
import polars as pl

from pprint import pprint
from loguru import logger
from sentence_transformers import SentenceTransformer

# Config
from recsys.config import settings

# BigQuery
from recsys.gcp.bigquery import client as bq_client

# Feature Store
from recsys.gcp.feature_store import client as fs_client

# Features
from recsys.core.features.article_features import (
    compute_features_articles,
    generate_embeddings_for_dataframe,
)
from recsys.core.features.ranking_features import compute_rankings_dataset
from recsys.core.features.customer_features import (
    DatasetSampler,
    compute_features_customers,
)
from recsys.core.features.interaction_features import generate_interaction_data
from recsys.core.features.transaction_features import compute_features_transactions

# Raw Data
from recsys.data.sources import h_and_m_data

# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the
# libraries imported above; consider narrowing it to specific categories/modules.
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Data-source switch read by the loading cells: when True the raw tables come from
# `h_and_m_data.extract_*_df()`, otherwise they are read from CSV files under
# `{repo_path}/data/`. It is also forwarded to `compute_features_articles`.
ONLINE: bool = True

# 🗄️ Articles data

The **article_id** and **product_code** serve different purposes in the context of H&M's product database:

- **Article ID**: This is a unique identifier assigned to each individual article within the database. It is typically used for internal tracking and management purposes. Each distinct item or variant of a product (e.g., different sizes or colors) would have its own unique article_id.

- **Product Code**: This is also a unique identifier, but it is associated with a specific product or style rather than individual articles. It represents a broader category or type of product within H&M's inventory. Multiple articles may share the same product code if they belong to the same product line or style.

While both are identifiers, the article_id is unique to an individual item, whereas the product_code represents a broader category or style of product and is shared by all of its variants.

Here is an example:

**Product: Basic T-Shirt**

- **Product Code:** TS001

- **Article IDs:**
    - Article ID: 1001 (Size: Small, Color: White)
    - Article ID: 1002 (Size: Medium, Color: White)
    - Article ID: 1003 (Size: Large, Color: White)
    - Article ID: 1004 (Size: Small, Color: Black)
    - Article ID: 1005 (Size: Medium, Color: Black)

In this example, "TS001" is the product code for the basic t-shirt style. Each variant of this t-shirt (e.g., different sizes and colors) has its own unique article_id.



In [5]:
# Load the raw articles table from the source selected by ONLINE
# (`h_and_m_data` extractor vs. local CSV), then show its dimensions.
articles_df = (
    h_and_m_data.extract_articles_df()
    if ONLINE
    else pl.read_csv(source=f"{repo_path}/data/articles.csv")
)

articles_df.shape

(105542, 25)

In [6]:
# Preview the first three raw article rows.
articles_df.head(3)

article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
i64,i64,str,i64,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,str,i64,str,i64,str,i64,str,str
108775015,108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"
108775044,108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",10,"""White""",3,"""Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"
108775051,108775,"""Strap top (1)""",253,"""Vest top""","""Garment Upper body""",1010017,"""Stripe""",11,"""Off White""",1,"""Dusty Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"


## Articles feature engineering

In [7]:
# Run article feature engineering and rebind `articles_df` to the result.
# Comparing the previews before and after this cell: `article_id` becomes a string,
# `detail_desc` is dropped, and `prod_name_length`, `article_description` and
# `image_url` are added (25 -> 27 columns).
# NOTE(review): the input name is overwritten, so re-running this cell on its own
# feeds the already-processed frame back in — re-run the loading cell first.
articles_df = compute_features_articles(articles_df, ONLINE, repo_path)
articles_df.shape

[32m2025-02-19 13:28:40.879[0m | [1mINFO    [0m | [36mrecsys.core.features.article_features[0m:[36mcompute_features_articles[0m:[36m98[0m - [1mComputing article features...[0m
[32m2025-02-19 13:28:41.238[0m | [1mINFO    [0m | [36mrecsys.core.features.article_features[0m:[36mcompute_features_articles[0m:[36m122[0m - [1mArticle feature computation complete[0m


(105542, 27)

In [8]:
# Inspect the engineered article features.
articles_df.head(3)

article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,prod_name_length,article_description,image_url
str,i64,str,i64,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,str,i64,str,i64,str,i64,str,u32,str,str
"""108775015""",108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",9,"""Strap top - Vest top in Garmen…","""gs://gonzo-recsys-data/h-and-m…"
"""108775044""",108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",10,"""White""",3,"""Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",9,"""Strap top - Vest top in Garmen…","""gs://gonzo-recsys-data/h-and-m…"
"""108775051""",108775,"""Strap top (1)""",253,"""Vest top""","""Garment Upper body""",1010017,"""Stripe""",11,"""Off White""",1,"""Dusty Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",13,"""Strap top (1) - Vest top in Ga…","""gs://gonzo-recsys-data/h-and-m…"


## Create embeddings from the articles description

In [9]:
# Log the first three generated article descriptions, numbered from 1.
description_preview = articles_df["article_description"].head(n=3)
for position, description in enumerate(description_preview, start=1):
    logger.info(f"Item {position}:\n{description}")

[32m2025-02-19 13:28:45.240[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 1:
Strap top - Vest top in Garment Upper body
 Appearance: Solid
 Color: Dark Black 9
 Category: Ladieswear Womens Everyday Basics Jersey Basic
 Details: Jersey top with narrow shoulder straps.[0m
[32m2025-02-19 13:28:45.240[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 2:
Strap top - Vest top in Garment Upper body
 Appearance: Solid
 Color: Light White 10
 Category: Ladieswear Womens Everyday Basics Jersey Basic
 Details: Jersey top with narrow shoulder straps.[0m
[32m2025-02-19 13:28:45.240[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 3:
Strap top (1) - Vest top in Garment Upper body
 Appearance: Stripe
 Color: Dusty Light White 11
 Category: Ladieswear Womens Everyday Basics Jersey Basic
 Details: Jersey top with narrow shoulder straps.[0m


In [10]:
# Pick the torch backend in order of preference: CUDA, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# `{device=}` renders as `device='<name>'`; the model id is interpolated as-is
# (no `$` prefix — f-strings use bare braces for placeholders).
logger.info(
    f"Loading {settings.FEATURES_EMBEDDING_MODEL_ID} embedding model to {device=}"
)

# Load embedding model from SentenceTransformers model registry
model = SentenceTransformer(settings.FEATURES_EMBEDDING_MODEL_ID, device=device)

[32m2025-02-19 13:28:48.692[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mLoading $all-MiniLM-L6-v2 embedding model to device='mps'[0m


In [12]:
# Encode the `article_description` column with the sentence-transformer loaded
# above, using a batch size of 128, and rebind `articles_df` to the result.
# The following preview cell reads the resulting `embeddings` column.
articles_df = generate_embeddings_for_dataframe(
    articles_df, "article_description", model, batch_size=128
)

Generating embeddings...:   0%|          | 0/105542 [00:00<?, ?it/s]

Each article description now has a numerical vector (an embedding) that we can feed to a model, as opposed to the raw string describing the item.

In [13]:
# Show each description next to its embedding vector.
articles_df[["article_description", "embeddings"]].head(3)

article_description,embeddings
str,list[f64]
"""Strap top - Vest top in Garmen…","[-0.022105, 0.065497, … 0.011869]"
"""Strap top - Vest top in Garmen…","[-0.011306, 0.069066, … 0.014419]"
"""Strap top (1) - Vest top in Ga…","[-0.033228, 0.077421, … 0.016747]"


In [14]:
# Example image location for the first article (a `gs://` object path).
articles_df["image_url"][0]

'gs://gonzo-recsys-data/h-and-m/images/010/0108775015.jpg'

In [15]:
from IPython.display import HTML, display

# Render the images of the last 12 articles in a 6-column CSS grid.
image_urls = articles_df["image_url"].tail(12).to_list()
grid_html = '<div style="display: grid; grid-template-columns: repeat(6, 1fr); gap: 10px; max-width: 900px;">'

for url in image_urls:
    # Browsers cannot fetch the `gs://` scheme, so map it to the HTTPS storage endpoint.
    # NOTE(review): this only renders if the objects are readable without
    # authentication — confirm the bucket's access settings.
    if url and url.startswith("gs://"):
        url = "https://storage.googleapis.com/" + url[len("gs://"):]
    grid_html += f'<img src="{url}" style="width: 100%; height: auto;">'

grid_html += "</div>"

display(HTML(grid_html))

# 👯‍♀️ Customers Data

In [17]:
# Load the raw customers table from the source selected by ONLINE
# (`h_and_m_data` extractor vs. local CSV), then show its dimensions.
customers_df = (
    h_and_m_data.extract_customers_df()
    if ONLINE
    else pl.read_csv(source=f"{repo_path}/data/customers.csv")
)

customers_df.shape

(1371980, 7)

In [18]:
# Preview the first three raw customer rows.
customers_df.head(3)

customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
str,f64,f64,str,str,i64,str
"""00000dbacae5abe5e23885899a1fa4…",,,"""ACTIVE""","""NONE""",49,"""52043ee2162cf5aa7ee79974281641…"
"""0000423b00ade91418cceaf3b26c6a…",,,"""ACTIVE""","""NONE""",25,"""2973abc54daa8a5f8ccfe9362140c6…"
"""000058a12d5b43e67d225668fa1f8d…",,,"""ACTIVE""","""NONE""",24,"""64f17e6a330a85798e4998f62d0930…"


In [19]:
# Count missing values per column to see which fields need handling.
customers_df.null_count()

customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
u32,u32,u32,u32,u32,u32,u32
0,895050,907576,6062,16009,15861,0


## Customers feature engineering

In [20]:
# "Before" snapshot of the customers frame, for comparison with the engineered
# version shown after the next cell (the frame is unchanged since the earlier preview).
customers_df.head(3)

customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
str,f64,f64,str,str,i64,str
"""00000dbacae5abe5e23885899a1fa4…",,,"""ACTIVE""","""NONE""",49,"""52043ee2162cf5aa7ee79974281641…"
"""0000423b00ade91418cceaf3b26c6a…",,,"""ACTIVE""","""NONE""",25,"""2973abc54daa8a5f8ccfe9362140c6…"
"""000058a12d5b43e67d225668fa1f8d…",,,"""ACTIVE""","""NONE""",24,"""64f17e6a330a85798e4998f62d0930…"


In [21]:
# Run customer feature engineering with `drop_null_age=True` and rebind `customers_df`.
# Comparing the previews before and after: `FN`, `Active` and `fashion_news_frequency`
# are gone, `age` is now f64, and an `age_group` column is added. The row count drops
# by 15,861, matching the null count of `age` shown above.
# NOTE(review): the input name is overwritten, so re-running this cell on its own
# feeds the already-processed frame back in — re-run the loading cell first.
customers_df = compute_features_customers(customers_df, drop_null_age=True)
customers_df.shape

[32m2025-02-19 13:36:19.240[0m | [1mINFO    [0m | [36mrecsys.core.features.customer_features[0m:[36mcompute_features_customers[0m:[36m129[0m - [1mComputing customer features...[0m
[32m2025-02-19 13:36:19.307[0m | [1mINFO    [0m | [36mrecsys.core.features.customer_features[0m:[36mcompute_features_customers[0m:[36m156[0m - [1mCustomer feature computation complete[0m


(1356119, 5)

In [22]:
# Inspect the engineered customer features.
customers_df.head(3)

customer_id,club_member_status,age,postal_code,age_group
str,str,f64,str,str
"""00000dbacae5abe5e23885899a1fa4…","""ACTIVE""",49.0,"""52043ee2162cf5aa7ee79974281641…","""46-55"""
"""0000423b00ade91418cceaf3b26c6a…","""ACTIVE""",25.0,"""2973abc54daa8a5f8ccfe9362140c6…","""19-25"""
"""000058a12d5b43e67d225668fa1f8d…","""ACTIVE""",24.0,"""64f17e6a330a85798e4998f62d0930…","""19-25"""


# 🧾 Transactions Data

In [24]:
# Load the raw transactions table from the source selected by ONLINE
# (`h_and_m_data` extractor vs. local CSV), then show its dimensions.
transactions_df = (
    h_and_m_data.extract_transactions_df()
    if ONLINE
    else pl.read_csv(source=f"{repo_path}/data/transactions_train.csv")
)

transactions_df.shape

(31788324, 5)

In [25]:
# Preview the first three raw transaction rows.
transactions_df.head(3)

t_dat,customer_id,article_id,price,sales_channel_id
date,str,i64,f64,i64
2018-09-20,"""000058a12d5b43e67d225668fa1f8d…",663713001,0.050831,2
2018-09-20,"""000058a12d5b43e67d225668fa1f8d…",541518023,0.030492,2
2018-09-20,"""00007d2de826758b65a93dd24ce629…",505221004,0.015237,2


## Transactions feature engineering

In [26]:
# Look at the raw purchase-date column (`date` dtype) before feature engineering.
transactions_df["t_dat"].head(3)

t_dat
date
2018-09-20
2018-09-20
2018-09-20


In [31]:
# Run transaction feature engineering and rebind `transactions_df`.
# Comparing the previews before and after: `year`, `month`, `day` and `day_of_week`
# are added (5 -> 9 columns), `t_dat` changes from `date` to i64, and `article_id`
# becomes a string.
# NOTE(review): the input name is overwritten, so re-running this cell on its own
# feeds the already-processed frame back in — re-run the loading cell first.
transactions_df = compute_features_transactions(transactions_df)
transactions_df.shape

[32m2025-02-19 13:53:59.604[0m | [1mINFO    [0m | [36mrecsys.core.features.transaction_features[0m:[36mcompute_features_transactions[0m:[36m82[0m - [1mComputing transaction features...[0m


(31788324, 9)

The time of the year a purchase was made should be a strong predictor, as seasonality plays a big factor in fashion purchases. Here, the purchase date is decomposed into `year`, `month`, `day`, and `day_of_week` features. Note that the month is a cyclical feature (January is as close to December as it is to February), so it can additionally be mapped to the unit circle using sine and cosine before being fed to a model.

Thus, the features of the transactions DataFrame look as follows:

In [32]:
# Inspect the engineered transaction features.
transactions_df.head(3)

t_dat,customer_id,article_id,price,sales_channel_id,year,month,day,day_of_week
i64,str,str,f64,i64,i32,i8,i8,i8
0,"""000058a12d5b43e67d225668fa1f8d…","""663713001""",0.050831,2,2018,9,20,4
0,"""000058a12d5b43e67d225668fa1f8d…","""541518023""",0.030492,2,2018,9,20,4
0,"""00007d2de826758b65a93dd24ce629…","""505221004""",0.015237,2,2018,9,20,4


We don't want to work with all ~30 million transactions in this series, as everything would take too long to run. Thus, we create a subset of the original dataset by randomly sampling customers and keeping only their transactions.

In [36]:
# Down-sample to `settings.CUSTOMER_DATA_SIZE` customers and rebind both frames
# to the sampled customers and their transactions.
sampler = DatasetSampler(size=settings.CUSTOMER_DATA_SIZE)
dataset_subset = sampler.sample(
    transactions_df=transactions_df,
    customers_df=customers_df,
)

customers_df, transactions_df = (
    dataset_subset["customers"],
    dataset_subset["transactions"],
)

[32m2025-02-19 13:57:34.951[0m | [1mINFO    [0m | [36mrecsys.core.features.customer_features[0m:[36msample[0m:[36m53[0m - [1mSampling 1000 customers[0m
[32m2025-02-19 13:57:35.344[0m | [1mINFO    [0m | [36mrecsys.core.features.customer_features[0m:[36msample[0m:[36m57[0m - [1mOriginal transactions count: 31788324[0m
[32m2025-02-19 13:57:37.705[0m | [1mINFO    [0m | [36mrecsys.core.features.customer_features[0m:[36msample[0m:[36m62[0m - [1mFiltered transactions count: 23799[0m


In [37]:
# Confirm the size of the sampled transactions frame.
transactions_df.shape

(23799, 9)

Some of the remaining customers:

In [38]:
# Log up to ten distinct customer IDs that remain after sampling.
remaining_customer_ids = transactions_df["customer_id"].unique().head(10)
for customer_id in remaining_customer_ids:
    logger.info(f"Logging customer ID: {customer_id}")

[32m2025-02-19 13:57:43.983[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging customer ID: bc5f743c357366e8a1e00b00a81811f1bc95fa5552751dfca6bc00fc8702a872[0m
[32m2025-02-19 13:57:43.983[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging customer ID: dc507db500cae37a7fc775cf5cdc6f404fc758d0d987f9e5bc35632aacd2e25a[0m
[32m2025-02-19 13:57:43.983[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging customer ID: 98e888fd227e8d34803e62152c04d3697e0eaeda3c5a19616081f2b95eb114fa[0m
[32m2025-02-19 13:57:43.984[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging customer ID: b4397f1ef360771e411afa9b43ad9ad0e46d4b9a6d6cf29b1c4f37fdf894b787[0m
[32m2025-02-19 13:57:43.984[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLogging customer ID: 86cbcefd21772dfc655a0f91ee1045eda34a1941302413a1d9bf1bfeb2f9313c[0m
[32m2025-02-19

# 🤳🏻 Interaction data

To train our models, we need more than just the transactions DataFrame. We need positive samples that signal whether a customer clicked or bought an item, but we also need negative samples that signal no interactions between a customer and an item.

In [39]:
# Build the interaction dataset from the sampled transactions. Per the preview in
# the next cell, the result has the columns `t_dat`, `customer_id`, `article_id`,
# `interaction_score` and `prev_article_id`.
interaction_df = generate_interaction_data(transactions_df)
interaction_df.shape

Processing customers: 100%|██████████| 1/1 [00:03<00:00,  3.47s/it]
[32m2025-02-19 13:57:50.277[0m | [1mINFO    [0m | [36mrecsys.core.features.interaction_features[0m:[36mgenerate_interaction_data[0m:[36m167[0m - [1mGenerated 135019 interactions[0m


(135019, 5)

In [40]:
# Preview the generated interactions.
interaction_df.head()

t_dat,customer_id,article_id,interaction_score,prev_article_id
i64,str,str,i64,str
-336960000,"""00b203a32faa3d007dba198ef27c15…","""710877002""",0,"""START"""
-335520000,"""00b203a32faa3d007dba198ef27c15…","""710877002""",0,"""710877002"""
-331560000,"""00b203a32faa3d007dba198ef27c15…","""568601018""",0,"""710877002"""
-330480000,"""00b203a32faa3d007dba198ef27c15…","""372628020""",0,"""568601018"""
-330120000,"""00b203a32faa3d007dba198ef27c15…","""589328001""",0,"""372628020"""


Score distributions:

Here is what each score means:
- `0` : No interaction between a customer and an item
- `1` : A customer clicked an item
- `2` : A customer bought an item

In [41]:
# Tally how many interactions fall under each score (0 = none, 1 = click, 2 = purchase).
score_tally = pl.col("interaction_score").count().alias("total_interactions")
interaction_df.group_by("interaction_score").agg(score_tally)

interaction_score,total_interactions
i64,u32
0,72969
1,38251
2,23799


# Upload feature group

### Customers

In [42]:
# Final look at the sampled customer features before upload.
customers_df.head()

customer_id,club_member_status,age,postal_code,age_group
str,str,f64,str,str
"""6f173c0d871007b835f77d16232bae…","""ACTIVE""",61.0,"""aacae4bc0b6cf9c6708b05f234512a…","""56-65"""
"""1ae5ddce4cdf0f21ae41379a6edad9…","""ACTIVE""",23.0,"""2c29ae653a9282cce4151bd87643c9…","""19-25"""
"""b903a2e7b1fba1924a2332c1b69fc4…","""ACTIVE""",25.0,"""612ae24e8d0ea791da6b5b4de91714…","""19-25"""
"""f17d07ee3b52dc06ba23e5dbd0621a…","""ACTIVE""",33.0,"""caaf654c6a82da724a42162a80f037…","""26-35"""
"""27e13eb33e457d98f0f7fd086231a4…","""ACTIVE""",35.0,"""f919da058a4a967dd830c3cbb3c657…","""26-35"""


In [43]:
# Upload the sampled customer features through the BigQuery client, passing the
# frame via the `customers_df` keyword. The debug log below shows per-column type
# conversion happening inside the client before the load.
logger.info("Uploading 'customers' Feature to BigQuery.")
bq_client.load_features(customers_df=customers_df)
logger.info("✅ Uploaded 'customers' Feature to BigQuery!")

[32m2025-02-19 13:58:56.351[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mUploading 'customers' Feature to BigQuery.[0m
[32m2025-02-19 13:58:56.603[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted customer_id to STRING[0m
[32m2025-02-19 13:58:56.603[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted club_member_status to STRING[0m
[32m2025-02-19 13:58:56.605[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted age to FLOAT[0m
[32m2025-02-19 13:58:56.605[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted postal_code to STRING[0m
[32m2025-02-19 13:58:56.606[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1

### Articles

In [44]:
# Upload the article features (including the embeddings computed earlier) through
# the BigQuery client, passing the frame via the `articles_df` keyword.
logger.info("Uploading 'articles' Feature to BigQuery.")
bq_client.load_features(articles_df=articles_df)
logger.info("✅ Uploaded 'articles' Feature to BigQuery!")

[32m2025-02-19 13:59:34.055[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mUploading 'articles' Feature to BigQuery.[0m
[32m2025-02-19 13:59:34.418[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted article_id to STRING[0m
[32m2025-02-19 13:59:34.420[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted prod_name to STRING[0m
[32m2025-02-19 13:59:34.421[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted prod_name_length to INTEGER[0m
[32m2025-02-19 13:59:34.423[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted product_type_name to STRING[0m
[32m2025-02-19 13:59:34.424[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m 

### Transactions

In [45]:
# Final look at the sampled transaction features before upload.
transactions_df.head()

t_dat,customer_id,article_id,price,sales_channel_id,year,month,day,day_of_week
i64,str,str,f64,i64,i32,i8,i8,i8
0,"""25f0bcb3b9e6fffee7541b02d21e48…","""541518004""",0.030492,1,2018,9,20,4
0,"""25f0bcb3b9e6fffee7541b02d21e48…","""654564002""",0.016932,1,2018,9,20,4
0,"""25f0bcb3b9e6fffee7541b02d21e48…","""372860002""",0.013542,1,2018,9,20,4
0,"""4e82363f3c5a710922073cdf626309…","""673285001""",0.025407,1,2018,9,20,4
0,"""4e82363f3c5a710922073cdf626309…","""564314018""",0.025407,1,2018,9,20,4


In [46]:
# Upload the sampled transaction features through the BigQuery client, passing the
# frame via the `transactions_df` keyword.
logger.info("Uploading 'transactions' Feature to BigQuery.")
bq_client.load_features(transactions_df=transactions_df)
logger.info("✅ Uploaded 'transactions' Feature to BigQuery!")

[32m2025-02-19 14:02:02.472[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mUploading 'transactions' Feature to BigQuery.[0m
[32m2025-02-19 14:02:02.508[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted t_dat to INTEGER[0m
[32m2025-02-19 14:02:02.509[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted customer_id to STRING[0m
[32m2025-02-19 14:02:02.510[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted article_id to STRING[0m
[32m2025-02-19 14:02:02.510[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted price to FLOAT[0m
[32m2025-02-19 14:02:02.511[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConvert

### Interactions

In [47]:
# Final look at the interaction data before upload.
interaction_df.head(3)

t_dat,customer_id,article_id,interaction_score,prev_article_id
i64,str,str,i64,str
-336960000,"""00b203a32faa3d007dba198ef27c15…","""710877002""",0,"""START"""
-335520000,"""00b203a32faa3d007dba198ef27c15…","""710877002""",0,"""710877002"""
-331560000,"""00b203a32faa3d007dba198ef27c15…","""568601018""",0,"""710877002"""


In [49]:
# Upload the interaction data through the BigQuery client. Note the keyword is
# `interactions_df` (plural) while the local variable is `interaction_df`.
logger.info("Uploading 'interactions' Feature to BigQuery.")
bq_client.load_features(interactions_df=interaction_df)
logger.info("✅ Uploaded 'interactions' Feature to BigQuery!")

[32m2025-02-19 14:02:28.528[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mUploading 'interactions' Feature to BigQuery.[0m
[32m2025-02-19 14:02:28.552[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted t_dat to INTEGER[0m
[32m2025-02-19 14:02:28.555[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted customer_id to STRING[0m
[32m2025-02-19 14:02:28.557[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted article_id to STRING[0m
[32m2025-02-19 14:02:28.558[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted interaction_score to INTEGER[0m
[32m2025-02-19 14:02:28.560[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [

# Compute ranking dataset

The last step is to compute the ranking dataset used to train the scoring/ranking model from the feature groups we've just created:


In [51]:
# Initialise the feature-store client module and obtain a handle to the feature
# store; `fos` is what the next cell passes to `get_feature_views`.
fs_client.initialize()
fos = fs_client.get_client()

[32m2025-02-19 14:04:31.717[0m | [1mINFO    [0m | [36mrecsys.gcp.feature_store.client[0m:[36mget_client[0m:[36m32[0m - [1mRetrieving Feature Store from us-central1/recsys-dev-gonzo-2/recsys_feature_store_dev[0m
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [53]:
# Fetch the feature views; four values are returned, of which the transactions,
# articles and customers views are kept and the fourth is discarded here.
trans_fv, articles_fv, customers_fv, _ = fs_client.get_feature_views(fos)

In [54]:
# Explicitly disable tokenizers parallelism before computing the ranking dataset:
# the feature-store cell above triggered the huggingface/tokenizers fork warning,
# which asks for this variable to be set. `os` is already imported in the first
# cell, so it is not re-imported here.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Build the ranking dataset from the transactions, articles and customers feature views.
ranking_df = compute_rankings_dataset(trans_fv, articles_fv, customers_fv)

[32m2025-02-19 14:06:40.134[0m | [1mINFO    [0m | [36mrecsys.core.features.ranking_features[0m:[36mcompute_rankings_dataset[0m:[36m60[0m - [1mComputing rankings dataset[0m
[32m2025-02-19 14:06:40.135[0m | [1mINFO    [0m | [36mrecsys.core.features.ranking_features[0m:[36mcompute_rankings_dataset[0m:[36m63[0m - [1mFetching transactions data...[0m
[32m2025-02-19 14:06:45.415[0m | [1mINFO    [0m | [36mrecsys.core.features.ranking_features[0m:[36mcompute_rankings_dataset[0m:[36m69[0m - [1mFetching articles data...[0m
[32m2025-02-19 14:07:22.407[0m | [1mINFO    [0m | [36mrecsys.core.features.ranking_features[0m:[36mcompute_rankings_dataset[0m:[36m75[0m - [1mFetching customers data...[0m
[32m2025-02-19 14:07:25.289[0m | [1mINFO    [0m | [36mrecsys.core.features.ranking_features[0m:[36mcompute_rankings_dataset[0m:[36m100[0m - [1mGenerating negative samples...[0m
[32m2025-02-19 14:07:25.299[0m | [1mINFO    [0m | [36mrecsys.core.fe

In [55]:
# Preview the first three rows of the ranking dataset.
ranking_df.head(3)

customer_id,article_id,age,label,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,prod_name_length
str,str,f64,i32,i64,str,i64,str,str,i64,str,str,str,i64,str,i64,str,i64,str,str,str,i64,str,i64,str,i64,str,i64
"""7d81802a361690bf74481f6f5bf684…","""818754003""",23.0,1,818754,"""Tanya mockneck LS""",254,"""Top""","""Garment Upper body""",1010016,"""Solid""","""52""","""Pink""",2,"""Medium Dusty""",4,"""Pink""",1643,"""Basic 1""","""D""","""Divided""",2,"""Divided""",51,"""Divided Basics""",1002,"""Jersey Basic""",17
"""6ef7673df03902dbbf2bc384734525…","""842360001""",36.0,1,842360,"""Veton dress party""",265,"""Dress""","""Garment Full body""",1010009,"""Glittering/Metallic""","""9""","""Black""",4,"""Dark""",5,"""Black""",1344,"""Dresses""","""D""","""Divided""",2,"""Divided""",53,"""Divided Collection""",1013,"""Dresses Ladies""",17
"""4cf36478ababf185ee5d5714e692ba…","""703843003""",22.0,1,703843,"""Demi l/s""",254,"""Top""","""Garment Upper body""",1010017,"""Stripe""","""9""","""Black""",4,"""Dark""",5,"""Black""",1636,"""Jersey fancy""","""A""","""Ladieswear""",1,"""Ladieswear""",15,"""Womens Everyday Collection""",1005,"""Jersey Fancy""",8


In [56]:
# Count rows per `label` value to check the balance between the two classes.
ranking_df.get_column("label").value_counts()

label,count
i32,u32
0,203760
1,20376


As the ranking dataset was computed based on articles, customers, and transactions Feature Views, we can reflect this lineage in the ranking Feature View.

In [57]:
# Upload the ranking dataset through the BigQuery client. Note the keyword is
# `rankings_df` (plural) while the local variable is `ranking_df`.
logger.info("Uploading 'rankings' Feature to BigQuery.")
bq_client.load_features(rankings_df=ranking_df)
logger.info("✅ Uploaded 'rankings' Feature to BigQuery!")

[32m2025-02-19 14:07:42.360[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mUploading 'rankings' Feature to BigQuery.[0m
[32m2025-02-19 14:07:42.481[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted customer_id to STRING[0m
[32m2025-02-19 14:07:42.485[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted article_id to STRING[0m
[32m2025-02-19 14:07:42.485[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted age to FLOAT64[0m
[32m2025-02-19 14:07:42.488[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted label to INTEGER[0m
[32m2025-02-19 14:07:42.491[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m59[0m - [34m[1mConverted p