# Feature Pipeline: Computing features

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Resolve the parent of the current working directory and add it to the import
# path, presumably so the `recsys` imports below resolve.
# NOTE(review): this depends on the kernel's working directory being one level
# below the repository root — confirm when running from elsewhere.
repo_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(repo_path)

import torch
import warnings
import polars as pl

from pprint import pprint
from loguru import logger
from sentence_transformers import SentenceTransformer

# Config
from recsys.config import settings

# BigQuery
from recsys.gcp.bigquery import client as bq_client

# Feature Store
from recsys.gcp.feature_store import client as fs_client

# Features
from recsys.core.features.article_features import (
    compute_features_articles,
    generate_embeddings_for_dataframe,
)
from recsys.core.features.ranking_features import compute_rankings_dataset
from recsys.core.features.customer_features import (
    DatasetSampler,
    compute_features_customers,
)
from recsys.core.features.interaction_features import generate_interaction_data
from recsys.core.features.transaction_features import compute_features_transactions

# Raw Data
from recsys.data.sources import h_and_m_data

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings from the
# imported libraries are hidden too; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Data-source switch used by the loading cells below: when True the raw tables
# come from `h_and_m_data.extract_*_df()`, when False they are read from the
# CSV files under `{repo_path}/data/`. It is also passed to compute_features_articles.
ONLINE: bool = True

# 🗄️ Articles data

The **article_id** and **product_code** serve different purposes in the context of H&M's product database:

- **Article ID**: This is a unique identifier assigned to each individual article within the database. It is typically used for internal tracking and management purposes. Each distinct item or variant of a product (e.g., different sizes or colors) would have its own unique article_id.

- **Product Code**: This identifier is associated with a specific product or style rather than an individual article. It represents a broader category or type of product within H&M's inventory, so it is not unique per article: multiple articles share the same product code if they belong to the same product line or style.

Both are identifiers, but they work at different levels: the article_id is unique to an individual item, whereas the product_code is shared by every variant of a product or style.

Here is an example:

**Product: Basic T-Shirt**

- **Product Code:** TS001

- **Article IDs:**
    - Article ID: 1001 (Size: Small, Color: White)
    - Article ID: 1002 (Size: Medium, Color: White)
    - Article ID: 1003 (Size: Large, Color: White)
    - Article ID: 1004 (Size: Small, Color: Black)
    - Article ID: 1005 (Size: Medium, Color: Black)

In this example, "TS001" is the product code for the basic t-shirt style. Each variant of this t-shirt (e.g., different sizes and colors) has its own unique article_id.



In [3]:
# Pull the raw articles table from the project data source when ONLINE,
# otherwise read the CSV under the repository's data/ folder.
articles_df = (
    h_and_m_data.extract_articles_df()
    if ONLINE
    else pl.read_csv(source=f"{repo_path}/data/articles.csv")
)

articles_df.shape

(105542, 25)

In [4]:
# Preview the first three raw article rows before any feature engineering.
articles_df.head(3)

article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
i64,i64,str,i64,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,str,i64,str,i64,str,i64,str,str
108775015,108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"
108775044,108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",10,"""White""",3,"""Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"
108775051,108775,"""Strap top (1)""",253,"""Vest top""","""Garment Upper body""",1010017,"""Stripe""",11,"""Off White""",1,"""Dusty Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"


## Articles feature engineering

In [5]:
# Run the article feature engineering step. Per the recorded outputs the frame
# goes from 25 to 27 columns (detail_desc disappears; prod_name_length,
# article_description and image_url appear) and article_id changes from i64 to str.
# NOTE(review): the result overwrites articles_df, so re-running this cell feeds
# already-processed data back in — confirm compute_features_articles tolerates
# that, or re-run the load cell first.
articles_df = compute_features_articles(articles_df, ONLINE, repo_path)
articles_df.shape

[32m2025-02-27 11:00:02.581[0m | [1mINFO    [0m | [36mrecsys.core.features.article_features[0m:[36mcompute_features_articles[0m:[36m102[0m - [1mComputing article features...[0m
[32m2025-02-27 11:00:02.856[0m | [1mINFO    [0m | [36mrecsys.core.features.article_features[0m:[36mcompute_features_articles[0m:[36m126[0m - [1mArticle feature computation complete[0m


(105542, 27)

In [6]:
# Preview the engineered frame; the new columns are the last three in the table.
articles_df.head(3)

article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,prod_name_length,article_description,image_url
str,i64,str,i64,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,str,i64,str,i64,str,i64,str,u32,str,str
"""108775015""",108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",9,"""Strap top - Vest top in Garmen…","""/Users/galcala/Desktop/Github/…"
"""108775044""",108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",10,"""White""",3,"""Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",9,"""Strap top - Vest top in Garmen…","""/Users/galcala/Desktop/Github/…"
"""108775051""",108775,"""Strap top (1)""",253,"""Vest top""","""Garment Upper body""",1010017,"""Stripe""",11,"""Off White""",1,"""Dusty Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",13,"""Strap top (1) - Vest top in Ga…","""/Users/galcala/Desktop/Github/…"


## Create embeddings from the articles description

In [7]:
# Log the first three article descriptions, numbered from 1.
preview_descriptions = articles_df["article_description"].head(n=3)
for position, description in enumerate(preview_descriptions, start=1):
    logger.info(f"Item {position}:\n{description}")

[32m2025-02-27 11:00:20.670[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 1:
Strap top - Vest top in Garment Upper body
 Appearance: Solid
 Color: Dark Black 9
 Category: Ladieswear Womens Everyday Basics Jersey Basic
 Details: Jersey top with narrow shoulder straps.[0m
[32m2025-02-27 11:00:20.670[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 2:
Strap top - Vest top in Garment Upper body
 Appearance: Solid
 Color: Light White 10
 Category: Ladieswear Womens Everyday Basics Jersey Basic
 Details: Jersey top with narrow shoulder straps.[0m
[32m2025-02-27 11:00:20.670[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 3:
Strap top (1) - Vest top in Garment Upper body
 Appearance: Stripe
 Color: Dusty Light White 11
 Category: Ladieswear Womens Everyday Basics Jersey Basic
 Details: Jersey top with narrow shoulder straps.[0m


In [8]:
# Pick the best available torch backend: CUDA first, then Apple Metal (MPS),
# falling back to CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Fix: the message used to contain a stray "$" before the placeholder (a
# JavaScript template-literal habit), so the log read "Loading $<model id>".
logger.info(
    f"Loading {settings.FEATURES_EMBEDDING_MODEL_ID} embedding model to {device=}"
)

# Load embedding model from SentenceTransformers model registry
model = SentenceTransformer(settings.FEATURES_EMBEDDING_MODEL_ID, device=device)

[32m2025-02-27 11:00:23.439[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mLoading $all-MiniLM-L6-v2 embedding model to device='mps'[0m


In [9]:
# Encode the article_description column with the SentenceTransformer loaded
# above, using batches of 128. The preview below shows the result as a new
# `embeddings` column of type list[f64].
# NOTE(review): this overwrites articles_df; the recorded progress bar covers
# all 105542 rows, so expect this to be one of the slower cells.
articles_df = generate_embeddings_for_dataframe(
    articles_df, "article_description", model, batch_size=128
)

Generating embeddings...:   0%|          | 0/105542 [00:00<?, ?it/s]

Each article description is now represented by a numerical vector (an embedding) that we can feed to a model, as opposed to the raw string describing the item.

In [10]:
# Preview the frame again; the `embeddings` column is now the last one.
articles_df.head(3)

article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,prod_name_length,article_description,image_url,embeddings
str,i64,str,i64,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,str,i64,str,i64,str,i64,str,u32,str,str,list[f64]
"""108775015""",108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",9,"""Strap top - Vest top in Garmen…","""/Users/galcala/Desktop/Github/…","[-0.022105, 0.065497, … 0.011869]"
"""108775044""",108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",10,"""White""",3,"""Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",9,"""Strap top - Vest top in Garmen…","""/Users/galcala/Desktop/Github/…","[-0.011306, 0.069066, … 0.014419]"
"""108775051""",108775,"""Strap top (1)""",253,"""Vest top""","""Garment Upper body""",1010017,"""Stripe""",11,"""Off White""",1,"""Dusty Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",13,"""Strap top (1) - Vest top in Ga…","""/Users/galcala/Desktop/Github/…","[-0.033228, 0.077421, … 0.016747]"


In [None]:
# Show each description next to the embedding generated from it (first three rows).
articles_df[["article_description", "embeddings"]].head(3)

In [None]:
# Inspect the first image_url value in full; the table previews above truncate it.
articles_df["image_url"][0]

In [11]:
from html import escape

from IPython.display import HTML, display

# Render the images of the last 12 articles as a 6-column grid.
image_urls = articles_df["image_url"].tail(12).to_list()

# Escape each URL before placing it in the src attribute so quotes or
# ampersands in a path cannot break the generated markup. str() keeps a
# missing (None) URL from raising, matching the original f-string behaviour.
image_tags = "".join(
    f'<img src="{escape(str(url), quote=True)}" style="width: 100%; height: auto;">'
    for url in image_urls
)

# Fix: the container style used to read "gap: 10pxl max-width: 900px;", which
# is a single invalid declaration, so neither the gap nor the max-width applied.
grid_html = (
    '<div style="display: grid; grid-template-columns: repeat(6, 1fr); '
    'gap: 10px; max-width: 900px;">'
    f"{image_tags}</div>"
)

display(HTML(grid_html))

# 👯‍♀️ Customers Data

In [None]:
# Pull the raw customers table from the project data source when ONLINE,
# otherwise read the CSV under the repository's data/ folder.
customers_df = (
    h_and_m_data.extract_customers_df()
    if ONLINE
    else pl.read_csv(source=f"{repo_path}/data/customers.csv")
)

customers_df.shape

In [None]:
# Preview the first three raw customer rows.
customers_df.head(3)

In [None]:
# Count nulls per column before feature engineering; the step below is called
# with drop_null_age=True.
customers_df.null_count()

## Customers feature engineering

In [None]:
# NOTE(review): same preview as two cells above — nothing has modified
# customers_df in between, so this cell is redundant and can be removed.
customers_df.head(3)

In [None]:
# Run the customer feature engineering step with drop_null_age=True
# (presumably removing customers whose age is null — confirm against
# compute_features_customers). The shape shows how many rows remain.
# NOTE(review): the result overwrites customers_df, so the raw frame is only
# recoverable by re-running the load cell.
customers_df = compute_features_customers(customers_df, drop_null_age=True)
customers_df.shape

In [None]:
# Preview the customers frame after feature engineering.
customers_df.head(3)

# 🧾 Transactions Data

In [None]:
# Pull the raw transactions table from the project data source when ONLINE,
# otherwise read the CSV under the repository's data/ folder.
transactions_df = (
    h_and_m_data.extract_transactions_df()
    if ONLINE
    else pl.read_csv(source=f"{repo_path}/data/transactions_train.csv")
)

transactions_df.shape

In [None]:
# Preview the first three raw transaction rows.
transactions_df.head(3)

## Transactions feature engineering

In [None]:
# Look at the raw t_dat column before feature engineering.
# NOTE(review): presumably t_dat is the transaction date from which the month
# features described below are derived — confirm.
transactions_df["t_dat"].head(3)

In [None]:
# Run the transaction feature engineering step; per the notebook text below this
# includes encoding the purchase month with sine and cosine.
# NOTE(review): the result overwrites transactions_df, so re-running this cell
# feeds already-processed data back in — confirm that is safe.
transactions_df = compute_features_transactions(transactions_df)
transactions_df.shape

The time of year a purchase was made should be a strong predictor, as seasonality is a big factor in fashion purchases. Here, you will use the month of the purchase as a feature. Since this is a cyclical feature (January is as close to December as it is to February), you'll map each month to the unit circle using sine and cosine.

Thus, the features of the transactions DataFrame look as follows:

In [None]:
# Preview the transactions frame after feature engineering.
transactions_df.head(3)

We don't want to work with ~30 million transactions in this series, as everything would take too long to run. Thus, we create a subset of the original dataset by randomly sampling customers from the customers dataset and keeping only their transactions.

In [None]:
# Down-sample customers to the configured size and keep the matching
# transactions; both frames are replaced by their sampled versions.
sampler = DatasetSampler(size=settings.CUSTOMER_DATA_SIZE)
dataset_subset = sampler.sample(
    transactions_df=transactions_df,
    customers_df=customers_df,
)

customers_df, transactions_df = (
    dataset_subset["customers"],
    dataset_subset["transactions"],
)

In [None]:
# Size of the transactions frame after sampling.
transactions_df.shape

Some of the remaining customers:

In [None]:
# Log up to ten distinct customer IDs that remain after sampling.
sampled_customer_ids = transactions_df["customer_id"].unique().head(10)
for sampled_id in sampled_customer_ids:
    logger.info(f"Logging customer ID: {sampled_id}")

# 🤳🏻 Interaction data

To train our models, we need more than just the transactions DataFrame. We need positive samples that signal whether a customer clicked or bought an item, but we also need negative samples that signal no interactions between a customer and an item.

In [None]:
# Build the interaction dataset from the sampled transactions. Per the
# surrounding notebook text it holds positive and negative samples, scored
# 0 (no interaction), 1 (click) or 2 (purchase).
interaction_df = generate_interaction_data(transactions_df)
interaction_df.shape

In [None]:
# Preview the first rows of the generated interactions.
interaction_df.head()

Score distributions:

Here is what each score means:
- `0` : No interaction between a customer and an item
- `1` : A customer clicked an item
- `2` : A customer bought an item

In [None]:
# Count how many rows carry each interaction score.
score_count = pl.col("interaction_score").count().alias("total_interactions")
interaction_df.group_by("interaction_score").agg(score_count)

# Upload feature group

### Customers

In [None]:
# Final look at the customers frame that is about to be uploaded.
customers_df.head()

In [None]:
# Push the customers frame to BigQuery through the project's bq_client. The two
# log lines bracket the call, so the success message only appears when
# load_features returns without raising.
logger.info("Uploading 'customers' Feature to BigQuery.")
bq_client.load_features(customers_df=customers_df)
logger.info("✅ Uploaded 'customers' Feature to BigQuery!")

### Articles

In [12]:
# List the column names and dtypes of the articles frame before uploading it.
articles_df.collect_schema()

Schema([('article_id', String),
        ('product_code', Int64),
        ('prod_name', String),
        ('product_type_no', Int64),
        ('product_type_name', String),
        ('product_group_name', String),
        ('graphical_appearance_no', Int64),
        ('graphical_appearance_name', String),
        ('colour_group_code', Int64),
        ('colour_group_name', String),
        ('perceived_colour_value_id', Int64),
        ('perceived_colour_value_name', String),
        ('perceived_colour_master_id', Int64),
        ('perceived_colour_master_name', String),
        ('department_no', Int64),
        ('department_name', String),
        ('index_code', String),
        ('index_name', String),
        ('index_group_no', Int64),
        ('index_group_name', String),
        ('section_no', Int64),
        ('section_name', String),
        ('garment_group_no', Int64),
        ('garment_group_name', String),
        ('prod_name_length', UInt32),
        ('article_description', String),


In [20]:
# Push the articles frame to BigQuery through the project's bq_client. The
# recorded DEBUG lines show the client's convert_types step logging the
# BigQuery type chosen for each column. The success message only appears when
# load_features returns without raising.
logger.info("Uploading 'articles' Feature to BigQuery.")
bq_client.load_features(articles_df=articles_df)
logger.info("✅ Uploaded 'articles' Feature to BigQuery!")

[32m2025-02-27 14:07:06.616[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mUploading 'articles' Feature to BigQuery.[0m
[32m2025-02-27 14:07:06.908[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m60[0m - [34m[1mConverted article_id to STRING[0m
[32m2025-02-27 14:07:06.910[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m60[0m - [34m[1mConverted product_code to INTEGER[0m
[32m2025-02-27 14:07:06.912[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m60[0m - [34m[1mConverted prod_name to STRING[0m
[32m2025-02-27 14:07:06.913[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m60[0m - [34m[1mConverted product_type_no to INTEGER[0m
[32m2025-02-27 14:07:06.916[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m60[0m - [3

### Transactions

In [None]:
# Final look at the sampled transactions frame that is about to be uploaded.
transactions_df.head()

In [None]:
# Push the sampled transactions frame to BigQuery through the project's
# bq_client; the success message only appears when load_features returns
# without raising.
logger.info("Uploading 'transactions' Feature to BigQuery.")
bq_client.load_features(transactions_df=transactions_df)
logger.info("✅ Uploaded 'transactions' Feature to BigQuery!")

### Interactions

In [None]:
# Final look at the interactions frame that is about to be uploaded.
interaction_df.head(3)

In [None]:
# Push the interactions frame to BigQuery through the project's bq_client. Note
# the keyword is `interactions_df` while the local variable is `interaction_df`.
# The success message only appears when load_features returns without raising.
logger.info("Uploading 'interactions' Feature to BigQuery.")
bq_client.load_features(interactions_df=interaction_df)
logger.info("✅ Uploaded 'interactions' Feature to BigQuery!")

# Compute ranking dataset

The last step is to compute the ranking dataset used to train the scoring/ranking model from the feature groups we've just created:


In [38]:
# Initialise the project's feature-store client and obtain a handle to the
# store; the recorded log line shows get_client retrieving the Feature Store.
fs_client.initialize()
fos = fs_client.get_client()

[32m2025-02-27 14:34:59.422[0m | [1mINFO    [0m | [36mrecsys.gcp.feature_store.client[0m:[36mget_client[0m:[36m31[0m - [1mRetrieving Feature Store from us-central1/recsys-dev-gonzo-2/recsys_feature_store_dev[0m


In [37]:
# Fetch the feature views from the store handle; only the transactions,
# articles and customers views are used, and the fourth returned value is discarded.
# NOTE(review): the recorded execution counts show this cell (37) ran before the
# cell that defines `fos` (38). Run the notebook top to bottom on a fresh kernel
# to confirm it does not rely on leftover state.
trans_fv, articles_fv, customers_fv, _ = fs_client.get_feature_views(fos)

In [39]:
# Fix: removed the duplicate mid-notebook `import os`; the module is already
# imported in the setup cell at the top of the notebook.

# NOTE(review): presumably set to stop the Hugging Face tokenizers library from
# using (and warning about) parallelism while the dataset is computed — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Build the ranking dataset from the three feature views. The recorded log
# shows it fetching transactions, articles and customers data and then
# generating negative samples.
ranking_df = compute_rankings_dataset(trans_fv, articles_fv, customers_fv)

[32m2025-02-27 14:35:00.387[0m | [1mINFO    [0m | [36mrecsys.core.features.ranking_features[0m:[36mcompute_rankings_dataset[0m:[36m61[0m - [1mComputing rankings dataset[0m
[32m2025-02-27 14:35:00.388[0m | [1mINFO    [0m | [36mrecsys.core.features.ranking_features[0m:[36mcompute_rankings_dataset[0m:[36m64[0m - [1mFetching transactions data...[0m
[32m2025-02-27 14:35:04.006[0m | [1mINFO    [0m | [36mrecsys.core.features.ranking_features[0m:[36mcompute_rankings_dataset[0m:[36m69[0m - [1mFetching articles data...[0m
[32m2025-02-27 14:35:36.825[0m | [1mINFO    [0m | [36mrecsys.core.features.ranking_features[0m:[36mcompute_rankings_dataset[0m:[36m74[0m - [1mFetching customers data...[0m
[32m2025-02-27 14:35:38.614[0m | [1mINFO    [0m | [36mrecsys.core.features.ranking_features[0m:[36mcompute_rankings_dataset[0m:[36m96[0m - [1mGenerating negative samples...[0m
[32m2025-02-27 14:35:38.620[0m | [1mINFO    [0m | [36mrecsys.core.fea

In [40]:
# Preview the ranking dataset: customer and article keys, age, label and the
# article attributes joined in.
ranking_df.head(3)

customer_id,article_id,age,label,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,prod_name_length
str,str,f64,i32,i64,str,i64,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,str,i64,str,i64,str,i64,str,i64
"""1e861829920faec106799ea4d401ff…","""831998007""",17.0,1,831998,"""micro tee""",255,"""T-shirt""","""Garment Upper body""",1010014,"""Placement print""",10,"""White""",3,"""Light""",9,"""White""",1640,"""Tops Fancy Jersey""","""D""","""Divided""",2,"""Divided""",53,"""Divided Collection""",1005,"""Jersey Fancy""",9
"""602a7857da345752e6a19c1575fd24…","""719348001""",22.0,1,719348,"""Bellora""",259,"""Shirt""","""Garment Upper body""",1010016,"""Solid""",11,"""Off White""",1,"""Dusty Light""",9,"""White""",1522,"""Blouse""","""A""","""Ladieswear""",1,"""Ladieswear""",15,"""Womens Everyday Collection""",1010,"""Blouses""",7
"""6acdcd51ee49b6771fb1104a07c0b4…","""608776002""",28.0,1,608776,"""Scallop 5p Socks""",302,"""Socks""","""Socks & Tights""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",3611,"""Shopbasket Socks""","""B""","""Lingeries/Tights""",1,"""Ladieswear""",62,"""Womens Nightwear, Socks & Tigh""",1021,"""Socks and Tights""",16


In [41]:
# Check the class balance of the label column (the recorded run shows an even
# split between 1 and 0).
ranking_df["label"].value_counts()

label,count
i32,u32
1,20376
0,20376


In [42]:
# Customer/article keys, the customer's age and the training label.
key_and_label_columns = ["customer_id", "age", "article_id", "label"]

# Human-readable categorical article attributes (the *_name columns).
article_attribute_columns = [
    "product_type_name",
    "product_group_name",
    "graphical_appearance_name",
    "colour_group_name",
    "perceived_colour_value_name",
    "perceived_colour_master_name",
    "department_name",
    "index_name",
    "index_group_name",
    "section_name",
    "garment_group_name",
]

# Columns kept for the ranking feature upload, in their final order. The
# numeric *_no / *_code / *_id columns, prod_name, product_code and
# prod_name_length from ranking_df are left out.
columns_to_keep = key_and_label_columns + article_attribute_columns

In [43]:
# Keep only the selected columns, in the order given by columns_to_keep.
filtered_ranking_df = ranking_df.select(columns_to_keep)

As the ranking dataset was computed based on articles, customers, and transactions Feature Views, we can reflect this lineage in the ranking Feature View.

In [44]:
# Push the filtered ranking frame to BigQuery through the project's bq_client.
# The recorded DEBUG lines show the client's convert_types step logging the
# BigQuery type chosen for each column. The success message only appears when
# load_features returns without raising.
logger.info("Uploading 'rankings' Feature to BigQuery.")
bq_client.load_features(rankings_df=filtered_ranking_df)
logger.info("✅ Uploaded 'rankings' Feature to BigQuery!")

[32m2025-02-27 14:35:38.928[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mUploading 'rankings' Feature to BigQuery.[0m
[32m2025-02-27 14:35:38.953[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m60[0m - [34m[1mConverted customer_id to STRING[0m
[32m2025-02-27 14:35:38.954[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m60[0m - [34m[1mConverted age to FLOAT64[0m
[32m2025-02-27 14:35:38.955[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m60[0m - [34m[1mConverted article_id to STRING[0m
[32m2025-02-27 14:35:38.955[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m60[0m - [34m[1mConverted label to INTEGER[0m
[32m2025-02-27 14:35:38.956[0m | [34m[1mDEBUG   [0m | [36mrecsys.gcp.bigquery.client[0m:[36mconvert_types[0m:[36m60[0m - [34m[1mConverted p