# Feature Pipeline: Computing features

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Resolve the parent of the notebook's working directory and put it on the
# import path (presumably so the `recsys` imports below resolve — confirm).
repo_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append so re-running this cell does not pile up duplicate entries.
if repo_path not in sys.path:
    sys.path.append(repo_path)

import torch
import warnings
import polars as pl

from pprint import pprint
from loguru import logger
from sentence_transformers import SentenceTransformer

# Config
from recsys.config import settings

# BigQuery
from recsys.gcp.bigquery import client as bq_client

# Feature Store
from recsys.gcp.feature_store import client as fs_client

# Features
from recsys.core.features.article_features import (
    compute_features_articles,
    generate_embeddings_for_dataframe,
)
from recsys.core.features.ranking_features import compute_rankings_dataset
from recsys.core.features.customer_features import (
    DatasetSampler,
    compute_features_customers,
)
from recsys.core.features.interaction_features import generate_interaction_data
from recsys.core.features.transaction_features import compute_features_transactions

# Raw Data
from recsys.data.sources import h_and_m_data

# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
# Data-source toggle: when True the raw tables are pulled through
# `h_and_m_data.extract_*_df()`; when False they are read from CSV files under
# `{repo_path}/data/`. The flag is also forwarded to the articles and
# transactions feature functions.
ONLINE: bool = False

# 🗄️ Articles data

The **article_id** and **product_code** serve different purposes in the context of H&M's product database:

- **Article ID**: This is a unique identifier assigned to each individual article within the database. It is typically used for internal tracking and management purposes. Each distinct item or variant of a product (e.g., different sizes or colors) would have its own unique article_id.

- **Product Code**: This is also a unique identifier, but it is associated with a specific product or style rather than individual articles. It represents a broader category or type of product within H&M's inventory. Multiple articles may share the same product code if they belong to the same product line or style.

While both are unique identifiers, the article_id is specific to individual items, whereas the product_code represents a broader category or style of product.

Here is an example:

**Product: Basic T-Shirt**

- **Product Code:** TS001

- **Article IDs:**
    - Article ID: 1001 (Size: Small, Color: White)
    - Article ID: 1002 (Size: Medium, Color: White)
    - Article ID: 1003 (Size: Large, Color: White)
    - Article ID: 1004 (Size: Small, Color: Black)
    - Article ID: 1005 (Size: Medium, Color: Black)

In this example, "TS001" is the product code for the basic t-shirt style. Each variant of this t-shirt (e.g., different sizes and colors) has its own unique article_id.



In [None]:
# Load the raw articles table: remote extraction when ONLINE, local CSV otherwise.
articles_df = (
    h_and_m_data.extract_articles_df()
    if ONLINE
    else pl.read_csv(source=f"{repo_path}/data/articles.csv")
)

articles_df.shape

In [None]:
articles_df.head(3)

## Articles feature engineering

In [None]:
# Replace the raw articles frame with its feature-engineered version.
# NOTE(review): the result overwrites `articles_df`, so re-running this cell
# feeds already-processed data back in — reload the raw table first.
articles_df = compute_features_articles(articles_df, ONLINE, repo_path)
articles_df.shape

In [None]:
articles_df.head(3)

## Create embeddings from the articles description

In [None]:
# Log the first three article descriptions, numbered from 1.
sample_descriptions = articles_df["article_description"].head(n=3)
for i, desc in enumerate(sample_descriptions):
    logger.info(f"Item {i + 1}:\n{desc}")

In [None]:
# Pick the compute device in order of preference: CUDA, then the MPS backend,
# and finally the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Fixed: the message previously contained a stray `$` before the placeholder,
# which printed a literal dollar sign in front of the model id.
logger.info(
    f"Loading {settings.FEATURES_EMBEDDING_MODEL_ID} embedding model to {device=}"
)

# Load embedding model from SentenceTransformers model registry
model = SentenceTransformer(settings.FEATURES_EMBEDDING_MODEL_ID, device=device)

In [None]:
# Embed the `article_description` column with the loaded model, passing a
# batch size of 128. The returned frame replaces `articles_df`; later cells
# read an `embeddings` column from it.
articles_df = generate_embeddings_for_dataframe(
    articles_df, "article_description", model, batch_size=128
)

For each article description, we now have a numerical vector that we can feed to a model, as opposed to a raw string describing the article.

In [None]:
articles_df.head(3)

In [None]:
articles_df[["article_description", "embeddings"]].head(3)

In [None]:
articles_df["image_url"][0]

In [None]:
from IPython.display import HTML, display

# Render the last 12 article images as a six-column grid.
image_urls = articles_df["image_url"].tail(12).to_list()

# Fixed: the inline style previously read `gap: 10pxl max-width: 900px;`
# (missing `;` and a stray `l`), which made the browser drop both the gap and
# the max-width declarations.
grid_style = (
    "display: grid; grid-template-columns: repeat(6, 1fr); "
    "gap: 10px; max-width: 900px;"
)
image_tags = "".join(
    f'<img src="{url}" style="width: 100%; height: auto;">' for url in image_urls
)
grid_html = f'<div style="{grid_style}">{image_tags}</div>'

display(HTML(grid_html))

# 👯‍♀️ Customers Data

In [None]:
# Load the raw customers table: remote extraction when ONLINE, local CSV otherwise.
customers_df = (
    h_and_m_data.extract_customers_df()
    if ONLINE
    else pl.read_csv(source=f"{repo_path}/data/customers.csv")
)

customers_df.shape

In [None]:
customers_df.head(3)

In [None]:
customers_df.null_count()

## Customers feature engineering

In [None]:
customers_df.head(3)

In [None]:
# Replace the raw customers frame with its feature-engineered version.
# `drop_null_age=True` presumably removes customers without an age — confirm
# against `compute_features_customers`.
customers_df = compute_features_customers(customers_df, drop_null_age=True)
customers_df.shape

In [None]:
customers_df.head(3)

# 🧾 Transactions Data

In [None]:
# Load the raw transactions table: remote extraction when ONLINE, local CSV otherwise.
transactions_df = (
    h_and_m_data.extract_transactions_df()
    if ONLINE
    else pl.read_csv(source=f"{repo_path}/data/transactions_train.csv")
)

transactions_df.shape

In [None]:
transactions_df.head(3)

## Transactions feature engineering

In [None]:
transactions_df["t_dat"].head(3)

In [None]:
# Replace the raw transactions frame with its feature-engineered version.
# NOTE(review): the result overwrites `transactions_df`, so re-running this
# cell feeds already-processed data back in — reload the raw table first.
transactions_df = compute_features_transactions(transactions_df, ONLINE)
transactions_df.shape

The time of the year a purchase was made should be a strong predictor, as seasonality plays a big factor in fashion purchases. Here, you will use the month of the purchase as a feature. Since this is a cyclical feature (January is as close to December as it is to February), you'll map each month to the unit circle using sine and cosine.

Thus, the features of the transactions DataFrame look as follows:

In [None]:
transactions_df.head(3)

We don't want to work with all ~30 million transactions in this series, as everything would take too long to run. Instead, we create a subset of the original dataset by randomly sampling customers from the customers dataset and keeping only their transactions.

In [None]:
# Down-sample customers to the configured size and keep only their transactions.
# NOTE(review): no seed is passed here; whether the sample is reproducible
# depends on `DatasetSampler` — confirm.
sampler = DatasetSampler(size=settings.CUSTOMER_DATA_SIZE)
dataset_subset = sampler.sample(
    customers_df=customers_df,
    transactions_df=transactions_df,
)

# Both frames are replaced by their sampled counterparts.
customers_df, transactions_df = (
    dataset_subset["customers"],
    dataset_subset["transactions"],
)

In [None]:
transactions_df.shape

Some of the remaining customers:

In [None]:
# Log ten distinct customer IDs that remain after sampling.
sample_customer_ids = transactions_df["customer_id"].unique().head(10)
for customer_id in sample_customer_ids:
    logger.info(f"Logging customer ID: {customer_id}")

# 🤳🏻 Interaction data

To train our models, we need more than just the transactions DataFrame. We need positive samples that signal whether a customer clicked or bought an item, but we also need negative samples that signal no interactions between a customer and an item.

In [None]:
# Derive the interactions frame from the sampled transactions; the cells below
# read an `interaction_score` column from the result.
interaction_df = generate_interaction_data(transactions_df)
interaction_df.shape

In [None]:
interaction_df.head()

Score distributions:

Here is what each score means:
- `0` : No interaction between a customer and an item
- `1` : A customer clicked an item
- `2` : A customer bought an item

In [None]:
# Count the non-null rows for each interaction score.
score_counts = pl.col("interaction_score").count().alias("total_interactions")
interaction_df.group_by("interaction_score").agg(score_counts)

# Upload feature group

### Customers

In [None]:
customers_df.head()

In [None]:
# Sanity-check the configured BigQuery dataset ID before uploading.
# `settings` is already imported in the setup cell at the top of the notebook,
# so the redundant mid-notebook re-import was removed.
print(settings.BIGQUERY_DATASET_ID)

In [None]:
# Upload the sampled, feature-engineered customers frame via the BigQuery
# client, logging before and after the call.
logger.info("Uploading 'customers' Feature to BigQuery.")
bq_client.load_features(customers_df=customers_df)
logger.info("✅ Uploaded 'customers' Feature to BigQuery!")

### Articles

In [None]:
articles_df.collect_schema()

In [None]:
# Upload the articles frame (including the embeddings computed above) via the
# BigQuery client, logging before and after the call.
logger.info("Uploading 'articles' Feature to BigQuery.")
bq_client.load_features(articles_df=articles_df)
logger.info("✅ Uploaded 'articles' Feature to BigQuery!")

### Transactions

In [None]:
transactions_df.head()

In [None]:
# Upload the sampled, feature-engineered transactions frame via the BigQuery
# client, logging before and after the call.
logger.info("Uploading 'transactions' Feature to BigQuery.")
bq_client.load_features(transactions_df=transactions_df)
logger.info("✅ Uploaded 'transactions' Feature to BigQuery!")

### Interactions

In [None]:
interaction_df.head(3)

In [None]:
# Upload the interactions frame via the BigQuery client, logging before and
# after the call. Note the keyword is `interactions_df` (plural) while the
# local variable is `interaction_df`.
logger.info("Uploading 'interactions' Feature to BigQuery.")
bq_client.load_features(interactions_df=interaction_df)
logger.info("✅ Uploaded 'interactions' Feature to BigQuery!")

# Compute ranking dataset

The last step is to compute the ranking dataset used to train the scoring/ranking model from the feature groups we've just created:


In [None]:
# Initialise the feature-store client module, then fetch the client handle
# that the feature-view lookup in the next cell takes as its argument.
fs_client.initialize()
fos = fs_client.get_client()

In [41]:
# Fetch the feature views from the feature store. Four items are returned; the
# fourth is not used in this notebook and is discarded into `_`.
trans_fv, articles_fv, customers_fv, _ = fs_client.get_feature_views(fos)

In [None]:
# Turn off tokenizers parallelism before building the ranking dataset.
# NOTE(review): presumably silences the Hugging Face tokenizers fork warning —
# confirm. `os` is already imported in the setup cell at the top of the
# notebook, so the redundant re-import was removed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Build the ranking dataset from the transactions, articles and customers
# feature views.
ranking_df = compute_rankings_dataset(trans_fv, articles_fv, customers_fv)

In [None]:
ranking_df.head(3)

In [None]:
ranking_df.get_column("label").value_counts()

In [45]:
# Identifier, age and label columns come first ...
_key_columns = ["customer_id", "age", "article_id", "label"]

# ... followed by the descriptive article attribute columns.
_article_attribute_columns = [
    "product_type_name",
    "product_group_name",
    "graphical_appearance_name",
    "colour_group_name",
    "perceived_colour_value_name",
    "perceived_colour_master_name",
    "department_name",
    "index_name",
    "index_group_name",
    "section_name",
    "garment_group_name",
]

# Column order is preserved because it determines the order in the filtered frame.
columns_to_keep = [*_key_columns, *_article_attribute_columns]

In [46]:
# Keep only the columns listed in `columns_to_keep`, in that order.
# (Assumes `ranking_df` is a polars DataFrame, as the `get_column` call above suggests.)
filtered_ranking_df = ranking_df.select(columns_to_keep)

As the ranking dataset was computed based on articles, customers, and transactions Feature Views, we can reflect this lineage in the ranking Feature View.

In [None]:
# Upload the column-filtered ranking dataset via the BigQuery client, logging
# before and after the call.
logger.info("Uploading 'rankings' Feature to BigQuery.")
bq_client.load_features(rankings_df=filtered_ranking_df)
logger.info("✅ Uploaded 'rankings' Feature to BigQuery!")