## <span style="color:#ff5f27">👩🏻‍🔬 Feature Engineering </span>


In [2]:
import time

# Start the timer
notebook_start_time = time.time()

## <span style="color:#ff5f27">📝 Imports </span>

In [1]:
import polars as pl
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

from features.articles import (
    prepare_articles, 
    generate_embeddings_for_dataframe,
    get_image_url,
)
from features.customers import prepare_customers
from features.transactions import prepare_transactions
from features.ranking import compute_ranking_dataset  

## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [3]:
import hopsworks

project = hopsworks.login()

fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://snurran.hops.works/p/17527
Connected. Call `.close()` to terminate connection gracefully.


## <span style="color:#ff5f27">🗄️ Read Articles Data</span>

The **article_id** and **product_code** serve different purposes in the context of H&M's product database:

- **Article ID**: This is a unique identifier assigned to each individual article within the database. It is typically used for internal tracking and management purposes. Each distinct item or variant of a product (e.g., different sizes or colors) would have its own unique article_id.

- **Product Code**: This is also a unique identifier, but it is associated with a specific product or style rather than individual articles. It represents a broader category or type of product within H&M's inventory. Multiple articles may share the same product code if they belong to the same product line or style.

While both are unique identifiers, the article_id is specific to individual items, whereas the product_code represents a broader category or style of product.

Here is an example:

**Product: Basic T-Shirt**

- **Product Code:** TS001

- **Article IDs:**
    - Article ID: 1001 (Size: Small, Color: White)
    - Article ID: 1002 (Size: Medium, Color: White)
    - Article ID: 1003 (Size: Large, Color: White)
    - Article ID: 1004 (Size: Small, Color: Black)
    - Article ID: 1005 (Size: Medium, Color: Black)

In this example, "TS001" is the product code for the basic t-shirt style. Each variant of this t-shirt (e.g., different sizes and colors) has its own unique article_id.



In [4]:
# Start the timer
start_time = time.time()


# Load articles data
articles_df = pl.read_csv('https://repo.hops.works/dev/jdowling/h-and-m/articles.csv')
print(articles_df.shape)
articles_df.head(3)

(105542, 25)


article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
i64,i64,str,i64,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,str,i64,str,i64,str,i64,str,str
108775015,108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"
108775044,108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",10,"""White""",3,"""Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"
108775051,108775,"""Strap top (1)""",253,"""Vest top""","""Garment Upper body""",1010017,"""Stripe""",11,"""Off White""",1,"""Dusty Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"


In [5]:
# End the timer
end_time = time.time()

# Calculate and print the execution time
execution_time = end_time - start_time
print(f"⌛️ Execution time: {execution_time:.2f} seconds")

⌛️ Execution time: 2.66 seconds


In [6]:
# Check for NaNs
articles_df.null_count()

article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,416


## <span style="color:#ff5f27">👨🏻‍🏭 Articles Feature Engineering</span>


In [7]:
# Start the timer
start_time = time.time()


articles_df = prepare_articles(articles_df)
articles_df.head(3)

article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,prod_name_length,article_description
str,i64,str,i64,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,str,i64,str,i64,str,i64,str,u32,str
"""108775015""",108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",9,"""Strap top - Vest top in Garmen…"
"""108775044""",108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",10,"""White""",3,"""Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",9,"""Strap top - Vest top in Garmen…"
"""108775051""",108775,"""Strap top (1)""",253,"""Vest top""","""Garment Upper body""",1010017,"""Stripe""",11,"""Off White""",1,"""Dusty Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",13,"""Strap top (1) - Vest top in Ga…"


In [8]:
# End the timer
end_time = time.time()

# Calculate and print the execution time
execution_time = end_time - start_time
print(f"⌛️ Execution time: {execution_time:.2f} seconds")

⌛️ Execution time: 0.38 seconds


In [9]:
print(articles_df['article_description'][0])

Strap top - Vest top in Garment Upper body
Appearance: Solid
Color: Dark Black (Black)
Category: Ladieswear - Womens Everyday Basics - Jersey Basic
Details: Jersey top with narrow shoulder straps.


## <span style="color:#ff5f27">🧬 Embeddings Creation</span>

In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

2024-10-15 09:03:33,315 INFO: Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-10-15 09:03:33,588 INFO: Use pytorch device: cuda


In [11]:
articles_df = generate_embeddings_for_dataframe(articles_df, 'article_description', model, device)

Generating embeddings:   0%|          | 0/105542 [00:00<?, ?it/s]

In [12]:
articles_df[['article_description', 'embeddings']].head(3)

article_description,embeddings
str,list[f64]
"""Strap top - Vest top in Garmen…","[-0.026782, 0.082344, … 0.022782]"
"""Strap top - Vest top in Garmen…","[-0.010396, 0.089874, … 0.022564]"
"""Strap top (1) - Vest top in Ga…","[-0.032753, 0.091124, … 0.022804]"


## <span style="color:#ff5f27">🔗 Image Links</span>

In [13]:
articles_df = articles_df.with_columns(
    image_url=pl.col("article_id").map_elements(get_image_url)
)
articles_df['image_url'][0]

'https://repo.hops.works/dev/jdowling/h-and-m/images/010/0108775015.jpg'

---
## <span style="color:#ff5f27">🗄️ Read Customers Data</span>

In [14]:
# Start the timer
start_time = time.time()


# Load customers data
customers_df = pl.read_csv('https://repo.hops.works/dev/jdowling/h-and-m/customers.csv')
print(customers_df.shape)
customers_df.head(3)

(1371980, 7)


customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
str,f64,f64,str,str,i64,str
"""00000dbacae5abe5e23885899a1fa4…",,,"""ACTIVE""","""NONE""",49,"""52043ee2162cf5aa7ee79974281641…"
"""0000423b00ade91418cceaf3b26c6a…",,,"""ACTIVE""","""NONE""",25,"""2973abc54daa8a5f8ccfe9362140c6…"
"""000058a12d5b43e67d225668fa1f8d…",,,"""ACTIVE""","""NONE""",24,"""64f17e6a330a85798e4998f62d0930…"


In [15]:
# End the timer
end_time = time.time()

# Calculate and print the execution time
execution_time = end_time - start_time
print(f"⌛️ Execution time: {execution_time:.2f} seconds")

⌛️ Execution time: 10.62 seconds


In [16]:
customers_df.null_count()

customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
u32,u32,u32,u32,u32,u32,u32
0,895050,907576,6062,16009,15861,0


## <span style="color:#ff5f27">👨🏻‍🏭 Customers Feature Engineering</span>


In [17]:
# Start the timer
start_time = time.time()


customers_df = prepare_customers(customers_df)
customers_df.head(3)

customer_id,club_member_status,age,postal_code,age_group
str,str,f64,str,str
"""00000dbacae5abe5e23885899a1fa4…","""ACTIVE""",49.0,"""52043ee2162cf5aa7ee79974281641…","""46-55"""
"""0000423b00ade91418cceaf3b26c6a…","""ACTIVE""",25.0,"""2973abc54daa8a5f8ccfe9362140c6…","""19-25"""
"""000058a12d5b43e67d225668fa1f8d…","""ACTIVE""",24.0,"""64f17e6a330a85798e4998f62d0930…","""19-25"""


In [18]:
# End the timer
end_time = time.time()

# Calculate and print the execution time
execution_time = end_time - start_time
print(f"⌛️ Execution time: {execution_time:.2f} seconds")

⌛️ Execution time: 0.26 seconds


---
## <span style="color:#ff5f27">🗄️ Read Transactions Data</span>

In [19]:
# Start the timer
start_time = time.time()


trans_df = pl.read_csv('https://repo.hops.works/dev/jdowling/h-and-m/transactions_train.csv')[:1_000_000]
print(trans_df.shape)
trans_df.head(3)

(31788324, 5)


t_dat,customer_id,article_id,price,sales_channel_id
str,str,i64,f64,i64
"""2018-09-20""","""000058a12d5b43e67d225668fa1f8d…",663713001,0.050831,2
"""2018-09-20""","""000058a12d5b43e67d225668fa1f8d…",541518023,0.030492,2
"""2018-09-20""","""00007d2de826758b65a93dd24ce629…",505221004,0.015237,2


In [20]:
# End the timer
end_time = time.time()

# Calculate and print the execution time
execution_time = end_time - start_time
print(f"⌛️ Execution time: {execution_time:.2f} seconds")

⌛️ Execution time: 216.66 seconds


---
## Click Data

Features:

- t_dat

- customer_id

- article_id

- interaction_score (0/1)

- prev_article_id

---

## <span style="color:#ff5f27">👨🏻‍🏭 Transactions Feature Engineering</span>

The time of the year a purchase was made should be a strong predictor, as seasonality plays a big factor in fashion purchases. Here, you will use the month of the purchase as a feature. Since this is a cyclical feature (January is as close to December as it is to February), you'll map each month to the unit circle using sine and cosine.

In [21]:
# Start the timer
start_time = time.time()


trans_df = prepare_transactions(trans_df)
trans_df.head(3)

t_dat,customer_id,article_id,price,sales_channel_id,year,month,day,day_of_week,month_sin,month_cos
i64,str,str,f64,i64,i32,i8,i8,i8,f64,f64
1537401600000,"""000058a12d5b43e67d225668fa1f8d…","""663713001""",0.050831,2,2018,9,20,4,-1.0,-1.837e-16
1537401600000,"""000058a12d5b43e67d225668fa1f8d…","""541518023""",0.030492,2,2018,9,20,4,-1.0,-1.837e-16
1537401600000,"""00007d2de826758b65a93dd24ce629…","""505221004""",0.015237,2,2018,9,20,4,-1.0,-1.837e-16


In [22]:
# End the timer
end_time = time.time()

# Calculate and print the execution time
execution_time = end_time - start_time
print(f"⌛️ Execution time: {execution_time:.2f} seconds")

⌛️ Execution time: 5.32 seconds


In [23]:
print(f"⛳️ There are {trans_df.height:,} transactions in total.")

⛳️ There are 31,788,324 transactions in total.


In [24]:
#import random

#N_USERS = 25_000

# Consider only customers with age defined.
customers_df = customers_df.drop_nulls(subset=["age"])

# # Set a seed for reproducibility
# random.seed(27)

# # Sample N_USERS from the DataFrame
# customer_subset_df = customers_df.sample(n=N_USERS)

print(f"⛳️ There are {trans_df.height:,} transactions in total.")

⛳️ There are 31,788,324 transactions in total.


In [25]:
# trans_df = trans_df.join(customer_subset_df.select("customer_id"), on="customer_id")
trans_df = trans_df.join(customers_df.select("customer_id"), on="customer_id")

print(f"⛳️ Subset has {len(trans_df):,} transactions in total.")

⛳️ Subset has 31,648,066 transactions in total.


---
## <span style="color:#ff5f27">⚙️ To Pandas </span>


In [26]:
# Start the timer
customers_to_pandas_start_time = time.time()


customers_df = customers_df.to_pandas() # arrow = True
customers_df.head(3)


# End the timer
customers_to_pandas_end_time = time.time()

# Calculate and print the execution time
customers_to_pandas_execution_time = customers_to_pandas_end_time - customers_to_pandas_start_time
print(f"⛳️ Execution time: {customers_to_pandas_execution_time:.2f} seconds")

⛳️ Execution time: 0.88 seconds


In [27]:
# Start the timer
articles_to_pandas_start_time = time.time()


articles_df = articles_df.to_pandas()
articles_df.head(3)


# End the timer
articles_to_pandas_end_time = time.time()

# Calculate and print the execution time
articles_to_pandas_execution_time = articles_to_pandas_end_time - articles_to_pandas_start_time
print(f"⛳️ Execution time: {articles_to_pandas_execution_time:.2f} seconds")

⛳️ Execution time: 0.19 seconds


In [28]:
# Start the timer
trans_to_pandas_start_time = time.time()


trans_df = trans_df.to_pandas()
trans_df.head(3)


# End the timer
trans_to_pandas_end_time = time.time()

# Calculate and print the execution time
trans_to_pandas_execution_time = trans_to_pandas_end_time - trans_to_pandas_start_time
print(f"⛳️ Execution time: {trans_to_pandas_execution_time:.2f} seconds")

⛳️ Execution time: 6.79 seconds


---

## <span style="color:#ff5f27">🪄 Feature Group Creation </span>

A [feature group](https://docs.hopsworks.ai/feature-store-api/latest/generated/feature_group/) can be seen as a collection of conceptually related features.

Before you can create a feature group you need to connect to your feature store.

To create a feature group you need to give it a name and specify a primary key. It is also good to provide a description of the contents of the feature group.

In [29]:
customers_fg = fs.get_or_create_feature_group(
    name="customers",
    description="Customers data including age and postal code",
    version=1,
    primary_key=["customer_id"],
    online_enabled=True,
)

Here you have also set `online_enabled=True`, which enables low latency access to the data. A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).

At this point, you have only specified some metadata for the feature group. It does not store any data or even have a schema defined for the data. To make the feature group persistent you populate it with its associated data using the `insert` method.

In [30]:
customers_fg.insert(customers_df)
print('✅ Done!')

Feature Group created successfully, explore it at 
https://snurran.hops.works/p/17527/fs/17475/fg/20548


Uploading Dataframe: 0.00% |          | Rows 0/1356119 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: customers_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://snurran.hops.works/p/17527/jobs/named/customers_1_offline_fg_materialization/executions
✅ Done!


In [31]:
feature_descriptions = [
    {"name": "customer_id", "description": "Unique identifier for each customer."},
    {"name": "club_member_status", "description": "Membership status of the customer in the club."},
    {"name": "age", "description": "Age of the customer."},
    {"name": "postal_code", "description": "Postal code associated with the customer's address."},
    {"name": "age_group", "description": "Categorized age group of the customer."},
]

for desc in feature_descriptions: 
    customers_fg.update_feature_description(desc["name"], desc["description"])

Let's do the same thing for the rest of the data frames.

In [32]:
from hsfs.feature import Feature

features = [
    Feature(name='article_id', type='string', description="Identifier for the article."),
    Feature(name='product_code', type='bigint', description="Code associated with the product."),
    Feature(name='prod_name', type='string', description="Name of the product."),
    Feature(name='product_type_no', type='bigint', description="Number associated with the product type."),
    Feature(name='product_type_name', type='string', description="Name of the product type."),
    Feature(name='product_group_name', type='string', description="Name of the product group."),
    Feature(name='graphical_appearance_no', type='bigint', description="Number associated with graphical appearance."),
    Feature(name='graphical_appearance_name', type='string', description="Name of the graphical appearance."),
    Feature(name='colour_group_code', type='bigint', description="Code associated with the colour group."),
    Feature(name='colour_group_name', type='string', description="Name of the colour group."),
    Feature(name='perceived_colour_value_id', type='bigint', description="ID associated with perceived colour value."),
    Feature(name='perceived_colour_value_name', type='string', description="Name of the perceived colour value."),
    Feature(name='perceived_colour_master_id', type='bigint', description="ID associated with perceived colour master."),
    Feature(name='perceived_colour_master_name', type='string', description="Name of the perceived colour master."),
    Feature(name='department_no', type='bigint', description="Number associated with the department."),
    Feature(name='department_name', type='string', description="Name of the department."),
    Feature(name='index_code', type='string', description="Code associated with the index."),
    Feature(name='index_name', type='string', description="Name of the index."),
    Feature(name='index_group_no', type='bigint', description="Number associated with the index group."),
    Feature(name='index_group_name', type='string', description="Name of the index group."),
    Feature(name='section_no', type='bigint', description="Number associated with the section."),
    Feature(name='section_name', type='string', description="Name of the section."),
    Feature(name='garment_group_no', type='bigint', description="Number associated with the garment group."),
    Feature(name='garment_group_name', type='string', description="Name of the garment group."),
    Feature(name='prod_name_length', type='bigint', description="Length of the product name."),
    Feature(name='article_description', type='string', online_type="VARCHAR(5800)", description="Description of the article."),
    Feature(name='embeddings', type='array<double>', description="Vector embeddings of the article description."),
    Feature(name='image_url', type='string', description="URL of the product image."),
]

In [33]:
from hsfs import embedding

# Create the Embedding Index
emb = embedding.EmbeddingIndex()

emb.add_embedding(
    "embeddings", 
    model.get_sentence_embedding_dimension(),
)

In [34]:
articles_fg = fs.get_or_create_feature_group(
    name="articles",
    version=2,
    description="Fashion items data including type of item, visual description and category",
    primary_key=["article_id"],
    online_enabled=True,
    features=features,
    embedding_index=emb,
)
articles_fg.insert(articles_df)
print('✅ Done!')

Feature Group created successfully, explore it at 
https://snurran.hops.works/p/17527/fs/17475/fg/20549


Uploading Dataframe: 0.00% |          | Rows 0/105542 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: articles_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://snurran.hops.works/p/17527/jobs/named/articles_2_offline_fg_materialization/executions
✅ Done!


In [35]:
trans_fg = fs.get_or_create_feature_group(
    name="transactions",
    version=1,
    description="Transactions data including customer, item, price, sales channel and transaction date",
    primary_key=["customer_id", "article_id"],
    online_enabled=True,
    event_time="t_dat",
)
trans_fg.insert(
    trans_df,
    write_options={"wait_for_job": True},
)
print('✅ Done!')

Feature Group created successfully, explore it at 
https://snurran.hops.works/p/17527/fs/17475/fg/20550


Uploading Dataframe: 0.00% |          | Rows 0/31648066 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: transactions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://snurran.hops.works/p/17527/jobs/named/transactions_1_offline_fg_materialization/executions
✅ Done!


In [36]:
feature_descriptions = [
    {"name": "t_dat", "description": "Timestamp of the data record."},
    {"name": "customer_id", "description": "Unique identifier for each customer."},
    {"name": "article_id", "description": "Identifier for the purchased article."},
    {"name": "price", "description": "Price of the purchased article."},
    {"name": "sales_channel_id", "description": "Identifier for the sales channel."},
    {"name": "year", "description": "Year of the transaction."},
    {"name": "month", "description": "Month of the transaction."},
    {"name": "day", "description": "Day of the transaction."},
    {"name": "day_of_week", "description": "Day of the week of the transaction."},
    {"name": "month_sin", "description": "Sine of the month used for seasonal patterns."},
    {"name": "month_cos", "description": "Cosine of the month used for seasonal patterns."},
]

for desc in feature_descriptions: 
    trans_fg.update_feature_description(desc["name"], desc["description"])

## <span style="color:#ff5f27">📊 Ranking Dataset </span>


In [4]:
customers_fg = fs.get_or_create_feature_group(
    name="customers",
    version=1,
)
    
articles_fg = fs.get_or_create_feature_group(
    name="articles",
    version=2,
)
    
trans_fg = fs.get_or_create_feature_group(
    name="transactions",
    version=1,
)

In [5]:
# Start the timer
start_time = time.time()

In [6]:
ranking_df = compute_ranking_dataset(
    trans_fg,
    articles_fg,
    customers_fg,
)
ranking_df.head(3)

Finished: Reading data from Hopsworks, using ArrowFlight (458.53s) 
Finished: Reading data from Hopsworks, using ArrowFlight (33.70s) 


Unnamed: 0,customer_id,age,month_sin,month_cos,article_id,label,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name
0,d32f22fb2dd1db40ee1bd9e1c214417be617244e8d169e...,32.0,-0.5,0.866025,663383001,1,Jumpsuit/Playsuit,Garment Full body,Solid,Off White,Dusty Light,White,Newborn,Baby Sizes 50-98,Baby/Children,Baby Essentials & Complements,Jersey Fancy
1,c98f7ee86869cf599961bbed74c84e2fcb8b4e648d4928...,29.0,-0.866025,0.5,663383001,1,Jumpsuit/Playsuit,Garment Full body,Solid,Off White,Dusty Light,White,Newborn,Baby Sizes 50-98,Baby/Children,Baby Essentials & Complements,Jersey Fancy
2,6d42330bc2e6e98d762e86fe1d6a06a5f3390885ec2bc3...,20.0,-0.866025,0.5,663383001,1,Jumpsuit/Playsuit,Garment Full body,Solid,Off White,Dusty Light,White,Newborn,Baby Sizes 50-98,Baby/Children,Baby Essentials & Complements,Jersey Fancy


In [7]:
# End the timer
end_time = time.time()

# Calculate and print the execution time
execution_time = end_time - start_time
print(f"⌛️ Execution time: {execution_time:.2f} seconds")

⌛️ Execution time: 527.46 seconds


In [8]:
ranking_df.label.value_counts()

label
0    45118340
1     4511834
Name: count, dtype: int64

In [9]:
rank_fg = fs.get_or_create_feature_group(
    name="ranking",
    version=1,
    description="Derived feature group for ranking",
    primary_key=["customer_id", "article_id"], 
    parents=[articles_fg, customers_fg, trans_fg],
)
rank_fg.insert(ranking_df)
print('✅ Done!')

Feature Group created successfully, explore it at 
https://snurran.hops.works/p/17527/fs/17475/fg/20555


Uploading Dataframe: 0.00% |          | Rows 0/49630174 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: ranking_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://snurran.hops.works/p/17527/jobs/named/ranking_1_offline_fg_materialization/executions
✅ Done!


In [10]:
feature_descriptions = [
    {"name": "customer_id", "description": "Unique identifier for each customer."},
    {"name": "article_id", "description": "Identifier for the purchased article."},
    {"name": "age", "description": "Age of the customer."},
    {"name": "month_sin", "description": "Sine of the month used for seasonal patterns."},
    {"name": "month_cos", "description": "Cosine of the month used for seasonal patterns."},
    {"name": "product_type_name", "description": "Name of the product type."},
    {"name": "product_group_name", "description": "Name of the product group."},
    {"name": "graphical_appearance_name", "description": "Name of the graphical appearance."},
    {"name": "colour_group_name", "description": "Name of the colour group."},
    {"name": "perceived_colour_value_name", "description": "Name of the perceived colour value."},
    {"name": "perceived_colour_master_name", "description": "Name of the perceived colour master."},
    {"name": "department_name", "description": "Name of the department."},
    {"name": "index_name", "description": "Name of the index."},
    {"name": "index_group_name", "description": "Name of the index group."},
    {"name": "section_name", "description": "Name of the section."},
    {"name": "garment_group_name", "description": "Name of the garment group."},
    {"name": "label", "description": "Label indicating whether the article was purchased (1) or not (0)."},
]

for desc in feature_descriptions: 
    rank_fg.update_feature_description(desc["name"], desc["description"])

You should now be able to inspect the feature groups in the Hopsworks UI.

In [11]:
# End the timer
notebook_end_time = time.time()

# Calculate and print the execution time
notebook_execution_time = notebook_end_time - notebook_start_time
print(f"⌛️ Notebook Execution time: {notebook_execution_time:.2f} seconds")

⌛️ Notebook Execution time: 2154.67 seconds


---
## <span style="color:#ff5f27">⏩️ Next Steps </span>
In the next notebook you'll train a retrieval model.