# Kaggle Competition Example

In [1]:
import os
from typing import Optional
from pathlib import Path
import logging
from datetime import datetime, timedelta

import snowflake.snowpark.functions as F
from snowflake.snowpark import Window

from snowflake_feature_store.connection import get_connection
from snowflake_feature_store.manager import FeatureStoreManager
from snowflake_feature_store.config import (
    FeatureViewConfig, FeatureConfig, RefreshConfig, 
    FeatureValidationConfig
)
from snowflake_feature_store.transforms import (
    Transform, TransformConfig, moving_agg, 
    fill_na, date_diff, CustomTransform
)
from snowflake_feature_store.examples import (
    get_example_data, create_feature_configs
)
from snowflake_feature_store.logging import logger

In [2]:

# Open the Snowflake connection used by every later cell, pointed at the
# DATASCIENCE.FEATURE_STORE schema.
# NOTE(review): create_objects=True presumably creates missing objects — confirm in get_connection.
connection_settings = {
    "database": "DATASCIENCE",
    "schema": "FEATURE_STORE",
    "create_objects": True,
}
conn = get_connection(**connection_settings)

2025-03-03 17:22:26,166 - snowflake_feature_store - INFO - No active session found, creating new connection from environment
2025-03-03 17:22:26,878 - snowflake_feature_store - INFO - Initialized connection to "DATASCIENCE"."FEATURE_STORE"
2025-03-03 17:22:27,778 - snowflake_feature_store - INFO - Using role: "ACCOUNTADMIN", warehouse: "CONTAINER_DEMO_WH", database: DATASCIENCE, schema: FEATURE_STORE


In [3]:

# Build the manager that registers entities and feature views on `conn`.
# NOTE(review): overwrite=True presumably replaces existing definitions — confirm before
# running against a shared schema.
manager = FeatureStoreManager(overwrite=True, connection=conn)


2025-03-03 17:22:30,633 - snowflake_feature_store - INFO - FeatureStoreManager initialized



# 1. Define Entities


In [4]:
# (entity name, join keys, description) for each entity the feature views attach to.
entity_definitions = [
    ("USER", ["USER_ID"], "Instacart users who place orders"),
    ("PRODUCT", ["PRODUCT_ID"], "Products available in Instacart"),
    ("USER_PRODUCT", ["USER_ID", "PRODUCT_ID"], "Interactions between users and products"),
]

# Register all but the last entity inside the loop.
for entity_name, entity_keys, entity_description in entity_definitions[:-1]:
    manager.add_entity(
        name=entity_name,
        join_keys=entity_keys,
        description=entity_description,
    )

# The final registration stays a bare expression so the cell still displays
# the value returned by add_entity.
entity_name, entity_keys, entity_description = entity_definitions[-1]
manager.add_entity(
    name=entity_name,
    join_keys=entity_keys,
    description=entity_description,
)


  return f(self, *args, **kargs)
  return f(self, *args, **kargs)
  return f(self, *args, **kargs)


<snowflake_feature_store.manager.FeatureStoreManager at 0x143615090>


# 2. Create base DataFrames for features


### User features


In [45]:
# Per-user behaviour features computed from 'prior' orders only.
#
# The inner query yields one row per (order, product) pair; the outer query
# aggregates those rows per user.
#
# Fix: the previous version computed BASKET_SIZE as COUNT(op.PRODUCT_ID) while
# also grouping by op.PRODUCT_ID, so every group held exactly one product and
# AVG(BASKET_SIZE) was always 1. AVG_BASKET_SIZE is now the number of
# order-product rows divided by the number of distinct orders.
#
# NOTE(review): AVG_DAYS_BETWEEN_ORDERS, TYPICAL_ORDER_HOUR and
# PREFERRED_ORDER_DAY are still aggregated over order-product rows, so larger
# baskets weigh more — confirm whether per-order weighting is intended.
user_features_df = conn.session.sql("""
SELECT
    USER_ID,
    COUNT(DISTINCT ORDER_ID) AS USER_TOTAL_ORDERS,
    AVG(DAYS_SINCE_PRIOR_ORDER) AS AVG_DAYS_BETWEEN_ORDERS,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY ORDER_HOUR_OF_DAY) AS TYPICAL_ORDER_HOUR,
    MODE(ORDER_DOW) AS PREFERRED_ORDER_DAY,
    COUNT(PRODUCT_ID) / NULLIF(COUNT(DISTINCT ORDER_ID), 0) AS AVG_BASKET_SIZE,
    COUNT(DISTINCT PRODUCT_ID) AS DISTINCT_PRODUCTS_COUNT,
    SUM(REORDERED) / NULLIF(COUNT(REORDERED), 0) AS USER_REORDER_RATE
FROM (
    SELECT
        o.USER_ID,
        o.ORDER_ID,
        o.ORDER_DOW,
        o.ORDER_HOUR_OF_DAY,
        o.DAYS_SINCE_PRIOR_ORDER,
        op.PRODUCT_ID,
        op.REORDERED
    FROM INSTACART_RAW.ORDERS o
    JOIN INSTACART_RAW.ORDER_PRODUCTS op ON o.ORDER_ID = op.ORDER_ID
    WHERE o.EVAL_SET = 'prior'
    GROUP BY 1, 2, 3, 4, 5, 6, 7
) user_orders
GROUP BY USER_ID
""")



### Product features


In [46]:
# Per-product features computed from 'prior' orders only (the ORDERS join
# exists solely to apply the EVAL_SET filter).
#
# Fix: PRODUCT_REORDER_RATE previously divided SUM(REORDERED) by the count of
# rows where REORDERED = 1, which is the same number, so the rate was always 1
# (or NULL for products never reordered). It now divides by the count of all
# order-product rows, matching how USER_REORDER_RATE is computed.
product_features_df = conn.session.sql("""
SELECT
    p.PRODUCT_ID,
    p.AISLE_ID,
    p.DEPARTMENT_ID,
    COUNT(DISTINCT op.ORDER_ID) AS PRODUCT_ORDERS,
    SUM(op.REORDERED) AS PRODUCT_REORDERS,
    SUM(op.REORDERED) / NULLIF(COUNT(op.REORDERED), 0) AS PRODUCT_REORDER_RATE,
    AVG(op.ADD_TO_CART_ORDER) AS PRODUCT_AVG_CART_POSITION
FROM INSTACART_RAW.PRODUCTS p
JOIN INSTACART_RAW.ORDER_PRODUCTS op ON p.PRODUCT_ID = op.PRODUCT_ID
JOIN INSTACART_RAW.ORDERS o ON op.ORDER_ID = o.ORDER_ID
WHERE o.EVAL_SET = 'prior'
GROUP BY p.PRODUCT_ID, p.AISLE_ID, p.DEPARTMENT_ID
""")



### User-Product features


In [47]:
# One row per (user, product): the most recent 'prior' order containing the
# product (rn = 1), plus how often and where in the cart the user bought it,
# and how many orders have passed since that purchase.
#
# NOTE(review): the result also carries ORDER_ID, ORDER_NUMBER, ORDER_DOW,
# ORDER_HOUR_OF_DAY and REORDERED, which share names with the training spine's
# columns — presumably the cause of the "ambiguous column name 'ORDER_ID'"
# error recorded later in this notebook; confirm.
user_product_features_query = """
WITH user_product_history AS (
    SELECT 
        o.USER_ID,
        op.PRODUCT_ID,
        o.ORDER_ID,
        o.ORDER_NUMBER,
        o.ORDER_DOW,
        o.ORDER_HOUR_OF_DAY,
        op.REORDERED,
        op.ADD_TO_CART_ORDER,
        ROW_NUMBER() OVER (PARTITION BY o.USER_ID, op.PRODUCT_ID ORDER BY o.ORDER_NUMBER DESC) AS rn,
        COUNT(*) OVER (PARTITION BY o.USER_ID, op.PRODUCT_ID) AS UP_ORDERS,
        AVG(op.ADD_TO_CART_ORDER) OVER (PARTITION BY o.USER_ID, op.PRODUCT_ID) AS UP_AVG_CART_POSITION
    FROM INSTACART_RAW.ORDERS o
    JOIN INSTACART_RAW.ORDER_PRODUCTS op ON o.ORDER_ID = op.ORDER_ID
    WHERE o.EVAL_SET = 'prior'
),
user_last_order AS (
    SELECT 
        USER_ID, 
        MAX(ORDER_NUMBER) AS LAST_ORDER_NUMBER
    FROM INSTACART_RAW.ORDERS
    WHERE EVAL_SET = 'prior'
    GROUP BY USER_ID
)
SELECT 
    h.USER_ID,
    h.PRODUCT_ID,
    h.ORDER_ID,
    h.ORDER_NUMBER,
    DATEADD('day', h.order_dow, TO_DATE('2023-01-01')) AS synthetic_date,
    h.ORDER_DOW,
    h.ORDER_HOUR_OF_DAY,
    h.REORDERED,
    h.UP_ORDERS,
    h.UP_AVG_CART_POSITION,
    l.LAST_ORDER_NUMBER - h.ORDER_NUMBER AS ORDERS_SINCE_LAST_PURCHASE
FROM user_product_history h
JOIN user_last_order l ON h.USER_ID = l.USER_ID
WHERE h.rn = 1
"""
user_product_features_df = conn.session.sql(user_product_features_query)



# 3. Define Feature Configurations


In [48]:
# User feature configs, declared as (name, description, validation rules) and
# expanded into FeatureConfig objects keyed by feature name.
_user_feature_specs = [
    ("USER_TOTAL_ORDERS", "Total number of orders placed by user",
     dict(null_threshold=0.0, range_check=True, min_value=1)),
    ("AVG_DAYS_BETWEEN_ORDERS", "Average days between orders",
     dict(null_threshold=0.1, range_check=True, min_value=0)),
    ("TYPICAL_ORDER_HOUR", "Median hour of day when user places orders",
     dict(null_threshold=0.0, range_check=True, min_value=0, max_value=23)),
    ("PREFERRED_ORDER_DAY", "Most common day of week for orders (0=Sunday)",
     dict(null_threshold=0.0, range_check=True, min_value=0, max_value=6)),
    ("AVG_BASKET_SIZE", "Average number of products per order",
     dict(null_threshold=0.0, range_check=True, min_value=1)),
    ("DISTINCT_PRODUCTS_COUNT", "Number of unique products ordered",
     dict(null_threshold=0.0, range_check=True, min_value=1)),
    ("USER_REORDER_RATE", "Proportion of products that are reordered",
     dict(null_threshold=0.0, range_check=True, min_value=0, max_value=1)),
    # Added by the user transforms rather than by the base query.
    ("DOMINANT_DAY_PART", "Most common time of day for orders",
     dict(null_threshold=0.1)),
]

user_feature_configs = {
    feature_name: FeatureConfig(
        name=feature_name,
        description=feature_description,
        validation=FeatureValidationConfig(**validation_rules),
    )
    for feature_name, feature_description, validation_rules in _user_feature_specs
}


In [49]:

# Product feature configs, declared as (name, description, validation rules)
# and expanded into FeatureConfig objects keyed by feature name.
_product_feature_specs = [
    ("AISLE_ID", "Aisle ID for the product",
     dict(null_threshold=0.0)),
    ("DEPARTMENT_ID", "Department ID for the product",
     dict(null_threshold=0.0)),
    ("PRODUCT_ORDERS", "Number of orders containing this product",
     dict(null_threshold=0.0, range_check=True, min_value=1)),
    ("PRODUCT_REORDERS", "Number of times this product was reordered",
     dict(null_threshold=0.0, range_check=True, min_value=0)),
    ("PRODUCT_REORDER_RATE", "Proportion of orders that are reorders",
     dict(null_threshold=0.1, range_check=True, min_value=0, max_value=1)),
    ("PRODUCT_AVG_CART_POSITION", "Average position in cart",
     dict(null_threshold=0.0, range_check=True, min_value=1)),
    # The two rank features are added by the product transforms.
    ("DEPARTMENT_POPULARITY_RANK", "Popularity rank within department",
     dict(null_threshold=0.1, range_check=True, min_value=1)),
    ("AISLE_POPULARITY_RANK", "Popularity rank within aisle",
     dict(null_threshold=0.1, range_check=True, min_value=1)),
]

product_feature_configs = {
    feature_name: FeatureConfig(
        name=feature_name,
        description=feature_description,
        validation=FeatureValidationConfig(**validation_rules),
    )
    for feature_name, feature_description, validation_rules in _product_feature_specs
}


In [50]:
# User-Product feature configs, declared as (name, description, validation
# rules). A rules entry of None means the feature is declared without any
# validation argument at all.
_user_product_feature_specs = [
    ("ORDER_DOW", "Day of week for the order", None),
    ("SYNTHETIC_DATE", "Created Date For Feature Store", None),
    ("ORDER_HOUR_OF_DAY", "Hour of day for the order", None),
    ("REORDERED", "Whether the product was reordered", None),
    ("UP_ORDERS", "Number of times user ordered this product",
     dict(null_threshold=0.0, range_check=True, min_value=1)),
    ("UP_AVG_CART_POSITION", "Average cart position for this user-product",
     dict(null_threshold=0.0, range_check=True, min_value=1)),
    ("ORDERS_SINCE_LAST_PURCHASE", "Number of orders since user last purchased this product",
     dict(null_threshold=0.1, range_check=True, min_value=0)),
    # The remaining two features are added by the user-product transforms.
    ("UP_ORDERS_RATIO", "Ratio of orders containing this product to total user orders",
     dict(null_threshold=0.1, range_check=True, min_value=0, max_value=1)),
    ("PURCHASE_RECENCY_BUCKET", "Recency category of last purchase (recent, medium, old)",
     dict(null_threshold=0.1)),
]

user_product_feature_configs = {
    feature_name: (
        FeatureConfig(name=feature_name, description=feature_description)
        if validation_rules is None
        else FeatureConfig(
            name=feature_name,
            description=feature_description,
            validation=FeatureValidationConfig(**validation_rules),
        )
    )
    for feature_name, feature_description, validation_rules in _user_product_feature_specs
}



# 4. Define Feature View Configs


In [51]:
# Feature view definition for the USER entity, refreshed daily in INCREMENTAL mode.
user_refresh = RefreshConfig(mode="INCREMENTAL", frequency="1 day")
user_config = FeatureViewConfig(
    description="User behavior features for Instacart market basket prediction",
    name="user_features",
    domain="INSTACART",
    entity="USER",
    feature_type="BEHAVIOR",
    features=user_feature_configs,
    refresh=user_refresh,
)


In [52]:

# Feature view definition for the PRODUCT entity, refreshed daily in INCREMENTAL mode.
product_refresh = RefreshConfig(mode="INCREMENTAL", frequency="1 day")
product_config = FeatureViewConfig(
    description="Product features for Instacart market basket prediction",
    name="product_features",
    domain="INSTACART",
    entity="PRODUCT",
    feature_type="ATTRIBUTE",
    features=product_feature_configs,
    refresh=product_refresh,
)


In [53]:
# Feature view definition for the USER_PRODUCT entity, refreshed daily in INCREMENTAL mode.
user_product_refresh = RefreshConfig(mode="INCREMENTAL", frequency="1 day")
user_product_config = FeatureViewConfig(
    description="User-product interaction features for Instacart market basket prediction",
    name="user_product_features",
    domain="INSTACART",
    entity="USER_PRODUCT",
    feature_type="INTERACTION",
    features=user_product_feature_configs,
    refresh=user_product_refresh,
)



# 5. Define Transformations


### User transformations


In [54]:
def _day_part_from_typical_hour():
    """Build the column expression bucketing TYPICAL_ORDER_HOUR into a day part.

    Hours below 6 map to "night", below 12 to "morning", below 18 to "midday",
    and everything else to "evening".
    """
    typical_hour = F.col("TYPICAL_ORDER_HOUR")
    return (
        F.when(typical_hour < 6, F.lit("night"))
         .when(typical_hour < 12, F.lit("morning"))
         .when(typical_hour < 18, F.lit("midday"))
         .otherwise(F.lit("evening"))
    )


# Settings shared by every user-level transform.
user_transform_config = TransformConfig(
    expected_types=['DECIMAL', 'DOUBLE', 'NUMBER', 'INT', 'STRING'],
    null_threshold=0.1,
    name="user_transforms",
)

# Adds DOMINANT_DAY_PART from the median order hour.
dominant_day_part_transform = CustomTransform(
    transform_func=lambda df: df.with_column('DOMINANT_DAY_PART', _day_part_from_typical_hour()),
    config=user_transform_config,
)

# Adds DOMINANT_DOW as a copy of PREFERRED_ORDER_DAY.
dominant_dow_transform = CustomTransform(
    transform_func=lambda df: df.with_column('DOMINANT_DOW', F.col("PREFERRED_ORDER_DAY")),
    config=user_transform_config,
)

# Requests 0.0 as the fill value for nulls in the two averaged numeric columns.
numeric_null_fill = fill_na(
    cols=["AVG_DAYS_BETWEEN_ORDERS", "AVG_BASKET_SIZE"],
    fill_value=0.0,
    config=user_transform_config,
)

user_transforms = [dominant_day_part_transform, dominant_dow_transform, numeric_null_fill]


### Product transformations


In [55]:
def _popularity_rank_within(partition_column):
    """Build a dense rank of PRODUCT_ORDERS (descending) within `partition_column`."""
    by_orders_desc = Window.partition_by(partition_column).order_by(
        F.col("PRODUCT_ORDERS").desc()
    )
    return F.dense_rank().over(by_orders_desc)


# Settings shared by every product-level transform.
product_transform_config = TransformConfig(
    expected_types=['DECIMAL', 'DOUBLE', 'NUMBER', 'INT'],
    null_threshold=0.1,
    name="product_transforms",
)

# Rank of each product among the products of its department.
department_rank_transform = CustomTransform(
    transform_func=lambda df: df.with_column(
        'DEPARTMENT_POPULARITY_RANK', _popularity_rank_within("DEPARTMENT_ID")
    ),
    config=product_transform_config,
)

# Rank of each product among the products of its aisle.
aisle_rank_transform = CustomTransform(
    transform_func=lambda df: df.with_column(
        'AISLE_POPULARITY_RANK', _popularity_rank_within("AISLE_ID")
    ),
    config=product_transform_config,
)

product_transforms = [department_rank_transform, aisle_rank_transform]



### User-Product transformations


In [56]:
print("Available columns in user_product_features_df:")
print(user_product_features_df.columns)

# Settings shared by every user-product transform.
user_product_transform_config = TransformConfig(
    name="user_product_transforms",
    null_threshold=0.1,
    expected_types=['DECIMAL', 'DOUBLE', 'NUMBER', 'INT', 'STRING']
)

user_product_transforms = [
    # UP_ORDERS_RATIO: share of the user's prior orders that contain the product.
    #
    # Fix: this was UP_ORDERS / 1.0, i.e. the raw count, which is not a ratio
    # and cannot satisfy the feature's declared max_value of 1.
    #
    # The base query defines
    #   ORDERS_SINCE_LAST_PURCHASE = LAST_ORDER_NUMBER - ORDER_NUMBER,
    # so ORDER_NUMBER + ORDERS_SINCE_LAST_PURCHASE recovers the user's highest
    # prior ORDER_NUMBER.
    # NOTE(review): assumes ORDER_NUMBER starts at 1 with no gaps, so that the
    # highest prior order number equals the user's prior order count — confirm.
    CustomTransform(
        transform_func=lambda df: df.with_column(
            "UP_ORDERS_RATIO",
            F.col("UP_ORDERS")
            / (F.col("ORDER_NUMBER") + F.col("ORDERS_SINCE_LAST_PURCHASE")),
        ),
        config=user_product_transform_config
    ),

    # PURCHASE_RECENCY_BUCKET: "recent" when the product was in the user's
    # latest prior order, "medium" when within the last three, otherwise "old".
    CustomTransform(
        transform_func=lambda df: df.with_column(
            "PURCHASE_RECENCY_BUCKET",
            F.when(F.col("ORDERS_SINCE_LAST_PURCHASE") == 0, F.lit("recent"))
             .when(F.col("ORDERS_SINCE_LAST_PURCHASE") <= 3, F.lit("medium"))
             .otherwise(F.lit("old"))
        ),
        config=user_product_transform_config
    )
]


Available columns in user_product_features_df:
['USER_ID', 'PRODUCT_ID', 'ORDER_ID', 'ORDER_NUMBER', 'SYNTHETIC_DATE', 'ORDER_DOW', 'ORDER_HOUR_OF_DAY', 'REORDERED', 'UP_ORDERS', 'UP_AVG_CART_POSITION', 'ORDERS_SINCE_LAST_PURCHASE']



# 6. Create Feature Views


In [57]:
# Register the user feature view from its base DataFrame, transforms and
# config, attached to the USER entity, with statistics collection enabled.
user_feature_view = manager.add_feature_view(
    df=user_features_df,
    transforms=user_transforms,
    config=user_config,
    entity_name="USER",
    collect_stats=True,
)

Input value type doesn't match the target column data type, this replacement was skipped. Column Name: "AVG_BASKET_SIZE", Type: DecimalType(36, 6), Input Value: 0.0, Type: <class 'str'>


2025-03-03 17:31:53,939 - snowflake_feature_store - INFO - Validated feature USER_TOTAL_ORDERS (stats: {'timestamp': '2025-03-04T01:31:43.864602', 'row_count': 206209, 'null_count': 0, 'null_ratio': 0.0, 'unique_count': 97, 'min_value': 3.0, 'max_value': 99.0, 'mean_value': 15.590367, 'std_value': 16.65477348990373})
2025-03-03 17:32:03,939 - snowflake_feature_store - INFO - Validated feature AVG_DAYS_BETWEEN_ORDERS (stats: {'timestamp': '2025-03-04T01:31:54.383766', 'row_count': 206209, 'null_count': 0, 'null_ratio': 0.0, 'unique_count': 99570, 'min_value': 0.0, 'max_value': 30.0, 'mean_value': 15.469669692770578, 'std_value': 7.207435949657587})
2025-03-03 17:32:21,807 - snowflake_feature_store - INFO - Validated feature TYPICAL_ORDER_HOUR (stats: {'timestamp': '2025-03-04T01:32:08.452892', 'row_count': 206209, 'null_count': 0, 'null_ratio': 0.0, 'unique_count': 47, 'min_value': 0.0, 'max_value': 23.0, 'mean_value': 13.52991625, 'std_value': 2.849790670923568})
2025-03-03 17:32:36,39

  self._check_dynamic_table_refresh_mode(feature_view_name)


In [58]:
# Register the product feature view from its base DataFrame, transforms and
# config, attached to the PRODUCT entity, with statistics collection enabled.
product_feature_view = manager.add_feature_view(
    df=product_features_df,
    transforms=product_transforms,
    config=product_config,
    entity_name="PRODUCT",
    collect_stats=True,
)

2025-03-03 17:33:59,434 - snowflake_feature_store - INFO - Validated feature AISLE_ID (stats: {'timestamp': '2025-03-04T01:33:54.478943', 'row_count': 49677, 'null_count': 0, 'null_ratio': 0.0, 'unique_count': 134, 'min_value': 1.0, 'max_value': 134.0, 'mean_value': 67.769189, 'std_value': 38.317847251639805})
2025-03-03 17:34:04,542 - snowflake_feature_store - INFO - Validated feature DEPARTMENT_ID (stats: {'timestamp': '2025-03-04T01:33:59.753146', 'row_count': 49677, 'null_count': 0, 'null_ratio': 0.0, 'unique_count': 21, 'min_value': 1.0, 'max_value': 21.0, 'mean_value': 11.727802, 'std_value': 5.8503070004915125})
2025-03-03 17:34:14,571 - snowflake_feature_store - INFO - Validated feature PRODUCT_ORDERS (stats: {'timestamp': '2025-03-04T01:34:07.033387', 'row_count': 49677, 'null_count': 0, 'null_ratio': 0.0, 'unique_count': 4161, 'min_value': 1.0, 'max_value': 472565.0, 'mean_value': 652.907563, 'std_value': 4792.114415774002})
2025-03-03 17:34:18,909 - snowflake_feature_store -

  self._check_dynamic_table_refresh_mode(feature_view_name)


In [59]:
# Register the user-product feature view from its base DataFrame, transforms
# and config, attached to the USER_PRODUCT entity, with statistics collection enabled.
user_product_feature_view = manager.add_feature_view(
    df=user_product_features_df,
    transforms=user_product_transforms,
    config=user_product_config,
    entity_name="USER_PRODUCT",
    collect_stats=True,
)

2025-03-03 17:35:35,926 - snowflake_feature_store - INFO - Validated feature ORDER_DOW (stats: {'timestamp': '2025-03-04T01:35:12.678529', 'row_count': 13307953, 'null_count': 0, 'null_ratio': 0.0, 'unique_count': 7, 'min_value': 0.0, 'max_value': 6.0, 'mean_value': 2.753886, 'std_value': 2.0996980735334305})
2025-03-03 17:35:52,093 - snowflake_feature_store - INFO - Validated feature SYNTHETIC_DATE (stats: {'timestamp': '2025-03-04T01:35:44.005099', 'row_count': 13307953, 'null_count': 0, 'null_ratio': 0.0, 'unique_count': 7, 'min_value': None, 'max_value': None, 'mean_value': None, 'std_value': None})
2025-03-03 17:36:16,029 - snowflake_feature_store - INFO - Validated feature ORDER_HOUR_OF_DAY (stats: {'timestamp': '2025-03-04T01:35:52.431130', 'row_count': 13307953, 'null_count': 0, 'null_ratio': 0.0, 'unique_count': 24, 'min_value': 0.0, 'max_value': 23.0, 'mean_value': 13.514454, 'std_value': 4.226126358735621})
2025-03-03 17:36:38,680 - snowflake_feature_store - INFO - Validated

  self._check_dynamic_table_refresh_mode(feature_view_name)



# 7. Generate Training Dataset


In [56]:
# Training spine: one row per product in each 'train' order, carrying the
# entity keys, the order context and the REORDERED label.
spine_query = """
SELECT 
    o.USER_ID,
    op.PRODUCT_ID,
    o.ORDER_ID,
    o.ORDER_NUMBER,
    o.ORDER_DOW,
    o.ORDER_HOUR_OF_DAY,
    op.REORDERED
FROM INSTACART_RAW.ORDERS o
JOIN INSTACART_RAW.ORDER_PRODUCTS op ON o.ORDER_ID = op.ORDER_ID
WHERE o.EVAL_SET = 'train'
"""
spine_df = conn.session.sql(spine_query)

# Preview a handful of rows.
spine_df.show(5)

------------------------------------------------------------------------------------------------------------
|"USER_ID"  |"PRODUCT_ID"  |"ORDER_ID"  |"ORDER_NUMBER"  |"ORDER_DOW"  |"ORDER_HOUR_OF_DAY"  |"REORDERED"  |
------------------------------------------------------------------------------------------------------------
|112108     |49302         |1           |4               |4            |10                   |1            |
|112108     |11109         |1           |4               |4            |10                   |1            |
|112108     |10246         |1           |4               |4            |10                   |0            |
|112108     |49683         |1           |4               |4            |10                   |0            |
|112108     |43633         |1           |4               |4            |10                   |1            |
------------------------------------------------------------------------------------------------------------



In [57]:
# Join the spine with all three feature views, using REORDERED as the label.
#
# NOTE(review): the recorded run of this cell failed with
# "ambiguous column name 'ORDER_ID'". The spine and the user-product feature
# DataFrame both expose ORDER_ID, ORDER_NUMBER, ORDER_DOW, ORDER_HOUR_OF_DAY
# and REORDERED; presumably those names must be made unique on one side
# before this call can succeed — confirm against FeatureStoreManager.get_features.
training_feature_views = [user_config, product_config, user_product_config]
training_data = manager.get_features(
    feature_views=training_feature_views,
    spine_df=spine_df,
    label_cols=["REORDERED"],
    spine_timestamp_col=None,  # the spine has no timestamp column
)
training_data.show(5)

2025-02-28 17:46:13,023 - snowflake_feature_store - INFO - Spine DataFrame columns: ['USER_ID', 'PRODUCT_ID', 'ORDER_ID', 'ORDER_NUMBER', 'ORDER_DOW', 'ORDER_HOUR_OF_DAY', 'REORDERED']
2025-02-28 17:46:13,024 - snowflake_feature_store - INFO - Spine DataFrame schema: StructType([StructField('USER_ID', LongType(), nullable=True), StructField('PRODUCT_ID', LongType(), nullable=True), StructField('ORDER_ID', LongType(), nullable=True), StructField('ORDER_NUMBER', LongType(), nullable=True), StructField('ORDER_DOW', LongType(), nullable=True), StructField('ORDER_HOUR_OF_DAY', LongType(), nullable=True), StructField('REORDERED', LongType(), nullable=True)])
2025-02-28 17:46:19,091 - snowflake_feature_store - INFO - Generating dataset with name: DATASET_20250301_014619_d1812ae8
2025-02-28 17:46:19,092 - snowflake_feature_store - INFO - Label columns: ['"REORDERED"']
2025-02-28 17:46:19,093 - snowflake_feature_store - INFO - Timestamp column: None


FeatureStoreException: Error generating dataset: (1300) An error occurred during dataset generation: (1304): 01bab4ea-0004-a5f9-004d-de07052ae67a: 002028 (42601): SQL compilation error:
ambiguous column name 'ORDER_ID'.

In [43]:
# Print the column names of each base feature DataFrame, one line per frame.
for columns_label, feature_frame in (
    ("User feature columns:", user_features_df),
    ("Product feature columns:", product_features_df),
    ("User-Product feature columns:", user_product_features_df),
):
    print(columns_label, feature_frame.columns)

User feature columns: ['USER_ID', 'USER_TOTAL_ORDERS', 'AVG_DAYS_BETWEEN_ORDERS', 'TYPICAL_ORDER_HOUR', 'PREFERRED_ORDER_DAY', 'AVG_BASKET_SIZE', 'DISTINCT_PRODUCTS_COUNT', 'USER_REORDER_RATE']
Product feature columns: ['PRODUCT_ID', 'AISLE_ID', 'DEPARTMENT_ID', 'PRODUCT_ORDERS', 'PRODUCT_REORDERS', 'PRODUCT_REORDER_RATE', 'PRODUCT_AVG_CART_POSITION']
User-Product feature columns: ['USER_ID', 'PRODUCT_ID', 'ORDER_ID', 'ORDER_NUMBER', 'ORDER_DOW', 'ORDER_HOUR_OF_DAY', 'REORDERED', 'UP_ORDERS', 'UP_AVG_CART_POSITION', 'ORDERS_SINCE_LAST_PURCHASE']


In [None]:

# Add basket-level and user-level features on top of the retrieved training set.
#
# Fixes relative to the previous version of this cell:
#   * uses the `Window` class imported from snowflake.snowpark (as the product
#     transforms do) instead of `F.Window`;
#   * parenthesises the reorder ratio, because `.alias("REORDER_RATIO")` bound
#     only to the denominator and the quotient was left unnamed;
#   * calls DataFrame.with_columns with separate name and value lists, which
#     is the signature Snowpark expects.
order_window = Window.partition_by("ORDER_ID")
user_window = Window.partition_by("USER_ID")

# NOTE(review): REORDER_RATIO is derived from REORDERED, which is also the
# label column of the training set — confirm this is not label leakage.
basket_feature_columns = {
    "BASKET_SIZE": F.count("*").over(order_window),
    "REORDER_RATIO": (
        F.sum(F.col("REORDERED")).over(user_window)
        / F.count("*").over(user_window)
    ),
    "UNIQUE_AISLES": F.count_distinct("AISLE_ID").over(order_window),
    "UNIQUE_DEPARTMENTS": F.count_distinct("DEPARTMENT_ID").over(order_window),
    "DAY_PART": (
        F.when(F.col("ORDER_HOUR_OF_DAY") < 6, F.lit("night"))
         .when(F.col("ORDER_HOUR_OF_DAY") < 12, F.lit("morning"))
         .when(F.col("ORDER_HOUR_OF_DAY") < 18, F.lit("midday"))
         .otherwise(F.lit("evening"))
    ),
}

training_data = training_data.with_columns(
    list(basket_feature_columns.keys()),
    list(basket_feature_columns.values()),
)



# 8. Save the training dataset


In [None]:
# Persist the training set as a table, overwriting any existing table of that name.
training_table_name = "INSTACART_FEATURES.TRAINING_DATA"
training_writer = training_data.write.mode("overwrite")
training_writer.save_as_table(training_table_name)



# 9. (Optional) Monitor Feature Drift


In [37]:
# Create a simple monitoring function
def check_feature_drift(manager, feature_view_name, new_data):
    """Run the manager's drift check for one feature view and log the outcome.

    Args:
        manager: Object exposing ``check_feature_drift(feature_view_name, new_data)``.
        feature_view_name: Name of the feature view to check.
        new_data: Data handed to the manager's drift check unchanged.

    Returns:
        Whatever the manager's drift check returned. A truthy result is logged
        as a warning (one line per feature); a falsy result is logged as info.
    """
    drift_results = manager.check_feature_drift(
        new_data=new_data,
        feature_view_name=feature_view_name,
    )

    # Nothing reported: log it and hand the (falsy) result straight back.
    if not drift_results:
        logger.info(f"No significant drift detected in {feature_view_name}")
        return drift_results

    logger.warning(f"Drift detected in {feature_view_name}:")
    for feature, metrics in drift_results.items():
        logger.warning(f"  {feature}: {metrics}")
    return drift_results

In [None]:

# Example usage for monitoring
# check_feature_drift(manager, "user_features", new_user_data)



# 10. Define prediction function


In [None]:
def predict_next_order(user_id, model_path=None):
    """Return candidate products for a user's next order.

    Builds a spine of the distinct (USER_ID, PRODUCT_ID) pairs the user has
    ordered, retrieves the user, product and user-product features for it, and
    returns the ten rows with the highest PRODUCT_REORDER_RATE.

    Relies on the notebook-level ``conn``, ``manager`` and the three feature
    view configs.

    Args:
        user_id: USER_ID to look up; passed to Snowflake as a bind parameter.
        model_path: Placeholder for a trained model location. No model is
            loaded yet, so the value does not affect the result.

    Returns:
        The feature rows sorted by PRODUCT_REORDER_RATE descending, limited to 10.
    """
    # Bind user_id instead of interpolating it into the SQL text, and
    # de-duplicate the pairs so a product bought in several orders yields a
    # single spine row.
    user_products = conn.session.sql(
        """
    SELECT DISTINCT o.USER_ID, op.PRODUCT_ID
    FROM INSTACART_RAW.ORDER_PRODUCTS op
    JOIN INSTACART_RAW.ORDERS o ON op.ORDER_ID = o.ORDER_ID
    WHERE o.USER_ID = ?
    """,
        params=[user_id],
    )

    # Get full features for user-products. The separate user-only feature
    # retrieval that used to precede this was never read, so it was removed.
    user_product_features = manager.get_features(
        spine_df=user_products,
        feature_views=[user_config, product_config, user_product_config]
    )

    # Load model and predict (placeholder)
    if model_path:
        # In a real implementation, you would load your trained model
        # and make predictions here
        pass

    # For demo, just return products with highest reorder rate. NULL rates are
    # sorted last so they cannot crowd out products with a real rate.
    top_products = user_product_features.sort(
        F.col("PRODUCT_REORDER_RATE").desc_nulls_last()
    ).limit(10)

    return top_products