In [20]:
import argparse
import os
import timeit
from pprint import pprint
import joblib
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from xgboost.sklearn import XGBClassifier
from feast import FeatureStore

## Offline Store Use Case 1: Generating Training Data
- Feature: lineitem(with date) + product
- Entity: Order

In [3]:
store = FeatureStore(repo_path="/home/hjhwang/workspace/tpcx-ai-v1.0.3.1/feast_uc08/feature_repo")
order_path = r"/home/hjhwang/workspace/tpcx-ai-v1.0.3.1/entity_order.parquet"
order_data = pd.read_parquet(order_path)

In [4]:
training_entity_df = order_data[:1000000]
training_entity_df.describe

<bound method NDFrame.describe of         order_id  o_customer_sk    weekday event_timestamp  store  trip_type
0              1          37731  Wednesday      2010-07-14     11          8
1              2          43252    Tuesday      2010-09-14      3          3
2              3          19823   Saturday      2010-09-18     10          7
3              4          34924  Wednesday      2010-08-11      4          8
4              5          42024  Wednesday      2011-04-06      4         39
...          ...            ...        ...             ...    ...        ...
999995    999996          56800    Tuesday      2010-03-16      8         39
999996    999997          65366  Wednesday      2011-08-31      5         28
999997    999998          36147    Tuesday      2011-05-17      2          9
999998    999999          14167  Wednesday      2011-11-16      1         39
999999   1000000          49495     Monday      2011-09-19      3         39

[1000000 rows x 6 columns]>

In [5]:
order_data["event_timestamp"].value_counts()

event_timestamp
2010-03-03    14712
2011-03-02    14706
2011-09-14    13662
2010-08-04    13529
2010-09-15    13515
              ...  
2011-09-25      824
2010-05-02      815
2011-06-05      815
2011-10-23      803
2010-10-10      802
Name: count, Length: 728, dtype: int64

In [6]:
training_df = store.get_historical_features(
    entity_df=training_entity_df,
    features=[
        "line_order_product:price",
    ],
).to_df()



In [46]:
training_df.head()


Unnamed: 0,order_id,o_customer_sk,weekday,event_timestamp,store,trip_type,price
0,1095370,47251,Monday,2010-01-04 00:00:00+00:00,6,5,5.03
1,924357,17972,Monday,2010-01-04 00:00:00+00:00,8,37,8.91
2,129378,56575,Monday,2010-01-04 00:00:00+00:00,5,999,7.69
3,718770,50135,Monday,2010-01-04 00:00:00+00:00,3,999,5.68
4,2294536,7386,Monday,2010-01-04 00:00:00+00:00,5,40,7.13


In [23]:
training_df["event_timestamp"].value_counts()

event_timestamp
2010-03-03 00:00:00+00:00    4034
2011-03-02 00:00:00+00:00    3950
2010-09-15 00:00:00+00:00    3728
2010-08-04 00:00:00+00:00    3691
2011-09-14 00:00:00+00:00    3684
                             ... 
2010-06-06 00:00:00+00:00     218
2011-07-10 00:00:00+00:00     216
2010-05-30 00:00:00+00:00     216
2011-10-23 00:00:00+00:00     214
2011-10-30 00:00:00+00:00     201
Name: count, Length: 728, dtype: int64

## Offline Store Use Case 2: Run Offline Inference
- Entity: 2010-10-10 에 발생한 구매 데이터

In [19]:
!cd /home/hjhwang/workspace/tpcx-ai-v1.0.3.1/feast_uc08/feature_repo && feast materialize 2010-01-03T00:00:00+00:00 2010-01-05T00:00:00+00:00

Materializing [1m[32m1[0m feature views from [1m[32m2010-01-03 09:00:00+09:00[0m to [1m[32m2010-01-05 09:00:00+09:00[0m into the [1m[32msqlite[0m online store.

[1m[32mline_order_product[0m:
100%|█████████████████████████████████████████████████████████| 3980/3980 [00:00<00:00, 9585.87it/s]


In [23]:
feature_vector = store.get_online_features(
    features=[
        "line_order_product:price",
    ],
    entity_rows=[
        {"order_id":866815}
    ]
).to_dict()

print(feature_vector)



{'order_id': [866815], 'price': [6.36]}
