In [25]:
import argparse
import os
import timeit
import time
from pprint import pprint
import joblib
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from datetime import datetime
from xgboost.sklearn import XGBClassifier
from feast import FeatureStore

In [5]:
# Load the raw user/product rating events from the parquet source file and
# preview the first five rows.
raw_data = pd.read_parquet(
    path="/home/hjhwang/workspace/tpcx-ai-v1.0.3.1/user_source.parquet"
)
raw_data.head(n=5)

Unnamed: 0,user_id,product_id,rating,event_timestamp,user_age,user_location
0,0,1647,4,2024-06-28 08:28:51,22,Busan
1,0,1505,5,2024-07-03 09:46:09,30,Busan
2,0,1608,5,2024-04-21 08:05:01,25,Seoul
3,0,1785,4,2024-01-13 04:16:34,25,Gwangju
4,0,1505,5,2024-02-22 13:48:20,22,Daegu


In [10]:
# Number of rows per user, most frequent users first.
raw_data["user_id"].value_counts(sort=True, ascending=False)

user_id
3395    28
5231    28
1043    28
6911    28
2565    28
        ..
6742     1
3963     1
1775     1
496      1
4586     1
Name: count, Length: 7071, dtype: int64

## Deploy the local feature store

In [20]:
# Run `feast apply` from inside the feature repo directory; the output below
# reports which feature views / data sources were updated.
!cd /home/hjhwang/workspace/tpcx-ai-v1.0.3.1/feast_uc07/feature_repo && feast apply

Updated feature view [1m[33muser_product_rating_fv[0m
	batch_source: [1m[33mtype: BATCH_FILE
timestamp_field: "event_timestamp"
created_timestamp_column: "created"
file_options {
  uri: "/home/hjhwang/workspace/tpcx-ai-v1.0.3.1/user_source.parquet"
}
data_source_class_type: "feast.infra.offline_stores.file_source.FileSource"
name: "user_source"
[0m -> [1m[92mtype: BATCH_FILE
timestamp_field: "event_timestamp"
file_options {
  uri: "/home/hjhwang/workspace/tpcx-ai-v1.0.3.1/user_source.parquet"
}
data_source_class_type: "feast.infra.offline_stores.file_source.FileSource"
name: "user_source"
[0m

[1m[94mNo changes to infrastructure


## Generate training data

In [21]:
# Open the Feast feature repository (the same directory `feast apply` is run in).
store = FeatureStore(
    repo_path="/home/hjhwang/workspace/tpcx-ai-v1.0.3.1/feast_uc07/feature_repo",
)

In [22]:
# Build the entity dataframe for the historical (point-in-time) feature lookup
# and fetch the matching features as the training set.
#
# Each row is one (user_id, event_timestamp) pair. Two synthetic columns
# (`recent_viewed_product_id`, `cart_size`) are added with random values; the
# generator is seeded so that a fresh "Restart & Run All" reproduces the same
# entity dataframe.
ENTITY_SEED = 42  # fixed seed for the synthetic columns below

entity_user_ids = [
    52, 93, 106, 435, 496, 533, 577, 798, 829, 1137,
    1147, 1185, 1261, 1270, 1412, 1512, 1775, 1783, 2043, 2100,
    2666, 2726, 2926, 3139, 3181, 3436, 3458, 3478, 3674, 3714,
    3792, 3963, 4002, 4016, 4086, 4087, 4136, 4333, 4446, 4508,
    4532, 4536, 4549, 4586, 4803, 4871, 4928, 4953, 5282, 5640,
    5765, 5776, 5815, 6129, 6261, 6392, 6742, 6869, 6909, 6933,
]

# One timestamp per user id above, in the same order.
entity_event_timestamps = [
    datetime(2024, 4, 12, 12, 59, 41), datetime(2024, 6, 5, 3, 57, 52), datetime(2024, 6, 7, 20, 14, 55),
    datetime(2024, 3, 24, 2, 49, 18), datetime(2024, 1, 9, 2, 57, 16), datetime(2024, 4, 11, 10, 49, 19),
    datetime(2024, 3, 18, 2, 30, 43), datetime(2024, 5, 21, 8, 52, 17), datetime(2024, 1, 31, 23, 3, 21),
    datetime(2024, 3, 8, 12, 29, 24), datetime(2024, 6, 4, 12, 46, 43), datetime(2024, 6, 1, 14, 24, 44),
    datetime(2024, 5, 4, 9, 23, 57), datetime(2024, 2, 22, 2, 13, 34), datetime(2024, 1, 18, 20, 59, 42),
    datetime(2024, 1, 28, 17, 14, 57), datetime(2024, 6, 28, 3, 18, 25), datetime(2024, 4, 12, 12, 8, 20),
    datetime(2024, 5, 29, 20, 47, 49), datetime(2024, 2, 1, 19, 48, 36), datetime(2024, 5, 19, 18, 2, 23),
    datetime(2024, 1, 13, 22, 1, 33), datetime(2024, 3, 4, 0, 39, 26), datetime(2024, 6, 25, 21, 12, 28),
    datetime(2024, 4, 15, 17, 59, 3), datetime(2024, 6, 7, 0, 31, 5), datetime(2024, 2, 13, 15, 24, 2),
    datetime(2024, 2, 24, 10, 33, 19), datetime(2024, 3, 14, 23, 27, 8), datetime(2024, 4, 12, 7, 15, 18),
    datetime(2024, 6, 28, 21, 13, 19), datetime(2024, 1, 21, 9, 10, 46), datetime(2024, 3, 30, 9, 17, 44),
    datetime(2024, 5, 28, 0, 32, 11), datetime(2024, 1, 28, 6, 13, 56), datetime(2024, 1, 28, 21, 39, 18),
    datetime(2024, 6, 15, 18, 18, 51), datetime(2024, 4, 9, 10, 57, 30), datetime(2024, 7, 4, 11, 23, 35),
    datetime(2024, 5, 24, 16, 3, 37), datetime(2024, 3, 1, 6, 57, 55), datetime(2024, 5, 15, 22, 10, 40),
    datetime(2024, 4, 12, 0, 28, 20), datetime(2024, 4, 4, 16, 29, 30), datetime(2024, 1, 8, 2, 51, 53),
    datetime(2024, 5, 17, 10, 8, 41), datetime(2024, 4, 3, 8, 45, 35), datetime(2024, 4, 6, 8, 43, 12),
    datetime(2024, 3, 28, 1, 53, 59), datetime(2024, 1, 7, 8, 49, 11), datetime(2024, 3, 9, 19, 57, 30),
    datetime(2024, 6, 1, 21, 1, 7), datetime(2024, 1, 5, 3, 15, 2), datetime(2024, 1, 26, 9, 25, 39),
    datetime(2024, 5, 2, 7, 42, 39), datetime(2024, 4, 19, 11, 21, 56), datetime(2024, 4, 5, 0, 12, 14),
    datetime(2024, 1, 8, 2, 37, 52), datetime(2024, 4, 25, 4, 30, 37), datetime(2024, 1, 2, 7, 3, 0),
]

# Fail loudly if the two hand-maintained lists drift apart instead of relying
# on a hard-coded row count.
if len(entity_user_ids) != len(entity_event_timestamps):
    raise ValueError(
        "entity_user_ids and entity_event_timestamps must have the same length: "
        f"{len(entity_user_ids)} != {len(entity_event_timestamps)}"
    )

n_entities = len(entity_user_ids)
entity_rng = np.random.default_rng(ENTITY_SEED)

entity_df = pd.DataFrame.from_dict(
    {
        "user_id": entity_user_ids,
        "event_timestamp": entity_event_timestamps,
        # Synthetic values: integers in [1, 1000) and [1, 10) respectively
        # (upper bound exclusive, same ranges as before).
        "recent_viewed_product_id": entity_rng.integers(1, 1000, size=n_entities),
        "cart_size": entity_rng.integers(1, 10, size=n_entities),
    }
)

# Retrieve the historical feature values for every entity row as a DataFrame.
training_df = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "user_product_rating_fv:product_id",
        "user_product_rating_fv:rating",
        "user_product_rating_fv:user_age",
        "user_product_rating_fv:user_location",
    ],
).to_df()



In [28]:
# List the user ids present in the retrieved training frame, in row order.
training_df["user_id"].tolist()

[6933,
 5815,
 5640,
 6869,
 4803,
 496,
 2726,
 1412,
 3963,
 6129,
 4086,
 1512,
 4087,
 829,
 2100,
 3458,
 1270,
 3478,
 4532,
 2926,
 1137,
 5765,
 3674,
 577,
 435,
 5282,
 4002,
 4928,
 4586,
 6742,
 4953,
 4333,
 533,
 4549,
 3714,
 1783,
 52,
 3181,
 6392,
 6909,
 6261,
 1261,
 4536,
 4871,
 2666,
 798,
 4508,
 4016,
 2043,
 1185,
 5776,
 1147,
 93,
 3436,
 106,
 4136,
 3139,
 1775,
 3792,
 4446]

## Ingest batch features into your online store

In [24]:
# Run `feast materialize <start> <end>` from the feature repo directory for the
# window 2024-05-01T00:00:00 to 2024-05-31T00:00:00 (UTC offsets given
# explicitly); the output below shows the rows written to the online store.
!cd /home/hjhwang/workspace/tpcx-ai-v1.0.3.1/feast_uc07/feature_repo && feast materialize 2024-05-01T00:00:00+00:00 2024-05-31T00:00:00+00:00

Materializing [1m[32m1[0m feature views from [1m[32m2024-05-01 09:00:00+09:00[0m to [1m[32m2024-05-31 09:00:00+09:00[0m into the [1m[32msqlite[0m online store.

[1m[32muser_product_rating_fv[0m:
100%|████████████████████████████████████████████████████████| 6372/6372 [00:00<00:00, 13053.10it/s]


In [30]:
# Time one online-store lookup for a small batch of users and report the
# elapsed wall-clock seconds.
start = timeit.default_timer()

online_feature_refs = [
    "user_product_rating_fv:product_id",
    "user_product_rating_fv:rating",
    "user_product_rating_fv:user_age",
    "user_product_rating_fv:user_location",
]
online_entity_rows = [
    {"user_id": user_id}
    for user_id in (6933, 5815, 6869, 4803, 496, 2726, 1412, 3963, 6129)
]

online_response = store.get_online_features(
    features=online_feature_refs,
    entity_rows=online_entity_rows,
)
# Convert the response into a plain dict of column name -> list of values.
feature_vector = online_response.to_dict()

end = timeit.default_timer()
pre_process_time = end - start
print('generating inference data time:\t', pre_process_time)



generating inference data time:	 0.0027800695970654488
