In [1]:
import argparse
import os
import timeit

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from xgboost.sklearn import XGBClassifier
from feast import FeatureStore

In [9]:
order_path = r"/home/hjhwang/workspace/tpcx-ai-v1.0.3.1/output/data/training/order.csv"
lineitem_path = r"/home/hjhwang/workspace/tpcx-ai-v1.0.3.1/output/data/training/lineitem.csv"
product_path = r"/home/hjhwang/workspace/tpcx-ai-v1.0.3.1/output/data/training/product.csv"


In [23]:
order_data = pd.read_csv(order_path, parse_dates=['date'])
lineitem_data = pd.read_csv(lineitem_path)
product_data = pd.read_csv(product_path)
product_data = product_data.rename(columns={"p_product_id":"product_id"})

In [24]:
product_data.head()


Unnamed: 0,product_id,name,department
0,1,AVgDcdD1za,FINANCIAL SERVICES
1,2,,FINANCIAL SERVICES
2,3,dGnp5zudx1L8kj,IMPULSE MERCHANDISE
3,4,BYY7U,FINANCIAL SERVICES
4,5,h,FINANCIAL SERVICES


## Generate Data
- Feature: lineitem(with date) + product
- Entity: Order

In [26]:
order_data_id_date = order_data[["o_order_id","date"]]
lineitem_data_with_date = lineitem_data.merge(order_data_id_date, left_on='li_order_id', right_on='o_order_id')
lineitem_data_with_date = lineitem_data_with_date.drop(columns=['o_order_id'])
lineitem_data_with_date= lineitem_data_with_date.rename(columns={'li_product_id': 'product_id',"li_order_id":"order_id"})
lineitem_data_with_date = lineitem_data_with_date.merge(product_data, left_on='product_id', right_on='product_id')
lineitem_data_with_date.head()


Unnamed: 0,order_id,product_id,quantity,price,date,name,department
0,1,94,2,2.91,2010-07-14,,PRODUCE
1,2,1,3,1.2,2010-09-14,AVgDcdD1za,FINANCIAL SERVICES
2,2,14,3,1.64,2010-09-14,PMgiH,FINANCIAL SERVICES
3,2,14,3,2.41,2010-09-14,PMgiH,FINANCIAL SERVICES
4,2,2,2,7.06,2010-09-14,,FINANCIAL SERVICES


In [27]:
lineitem_data_with_date.to_parquet("feature_li_pro.parquet")

In [32]:
feature_pq = feature_pq.dropna(subset= ["name","department"])
feature_pq.describe

<bound method NDFrame.describe of           order_id  product_id  quantity  price       date        name  \
1                2           1         3   1.20 2010-09-14  AVgDcdD1za   
2                2          14         3   1.64 2010-09-14       PMgiH   
3                2          14         3   2.41 2010-09-14       PMgiH   
5                2           8         2   2.55 2010-09-14    OMh0eCuz   
6                2           8         2   4.17 2010-09-14    OMh0eCuz   
...            ...         ...       ...    ...        ...         ...   
23026661   3676954         660         2   1.80 2010-12-23       yS2Ex   
23026662   3676954         655         3   6.13 2010-12-23   5ZGNrYGbI   
23026663   3676955         127         2   6.66 2011-11-28      yhLqA8   
23026664   3676955         123         5   5.17 2011-11-28       gDv3m   
23026665   3676955         115         2   6.21 2011-11-28   Tm2XCNBY0   

                   department  
1          FINANCIAL SERVICES  
2          FI

In [33]:

feature_pq["name"] = feature_pq["name"].astype(str)
feature_pq["department"]= feature_pq["department"].astype(str)
feature_pq.dtypes

order_id               int64
product_id             int64
quantity               int64
price                float64
date          datetime64[ns]
name                  object
department            object
dtype: object

In [34]:
feature_pq.to_parquet("feature_ineitem_order.parquet")

In [43]:
entity_pq = order_data 
order_data = order_data.rename(columns={"o_order_id":"order_id","date":"event_timestamp"})
order_data.dtypes
order_data.to_parquet("entity_order.parquet")

In [44]:
store = FeatureStore(repo_path="/home/hjhwang/workspace/tpcx-ai-v1.0.3.1/feast_uc08/feature_repo")

In [45]:
training_df = store.get_historical_features(
    entity_df=order_data,
    features=[
        "line_order_product:price",
    ],
).to_df()



In [46]:
training_df.head()


Unnamed: 0,order_id,o_customer_sk,weekday,event_timestamp,store,trip_type,price
0,1095370,47251,Monday,2010-01-04 00:00:00+00:00,6,5,5.03
1,924357,17972,Monday,2010-01-04 00:00:00+00:00,8,37,8.91
2,129378,56575,Monday,2010-01-04 00:00:00+00:00,5,999,7.69
3,718770,50135,Monday,2010-01-04 00:00:00+00:00,3,999,5.68
4,2294536,7386,Monday,2010-01-04 00:00:00+00:00,5,40,7.13


In [39]:
feature_df = pd.read_parquet("/home/hjhwang/workspace/tpcx-ai-v1.0.3.1/feature_li_pro.parquet")
feature_df = feature_df.rename(columns={"date":"event_timestamp"})
feature_df.dtypes

order_id                    int64
product_id                  int64
quantity                    int64
price                     float64
event_timestamp    datetime64[ns]
name                       object
department                 object
dtype: object

In [40]:
feature_df.to_parquet("feature_li_pro.parquet")