In [1]:
import argparse
import os
import timeit
from pprint import pprint
import joblib
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from xgboost.sklearn import XGBClassifier
from feast import FeatureStore

In [31]:
label_path = r"/home/hjhwang/workspace/tpcx-ai-v1.0.3.1/labels.parquet"
label_data = pd.read_parquet(label_path)

## Generating Training Data
- (1) Read data parquet file to pandas dataframe
- (2) Merge with label dataframe


In [33]:

train_label_data = label_data[(label_data['event_timestamp'] >= '2024-01-01') & (label_data['event_timestamp'] < '2024-05-01')]

In [34]:
start = timeit.default_timer()
data_path = r"/home/hjhwang/workspace/tpcx-ai-v1.0.3.1/data.parquet"
data = pd.read_parquet(data_path)
merged_data = pd.merge(data, train_label_data, on=['o_order_id','event_timestamp'], how='inner')[['o_order_id', 'trip_type', 'event_timestamp', 'scan_count',
       'scan_count_abs']]
end = timeit.default_timer()
pre_process_time = end - start
print('generating train data time:\t', pre_process_time)

generating train data time:	 2.106879997998476


In [35]:
merged_data.describe

<bound method NDFrame.describe of          o_order_id  trip_type                     event_timestamp  \
0                 3         33 2024-03-24 03:23:42.480527360+00:00   
1                 4         34 2024-01-31 21:04:35.499944448+00:00   
2                 5         24 2024-03-20 09:29:06.256833024+00:00   
3                 6          5 2024-02-20 23:07:58.610120704+00:00   
4                 7         26 2024-03-10 04:17:01.824889344+00:00   
...             ...        ...                                 ...   
2411603     3676948         31 2024-01-09 23:36:16.387957760+00:00   
2411604     3676950         34 2024-03-11 03:51:32.402438912+00:00   
2411605     3676951         15 2024-02-03 23:31:55.918401024+00:00   
2411606     3676954         29 2024-04-08 16:44:08.425506560+00:00   
2411607     3676955         35 2024-01-17 12:44:43.901174528+00:00   

         scan_count  scan_count_abs  
0                12              12  
1                48              48  
2          

## Generating Inference Data
- (1) Read data parquet file to pandas dataframe
- (2) Select rows in order id list


In [38]:
infer_label_data = label_data[(label_data['event_timestamp'] >= '2024-05-01') & (label_data['event_timestamp'] < '2024-05-31')]
data_path = r"/home/hjhwang/workspace/tpcx-ai-v1.0.3.1/data.parquet"
data = pd.read_parquet(data_path)
merged_data = pd.merge(data, infer_label_data, on=['o_order_id','event_timestamp'], how='inner')[['o_order_id', 'scan_count',
       'scan_count_abs']]

In [39]:
start = timeit.default_timer()

selected_ids = [10, 12, 26, 27, 32, 36, 50, 53, 55]
filtered_rows = merged_data[merged_data['o_order_id'].isin(selected_ids)] 
end = timeit.default_timer()
pre_process_time = end - start
print('generating inference data time:\t', pre_process_time)

generating inference data time:	 0.007800019346177578
