In [2]:
import argparse
import os
import timeit

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from xgboost.sklearn import XGBClassifier

In [3]:
# Department names that become one feature column each (order matters: it fixes
# the column order of the feature matrix).
department_columns = [
    "FINANCIAL SERVICES",
    "SHOES",
    "PERSONAL CARE",
    "PAINT AND ACCESSORIES",
    "DSD GROCERY",
    "MEAT - FRESH & FROZEN",
    "DAIRY",
    "PETS AND SUPPLIES",
    "HOUSEHOLD CHEMICALS/SUPP",
    "IMPULSE MERCHANDISE",
    "PRODUCE",
    "CANDY, TOBACCO, COOKIES",
    "GROCERY DRY GOODS",
    "BOYS WEAR",
    "FABRICS AND CRAFTS",
    "JEWELRY AND SUNGLASSES",
    "MENS WEAR",
    "ACCESSORIES",
    "HOME MANAGEMENT",
    "FROZEN FOODS",
    "SERVICE DELI",
    "INFANT CONSUMABLE HARDLINES",
    "PRE PACKED DELI",
    "COOK AND DINE",
    "PHARMACY OTC",
    "LADIESWEAR",
    "COMM BREAD",
    "BAKERY",
    "HOUSEHOLD PAPER GOODS",
    "CELEBRATION",
    "HARDWARE",
    "BEAUTY",
    "AUTOMOTIVE",
    "BOOKS AND MAGAZINES",
    "SEAFOOD",
    "OFFICE SUPPLIES",
    "LAWN AND GARDEN",
    "SHEER HOSIERY",
    "WIRELESS",
    "BEDDING",
    "BATH AND SHOWER",
    "HORTICULTURE AND ACCESS",
    "HOME DECOR",
    "TOYS",
    "INFANT APPAREL",
    "LADIES SOCKS",
    "PLUS AND MATERNITY",
    "ELECTRONICS",
    "GIRLS WEAR, 4-6X  AND 7-14",
    "BRAS & SHAPEWEAR",
    "LIQUOR,WINE,BEER",
    "SLEEPWEAR/FOUNDATIONS",
    "CAMERAS AND SUPPLIES",
    "SPORTING GOODS",
    "PLAYERS AND ELECTRONICS",
    "PHARMACY RX",
    "MENSWEAR",
    "OPTICAL - FRAMES",
    "SWIMWEAR/OUTERWEAR",
    "OTHER DEPARTMENTS",
    "MEDIA AND GAMING",
    "FURNITURE",
    "OPTICAL - LENSES",
    "SEASONAL",
    "LARGE HOUSEHOLD GOODS",
    "1-HR PHOTO",
    "CONCEPT STORES",
    "HEALTH AND BEAUTY AIDS",
]

# One indicator column per day of the week.
weekday_columns = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Full, ordered list of model input columns.
featureColumns = ["scan_count", "scan_count_abs", *weekday_columns, *department_columns]

# Name of the target column in the order data.
label_column = "trip_type"

# deleted label 14, since only 4 samples existed in the sample data set
label_range = [*range(3, 10), 12, 15, *range(18, 45), 999]

# Labels are ordered by their string form (so 12 sorts before 3); the position
# in this ordering is the class index used by the model.
sorted_labels = list(label_range)
sorted_labels.sort(key=str)
label_to_index = dict(zip(sorted_labels, range(len(sorted_labels))))

In [4]:
# Root of the TPCx-AI output tree. It defaults to the location this notebook was
# originally run from; set the TPCXAI_OUTPUT_DIR environment variable to run the
# notebook against a different checkout without editing this cell.
OUTPUT_ROOT = os.environ.get("TPCXAI_OUTPUT_DIR", r"/home/hjhwang/workspace/tpcx-ai-v1.0.3.1/output")

# Dataset Path
TRAINING_DIR = os.path.join(OUTPUT_ROOT, "data", "training")
order_path = os.path.join(TRAINING_DIR, "order.csv")
lineitem_path = os.path.join(TRAINING_DIR, "lineitem.csv")
product_path = os.path.join(TRAINING_DIR, "product.csv")


# Output Path
work_dir = os.path.join(OUTPUT_ROOT, "model", "uc08")
# Predictions are written next to the model, so both names point at the same directory.
output = work_dir

## Data Acquisition - Load Data

In [5]:
def load_data(order_path: str, lineitem_path: str, product_path: str) -> pd.DataFrame:
    """Read the order, line-item and product CSV files and join them.

    Orders are joined to line items on ``o_order_id == li_order_id`` and the
    result is joined to products on ``li_product_id == p_product_id`` (pandas'
    default inner join in both cases). The ``date`` column of the order file is
    parsed as a datetime.

    Returns a frame with the columns ``o_order_id``, ``date``, ``department``
    and ``quantity``, plus ``trip_type`` when the joined data contains it.
    """
    orders = pd.read_csv(order_path, parse_dates=['date'])
    line_items = pd.read_csv(lineitem_path)
    products = pd.read_csv(product_path)

    joined = (
        orders
        .merge(line_items, left_on='o_order_id', right_on='li_order_id')
        .merge(products, left_on='li_product_id', right_on='p_product_id')
    )

    selected = ['o_order_id', 'date', 'department', 'quantity']
    # The label column is only present in training data.
    if 'trip_type' in joined.columns:
        selected.append('trip_type')
    return joined[selected]

In [6]:
# Load and join the three CSV files, measuring the elapsed wall-clock time.
start = timeit.default_timer()
raw_data = load_data(order_path, lineitem_path, product_path)
end = timeit.default_timer()
# timeit.default_timer() reports seconds, so this is the load duration in seconds.
load_time = end - start
print('load time:\t', load_time)

load time:	 8.572427896782756


In [12]:
# Write the joined frame to "raw_data.csv"; the path is relative, so the file
# lands in the kernel's current working directory. The DataFrame index is
# written as the first column because to_csv() is called with its defaults.
# NOTE(review): presumably a debugging snapshot, nothing below reads this file — confirm it is still needed.
raw_data.to_csv("raw_data.csv")


In [16]:
# Show the first five joined rows as plain text.
print(raw_data.head(5))

   o_order_id       date    department  quantity  trip_type
0           1 2010-07-14  SERVICE DELI         2          8
1          15 2011-11-19  SERVICE DELI         2          8
2          15 2011-11-19  SERVICE DELI         1          8
3          15 2011-11-19  SERVICE DELI         2          8
4          36 2011-10-12  SERVICE DELI         3          8


## Data Acquisition - Preprocess Data

In [7]:
def encode_label(label):
    """Translate a trip-type label into its class index via ``label_to_index``.

    Raises ``KeyError`` when ``label`` is not a key of ``label_to_index``.
    """
    class_index = label_to_index[label]
    return class_index

def pre_process(raw_data: pd.DataFrame) -> "tuple[np.ndarray | None, pd.DataFrame]":
    """Turn joined line items into one feature row per order.

    ``raw_data`` needs the columns ``o_order_id``, ``date`` (datetime),
    ``department`` and ``quantity``; the label column (``label_column``) is
    optional. The input frame is not modified.

    The resulting features, indexed by ``o_order_id``, are:

    * ``scan_count``      - sum of ``quantity`` over the order,
    * ``scan_count_abs``  - sum of ``abs(quantity)`` over the order,
    * one column per entry of ``weekday_columns`` - 1.0 if the order has a line
      item dated on that weekday, else 0.0,
    * one column per entry of ``department_columns`` - summed ``quantity`` of
      the order in that department (0.0 when there is none).

    Returns ``(labels, features)``. ``features`` has exactly the columns in
    ``featureColumns``, in that order. ``labels`` is ``None`` when the input
    has no label column; otherwise it is the array of class indices
    (``encode_label``) with orders of trip type 14 removed from both results.

    Raises ``KeyError`` (from ``encode_label``) for a trip type that is not in
    ``label_to_index``.
    """
    # check if this data needs labeling(has trip type label)
    has_labels = label_column in raw_data.columns

    # assign() builds a new frame, so the caller's DataFrame is not given extra
    # columns as a side effect of calling this function.
    line_items = raw_data.assign(
        scan_count=raw_data['quantity'],
        scan_count_abs=raw_data['quantity'].abs(),
        weekday=raw_data['date'].dt.day_name(),
    )

    # Built-in aggregations ('sum', 'min', 'count') are used instead of Python
    # callbacks; they produce the same values without a per-group Python call.
    order_aggregations = {
        'scan_count': ('scan_count', 'sum'),
        'scan_count_abs': ('scan_count_abs', 'sum'),
    }
    if has_labels:
        # Every line item of an order carries the order's label; min() collapses them to one value.
        order_aggregations[label_column] = (label_column, 'min')
    features_scan_count: pd.DataFrame = line_items.groupby('o_order_id').agg(**order_aggregations)

    # Weekday indicator: 1.0 where the order has at least one counted line item
    # on that weekday. reindex() adds weekdays that never occur as all-zero columns.
    weekdays = (
        line_items
        .pivot_table(index='o_order_id', columns='weekday', values='scan_count', aggfunc='count')
        .gt(0)
        .astype(float)
        .reindex(columns=weekday_columns, fill_value=0.0)
    )

    # Quantity per department; departments absent from the data become zero columns.
    # Combinations that do not occur stay NaN here and are zero-filled below.
    departments = (
        line_items
        .pivot_table(index='o_order_id', columns='department', values='scan_count', aggfunc='sum')
        .reindex(columns=department_columns, fill_value=0.0)
    )

    final_data: pd.DataFrame = features_scan_count \
        .join(weekdays) \
        .join(departments) \
        .fillna(0.0)

    if not has_labels:
        return None, final_data[featureColumns]

    # remove tiny classes
    final_data = final_data[final_data[label_column] != 14]
    # Encode into a separate Series rather than assigning into the filtered
    # slice, which would trigger pandas' SettingWithCopyWarning.
    encoded_labels = final_data[label_column].apply(encode_label)
    return encoded_labels.values.ravel(), final_data[featureColumns]

In [8]:
# Build the per-order feature matrix and measure how long it takes.
start = timeit.default_timer()
# pre_process returns (labels, features); labels is None when raw_data has no label column.
(labels, data) = pre_process(raw_data)
end = timeit.default_timer()
# Elapsed time in seconds.
pre_process_time = end - start
print('pre-process time:\t', pre_process_time)

pre-process time:	 697.7915676683187


In [9]:
# Write the feature matrix (including its index) to ".data.csv" in the kernel's
# current working directory.
# NOTE(review): the leading dot makes this a hidden file on POSIX systems — confirm
# the name is intentional and not a typo for "data.csv".
data.to_csv(".data.csv")

## Training

In [10]:
def train(training_data: pd.DataFrame, labels, num_rounds):
    """Fit an XGBoost multi-class classifier on the feature columns.

    Only the columns listed in ``featureColumns`` are used; they are converted
    to a SciPy CSR sparse matrix before fitting. ``num_rounds`` is passed to
    the classifier as ``n_estimators``.

    Returns the value of ``XGBClassifier.fit``, i.e. the fitted model.
    """
    sparse_features = csr_matrix(training_data[featureColumns])
    classifier = XGBClassifier(
        tree_method='hist',
        objective='multi:softprob',
        n_estimators=num_rounds,
    )
    return classifier.fit(sparse_features, labels)


In [11]:
# Number of boosting rounds; train() passes it to XGBClassifier as n_estimators.
num_rounds = 100
# File name (inside work_dir) the fitted model is stored under. The earlier
# placeholder value 'uc08.python.model' was overwritten before it was ever
# read, so only the effective name is kept.
model_file_name = 'uc08.python.model.run-u08-on-nb'

# Time the model fit only.
start = timeit.default_timer()
model = train(data, labels, num_rounds)
end = timeit.default_timer()
train_time = end - start
print('train time:\t', train_time)

# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs(work_dir, exist_ok=True)
joblib.dump(model, os.path.join(work_dir, model_file_name))

train time:	 2433.700179751031


['/home/hjhwang/workspace/tpcx-ai-v1.0.3.1/output/model/uc08/uc08.python.model.run-u08-on-nb']

## Serving

In [13]:
def decode_label(label):
    """Translate a class index back into its trip-type label.

    ``label`` is used as a position in ``sorted_labels``.
    """
    trip_type = sorted_labels[label]
    return trip_type

def serve(model, data: pd.DataFrame) -> pd.DataFrame:
    """Predict the trip type for every row of ``data``.

    ``data`` must contain the columns in ``featureColumns`` and is expected to
    be indexed by order id. ``model`` needs a ``predict`` method that returns
    integer class indices (positions in ``sorted_labels``).

    Returns a frame with the columns ``o_order_id`` (taken from ``data.index``)
    and ``trip_type`` (the decoded label).
    """
    # Select the feature columns explicitly, exactly as train() does, so the
    # column order seen by the model does not depend on the caller's frame.
    sparse_data = csr_matrix(data[featureColumns])
    predictions = np.asarray(model.predict(sparse_data))

    # Decode all class indices with one array lookup. Unlike np.vectorize,
    # this also works when there are no rows to predict.
    label_lookup = np.asarray(sorted_labels)
    predictions_df = pd.DataFrame({'o_order_id': data.index, 'trip_type': label_lookup[predictions]})
    return predictions_df

In [14]:
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
model = joblib.load(os.path.join(work_dir, model_file_name))

# Time the prediction step only (model loading and CSV writing are excluded).
start = timeit.default_timer()
predictions = serve(model, data)
end = timeit.default_timer()
serve_time = end - start

# serve() already fills 'o_order_id' from data.index, so it is not reassigned here.
predictions.to_csv(os.path.join(output, f'{model_file_name}_predictions.csv'), index=False)

print('serve time:\t', serve_time)

serve time:	 78.48762742616236
