In [1]:
import numpy as np
import pandas as pd
import treelib
from pathlib import Path
from treelib import Node, Tree

# Raw Retail Rocket CSVs are read from DATA_DIR; derived artifacts are
# written to the `saved` sub-directory underneath it.
DATA_DIR = Path('../../data/retail-rocket')
EXPORT_DIR = DATA_DIR.joinpath('saved')

# Input files.
PATH_CATEGORY_TREE = DATA_DIR.joinpath('category_tree.csv')
PATH_EVENTS = DATA_DIR.joinpath('events.csv')
PATH_ITEM_PROPS1 = DATA_DIR.joinpath('item_properties_part1.csv')
PATH_ITEM_PROPS2 = DATA_DIR.joinpath('item_properties_part2.csv')

# Creating Category Features via Tree

The category tree is provided as a table of edges (`categoryid`, `parentid`). We want to be able to recover every level of the hierarchy above a given leaf node.

In [2]:
# Edge list of the category hierarchy: each row links a categoryid to its
# parentid. A missing parentid is treated as "top level" when the tree is built.
cat_tree_df = pd.read_csv(PATH_CATEGORY_TREE)

In [3]:
# Peek at the first rows to check the (categoryid, parentid) column layout.
cat_tree_df.head()

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0


In [4]:
# Build the category hierarchy from the edge list.
#
# Edges arrive in arbitrary order, so a parent may be referenced before its own
# edge has been seen. Such parents are parked under a temporary holding node and
# re-attached to their real parent once their edge shows up. At the end the
# holding node is linked past, which leaves anything still parked there
# directly under ROOT.
ROOT = 'cat_tree'
# Identifier of the temporary holding node.
# Assumes no real categoryid equals -1 -- TODO confirm against the data.
HOLDING = -1

tree = Tree()
tree.create_node(identifier=ROOT)
tree.create_node(identifier=HOLDING, parent=ROOT)

# Plain tuples, unpacked by column position: (categoryid, parentid).
for child_id, parent_id in cat_tree_df.itertuples(index=False, name=None):
    # A missing parent marks a top-level category.
    parent_id = ROOT if np.isnan(parent_id) else int(parent_id)
    child_id = int(child_id)

    # Parent not seen yet: park it until its own edge appears.
    if not tree.contains(parent_id):
        tree.create_node(identifier=parent_id, parent=HOLDING)

    if not tree.contains(child_id):
        tree.create_node(identifier=child_id, parent=parent_id)
    elif tree.get_node(child_id).bpointer == HOLDING:
        # The node was parked earlier as someone's parent; now we know where
        # it really belongs. Nodes that already have a real parent are left
        # untouched.
        # NOTE(review): `bpointer` is the older treelib attribute for the
        # parent id -- confirm it exists in the installed treelib version.
        tree.move_node(child_id, parent_id)

tree.link_past_node(HOLDING)

In [5]:
# Print the tree structure
# tree.show(line_type='ascii-em')

# Item Properties

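We are provided with many item properties, and their values can change over time. We will only be working with `categoryid`, and the intent is to use its latest record. Note that the cells below load only `itemid`, `property` and `value` (no timestamp) and drop exact duplicates, so an item whose category changed keeps one row per distinct category value.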

In [6]:
# The item properties ship as two CSV parts; read the same three columns from
# each part and stack them in part order.
item_prop_columns = ['itemid', 'property', 'value']
item_props_df = pd.concat([
    pd.read_csv(part_path, usecols=item_prop_columns)
    for part_path in (PATH_ITEM_PROPS1, PATH_ITEM_PROPS2)
])

In [7]:
# Keep only the `categoryid` property, indexed by item, as a compact integer.
# NOTE(review): exact duplicate rows are dropped, but nothing here selects the
# latest record per item (no timestamp column is loaded). An item with several
# distinct categoryid values keeps one row per value -- confirm this is intended.
is_category_row = item_props_df['property'] == 'categoryid'
item_props_df = (
    item_props_df.loc[is_category_row]
    .drop_duplicates()
    .drop(columns='property')
    .set_index('itemid')
    .rename(columns={'value': 'categoryid'})
    .astype({'categoryid': np.uint16})
)

In [8]:
def get_cats(categoryid):
    """Return the category path from the top level down to `categoryid`.

    The path is read from the module-level `tree`. Its synthetic root is left
    out, so the first element is the top-level ancestor and the last element is
    `categoryid` itself. An id that is not present in `tree` gives `[]`.
    """
    # Not memoized; could be, if the repeated lookups ever become a bottleneck.
    try:
        # rsearch walks upwards, from the node itself to the root.
        leaf_to_root = list(tree.rsearch(categoryid))
    except treelib.exceptions.NodeIDAbsentError:
        return []
    leaf_to_root.reverse()
    # Element 0 is now the root; drop it.
    return leaf_to_root[1:]

In [9]:
# Expand every item's categoryid into its top-level-to-leaf path, one column
# per level. Shorter paths (and ids unknown to the tree, which map to an empty
# path) are padded with NaN on the right.
category_paths = item_props_df['categoryid'].map(get_cats).tolist()
item_categories_df = pd.DataFrame(category_paths, index=item_props_df.index)

# Name the level columns from the actual width of the frame rather than a
# hard-coded depth of 6, which raised a length-mismatch error whenever the
# deepest path in the data was not exactly six levels.
item_categories_df.columns = [
    f'categoryid_lvl{level}' for level in range(item_categories_df.shape[1])
]

# Turn the `itemid` index back into a regular first column.
item_categories_df = item_categories_df.reset_index()

In [10]:
# TODO: lvl3-5 are mostly NaN, probably want to chop them off before exporting.

# Make sure the export directory exists; otherwise the write below fails on a
# fresh checkout where `saved/` has not been created yet.
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): DataFrame.to_msgpack was deprecated in pandas 0.25 and removed
# in pandas 1.0, so this cell needs pandas < 1.0. The format is kept here so
# existing readers of `item_categories.msg` keep working; migrate writer and
# readers together (e.g. to parquet) when upgrading pandas.
item_categories_df.to_msgpack(EXPORT_DIR / 'item_categories.msg')

In [11]:
# Peek at the result: `itemid` followed by one `categoryid_lvl*` column per level.
item_categories_df.head()

Unnamed: 0,itemid,categoryid_lvl0,categoryid_lvl1,categoryid_lvl2,categoryid_lvl3,categoryid_lvl4,categoryid_lvl5
0,460429,395.0,1278.0,1338.0,,,
1,281245,653.0,312.0,1277.0,,,
2,35575,378.0,1696.0,1059.0,,,
3,8313,250.0,1027.0,1147.0,,,
4,55102,1482.0,381.0,47.0,,,


# Events

Pre-split our event facts into a training part (events before `HOLDOUT_DATE`) and a holdout part (events on or after it).

In [12]:
# Cut-off for the time-based split: events with a timestamp strictly before
# this date are exported as `events_tsplit.msg`, events on or after it as
# `events_vsplit.msg`.
HOLDOUT_DATE = '2015-09-01'

In [13]:
# Load the event log and turn the epoch-millisecond `timestamp` column into
# proper datetimes (the column keeps its position in the frame).
event_columns = ['timestamp', 'visitorid', 'event', 'itemid']
events_df = pd.read_csv(PATH_EVENTS, usecols=event_columns).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'], unit='ms')
)

In [14]:
# Save the full, unsplit event log.
# NOTE(review): DataFrame.to_msgpack was deprecated in pandas 0.25 and removed
# in pandas 1.0, so this cell needs pandas < 1.0.
events_df.to_msgpack(EXPORT_DIR / 'events.msg')

In [15]:
# Time-based split around HOLDOUT_DATE: strictly-before rows are written to
# `events_tsplit.msg`, on-or-after rows to `events_vsplit.msg`. The two masks
# are computed independently, so a row with a missing timestamp lands in
# neither file.
before_holdout = events_df['timestamp'] < HOLDOUT_DATE
from_holdout = events_df['timestamp'] >= HOLDOUT_DATE

for split_mask, split_filename in (
    (before_holdout, 'events_tsplit.msg'),
    (from_holdout, 'events_vsplit.msg'),
):
    events_df.loc[split_mask].to_msgpack(EXPORT_DIR / split_filename)