# Preprocess Publiq
This proprietary dataset is already preprocessed: this notebook only keeps users with more than one interaction and splits the data into train, validation, and test sets.

In [1]:
import teaser.util as util
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# --- Input locations (raw export) ---
DATA_DIR = Path('../../data/Publiq/raw')
INTERACTIONS_PATH = DATA_DIR.joinpath('interactions.csv')
METADATA_PATH = DATA_DIR.joinpath('features.csv')

# Column names used when parsing the raw interactions file
ITEM_ID = 'eventId'
USER_ID = 'sessionId'

# Column names used when writing the processed files
T_ITEM_ID = 'itemId'
T_USER_ID = 'userId'

# --- Output locations ---
OUTPUT_DIR = Path('../../data/Publiq/')
OUTPUT_INTERACTIONS_PATH = OUTPUT_DIR.joinpath('interactions.csv')
OUTPUT_FEATURES_PATH = OUTPUT_DIR.joinpath('features.csv')

# --- Preprocessing thresholds (currently disabled) ---
# MIN_USERS_PER_ITEM = 5
# MIN_ITEMS_PER_USER = 5

# --- Split parameters ---
AMT_VAL_USERS = 1000    # users held out for validation
AMT_TEST_USERS = 1500   # users held out for testing
PERC_HISTORY = 0.8      # passed to util.split as perc_history
# Passed to util.split as min_interactions for BOTH the test and the
# validation split: only users with at least this many items are selected.
VAL_MIN_ITEMS_PER_USER = 5

# --- Reproducibility ---
SEED = 42

## Load data

In [3]:
# Parse the raw interactions file into the matrix X (its shape is later
# unpacked as (users, items)); the raw id column names are passed explicitly.
X = util.parse_interactions(
    INTERACTIONS_PATH,
    item_id=ITEM_ID,
    user_id=USER_ID,
)
X.shape

(452395, 15000)

In [4]:
# Row sums of the interaction matrix, flattened to a 1-D array with one
# entry per user (equals the number of interactions per user assuming X
# holds binary entries -- TODO confirm against util.parse_interactions).
hist_lengths = np.asarray(X.sum(axis=1)).flatten()

# Sanity check: how many users meet the threshold passed to util.split as
# min_interactions? Uses the configured constant rather than a hard-coded 5
# so this count stays in sync with the split below. This should be at least
# AMT_TEST_USERS + AMT_VAL_USERS.
(hist_lengths >= VAL_MIN_ITEMS_PER_USER).sum()

3781

## Only keep users with more than one interaction

In [5]:
# Boolean row mask: True for users whose history length exceeds one.
keep_mask = hist_lengths > 1
X = X[keep_mask]
X.shape

(46075, 15000)

## Print statistics

In [6]:
# Interaction matrix: rows are users, columns are items.
amt_users, amt_items = X.shape
i_density = X.getnnz() / (amt_users * amt_items)

# Item metadata: S is the feature matrix, tags are its feature names.
# The part of each tag before the first '_' is taken as the metadata column
# it came from (presumably tags look like '<column>_<value>' -- verify in
# util.parse_metadata).
S, tags = util.parse_metadata(METADATA_PATH, T_ITEM_ID)
metadata_names = {tag.split('_')[0] for tag in tags}
_, amt_features = S.shape
f_density = S.getnnz() / (amt_features * amt_items)

# Emit the summary, one statistic per line.
report = (
    f"# users: {amt_users}",
    f"# items: {amt_items}",
    f"inter. density: {i_density:.3%}",
    f"columns: {', '.join(metadata_names)}",
    f"# features: {amt_features}",
    f"feature density: {f_density:.3%}",
)
print(*report, sep="\n")

# users: 46075
# items: 15000
inter. density: 0.019%
columns: targetaudience, postalCode, price, eventtype, dayType, theme, region, label, minimumage
# features: 2158
feature density: 0.578%


## Split interactions using a strong generalization scheme

In [7]:
# Keyword arguments shared by both splits (same history fraction, same
# minimum-interactions threshold, same seed).
split_kwargs = dict(
    perc_history=PERC_HISTORY,
    min_interactions=VAL_MIN_ITEMS_PER_USER,
    seed=SEED,
)

# First hold out the test users from the full matrix ...
Xtrainval, Xtest_in, Xtest_out = util.split(
    X, test_users=AMT_TEST_USERS, **split_kwargs
)
# ... then hold out the validation users from what remains.
Xtrain, Xval_in, Xval_out = util.split(
    Xtrainval, test_users=AMT_VAL_USERS, **split_kwargs
)

## Save files

In [8]:
# All split files share the stem of the configured interactions file name.
basename = OUTPUT_INTERACTIONS_PATH.stem

# (matrix, output file name) pairs, written in this order.
split_outputs = [
    (Xtrain, f"{basename}.train.csv"),
    (Xtest_in, f"{basename}.testin.csv"),
    (Xtest_out, f"{basename}.testout.csv"),
    (Xval_in, f"{basename}.valin.csv"),
    (Xval_out, f"{basename}.valout.csv"),
]
for matrix, filename in split_outputs:
    util.store_interactions(
        matrix,
        OUTPUT_DIR / filename,
        item_id=T_ITEM_ID,
        user_id=T_USER_ID,
    )

# Re-write the raw feature file to the output directory without an index column.
features = pd.read_csv(METADATA_PATH)
features.to_csv(OUTPUT_FEATURES_PATH, index=False)