In [1]:
# make external scripts auto reload
%load_ext autoreload
%autoreload 2

In [4]:
import kaggle
import time
import pandas as pd
import numpy as np

from template.experiment_template import customer_hex_id_to_int
from candidate_generation import *

## Load data and utils

In [19]:
COMPETITION_NAME = 'h-and-m-personalized-fashion-recommendations'

def submit(filepath):
    kaggle.api.competition_submit(filepath, '', COMPETITION_NAME)
    res = None
    while res is None:
        time.sleep(1)
        res = get_results()[0]
        if res[1] is None:
            res = None

    return float(res[1]), float(res[2])

def get_results():
    return [
        (sub['fileNameNullable'], sub['publicScoreNullable'], sub['privateScoreNullable']) for sub in kaggle.api.competitions_submissions_list(COMPETITION_NAME)
    ]

def create_uniform_prediction(customers, items):
    return pd.merge(pd.Series(customers, name='customer_id'), pd.Series([items], name='prediction'), how='cross')

def hex_id_to_int(str):
    return int(str[-16:], 16)

def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def article_id_str_to_int(series):
    return series.astype('int32')

def submission_to_df(path):
    sub = pd.read_csv(path)

    def pc(str):
        return int(str, 16)

    def pp(str):
        return list(map(int, str.split()))

    sub.customer_id = sub.customer_id.str[-16:].apply(pc)
    sub.prediction = sub.prediction.apply(pp)

    return sub

def create_submission(predictions, sample_submission):
    predictions = predictions.set_index("customer_id").prediction.to_dict()
    preds = []
    result = sample_submission.copy()
    for customer_id in customer_hex_id_to_int(result.customer_id):
        if customer_id not in predictions:
            preds.append(" ".join(["0"] * 12))
        else:
            preds.append(" ".join(f"0{x}" for x in predictions[customer_id]))
    result.prediction = preds
    return result

def save_and_submit(predictions, name=None):
    if name is None:
        name = 'probing'
        
    sub = create_submission(predictions, sample_submission)
    sub.to_csv(BASE_PATH + f'{name}.csv.gz', index=False)
    return submit(BASE_PATH + f'{name}.csv.gz')

def candidates_to_predictions(candidates):
    c = candidates.drop(columns='week').drop_duplicates(['customer_id', 'article_id'])
    return c.groupby('customer_id').head(12).groupby('customer_id', as_index=False).article_id.apply(list).rename(columns={'article_id':'prediction'})

def do_probe_cold(predictions):
    print("All", save_and_submit(predictions))
    
    predictions_subset = predictions.drop(index=predictions[~predictions.customer_id.isin(cold_users)].index)
    print("Active", save_and_submit(predictions_subset))
    
    predictions_subset = predictions.drop(index=predictions[predictions.customer_id.isin(cold_users)].index)
    print("Cold", save_and_submit(predictions_subset))
    

In [8]:
BASE_PATH = '../../data/'
# DATA_PATH = BASE_PATH + 'sample_0.05/'
DATA_PATH = BASE_PATH + 'parquet/'

# make sure the same data preprocessing as in the radek notebook have been performed
# (see 02 FE/DataProcessingRadek.ipynb)
transactions = pd.read_parquet(DATA_PATH + 'transactions_train.parquet')
customers = pd.read_parquet(DATA_PATH + 'customers.parquet')
articles = pd.read_parquet(DATA_PATH + 'articles.parquet')
sample_submission = pd.read_csv(BASE_PATH + 'original/sample_submission.csv')

transactions = add_relative_week(transactions)

In [9]:
active_users = transactions[transactions.week > 104 - 10].customer_id.unique()
cold_users = list(set(customers.customer_id) - set(active_users))
num_users = len(customers)
num_active_users = len(active_users)

print(f'active users: {num_active_users} ({num_active_users/num_users:.2%} of all users)')

active users: 437365 (31.88% of all users)


## Experiments

Approach | All | Active | Cold
---|---|---|---
top-12 last week | 0.00784 | 0.00520 | 0.00228
repurchase 4 weeks + top-12 last week | 0.02203 | 0.01974 | 0.00228
my first model | 0.02230 | 0.02064 | 0.00208

In [22]:
# top-12 last week
top12_articles = transactions[transactions.week==104].drop_duplicates(['customer_id', 'article_id']).article_id.value_counts().head(12).index.values
predictions = create_uniform_prediction(active_users, top12_articles)

do_probe_cold(predictions)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49.9M/49.9M [00:18<00:00, 2.76MB/s]


All (0.0052, 0.00496)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48.8M/48.8M [00:19<00:00, 2.56MB/s]


Active (0.0, 0.0)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49.9M/49.9M [00:17<00:00, 3.08MB/s]


Cold (0.0052, 0.00496)


In [20]:
bask = baskets(None, 105, transactions.customer_id.unique(), True)
c = pd.concat([
    candidates_repurchase(bask, transactions, 4, False),
    candidates_popularity(bask, transactions, 12, 1)
])
predictions = candidates_to_predictions(c)

do_probe_cold(predictions)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 55.3M/55.3M [00:18<00:00, 3.10MB/s]


All (0.02083, 0.02086)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50.4M/50.4M [00:19<00:00, 2.76MB/s]


Active (0.0019, 0.00193)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 54.6M/54.6M [00:19<00:00, 2.96MB/s]


Cold (0.01893, 0.01893)


In [21]:
# first model
predictions = submission_to_df(BASE_PATH + 'sub05-12f.csv.gz') 

do_probe_cold(predictions)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 76.8M/76.8M [00:27<00:00, 2.97MB/s]


All (0.02343, 0.02297)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62.0M/62.0M [00:21<00:00, 3.07MB/s]


Active (0.00278, 0.0029)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64.7M/64.7M [00:21<00:00, 3.11MB/s]


Cold (0.02064, 0.02007)


## Finding better predictions for cold customers

Method | public | private
---|---|---
repurchase (distant history) | 0.00189 | 0.00191
IPop prod_name | 0.00263 | 0.00276
IPop department_name | 0.00175 | 0.00187
IPop colour_group_name | 0.00159 | 0.0017
UPop age | 0.0025 | 0.00239
UPop postal_code | 0.00204 | 0.00233

In [15]:
bask = baskets(None, 105, cold_users, True)
c = pd.concat([
    # candidates_repurchase(bask, transactions, 3, True),
    # candidates_article_feature(bask, transactions, articles, 'prod_name', 6, 1, 2, 6, True),
    # candidates_article_feature(bask, transactions, articles, 'department_name', 6, 1, 2, 3, True),
    # candidates_article_feature(bask, transactions, articles, 'colour_group_name', 6, 1, 2, 3, True),
    # candidates_customer_feature(bask, transactions, customers, "age", 12, 1),
    candidates_customer_feature(bask, transactions, customers, "postal_code", 12, 1),
    candidates_popularity(bask, transactions, 12, 1)
])
predictions = candidates_to_predictions(c)
save_and_submit(predictions)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 54.6M/54.6M [00:19<00:00, 2.94MB/s]


(0.00204, 0.00233)