In [76]:
# Automatically reload imported project modules whenever their source files change
%load_ext autoreload
%autoreload 2

In [330]:
import kaggle
import time
import pandas as pd
import numpy as np

from experiment_template import customer_hex_id_to_int
from candidate_generation import *

In [325]:
# Competition identifier passed to every Kaggle API call in this notebook
COMPETITION_NAME = 'h-and-m-personalized-fashion-recommendations'

def submit(filepath, timeout=None, poll_interval=1):
    """Upload a submission file to Kaggle and wait for its score.

    The file is submitted with an empty message to ``COMPETITION_NAME``; the
    submission list is then polled until the first entry carries a public score.

    Parameters
    ----------
    filepath : str
        Path of the submission file to upload.
    timeout : float or None, optional
        Maximum number of seconds to wait for the score. ``None`` (the default)
        waits indefinitely.
    poll_interval : float, optional
        Seconds to sleep before each poll of the submission list (default 1).

    Returns
    -------
    tuple
        ``(public_score, private_score)`` as floats. ``private_score`` is
        ``None`` when Kaggle does not report one.

    Raises
    ------
    TimeoutError
        If ``timeout`` is given and no public score appears in time.
    """
    kaggle.api.competition_submit(filepath, '', COMPETITION_NAME)

    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        time.sleep(poll_interval)
        # NOTE(review): assumes the first entry returned by get_results() is the
        # submission that was just uploaded -- confirm the ordering of the Kaggle API.
        _, public_score, private_score = get_results()[0]
        if public_score is not None:
            break
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(f'no score for {filepath} after {timeout} seconds')

    # The private score field is nullable; float(None) would raise a TypeError.
    private = None if private_score is None else float(private_score)
    return float(public_score), private

def get_results():
    """List the submissions of ``COMPETITION_NAME`` as plain tuples.

    Returns
    -------
    list of tuple
        One ``(file_name, public_score, private_score)`` tuple per submission,
        in the order the Kaggle API returns them. Values are taken verbatim
        from the ``*Nullable`` fields of each entry.
    """
    rows = []
    for entry in kaggle.api.competitions_submissions_list(COMPETITION_NAME):
        row = (
            entry['fileNameNullable'],
            entry['publicScoreNullable'],
            entry['privateScoreNullable'],
        )
        rows.append(row)
    return rows

def create_uniform_prediction(customers, items):
    """Give every customer the same prediction.

    Parameters
    ----------
    customers : array-like
        Customer ids; they become the ``customer_id`` column.
    items : sequence
        The collection of items assigned to every customer.

    Returns
    -------
    pandas.DataFrame
        One row per customer with columns ``customer_id`` and ``prediction``.
    """
    customer_frame = pd.Series(customers, name='customer_id').to_frame()
    # Wrapping items in a list yields a single row whose cell holds the whole collection
    items_frame = pd.Series([items], name='prediction').to_frame()
    return customer_frame.merge(items_frame, how='cross')

def hex_id_to_int(str):
    """Convert the last 16 characters of a hexadecimal id string to an int.

    Note: the parameter name shadows the builtin ``str`` inside this function.
    """
    tail = str[-16:]
    return int(tail, base=16)

def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal customer-id strings to integers.

    Each value is reduced to its last 16 characters and parsed with
    ``hex_id_to_int``. This definition shadows the function of the same name
    imported from ``experiment_template`` at the top of the notebook.
    """
    hex_tails = series.str[-16:]
    return hex_tails.apply(hex_id_to_int)

def article_id_str_to_int(series):
    """Cast a Series of article ids to ``int32``."""
    as_int32 = series.astype(dtype='int32')
    return as_int32

def submission_to_df(path):
    """Load a submission CSV and decode it into integer form.

    Parameters
    ----------
    path : str
        Location of a CSV with ``customer_id`` and ``prediction`` columns.

    Returns
    -------
    pandas.DataFrame
        The loaded frame where ``customer_id`` is the integer value of the last
        16 hexadecimal characters of the original id and ``prediction`` is a
        list of ints parsed from the whitespace-separated string.
    """
    frame = pd.read_csv(path)

    hex_tails = frame['customer_id'].str[-16:]
    frame['customer_id'] = hex_tails.apply(lambda tail: int(tail, 16))
    frame['prediction'] = frame['prediction'].apply(
        lambda ids: [int(token) for token in ids.split()]
    )

    return frame

def create_submission(predictions, sample_submission):
    """Build a submission frame in the layout of ``sample_submission``.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Needs a ``customer_id`` column (integer ids, matching the output of
        ``customer_hex_id_to_int``) and a ``prediction`` column holding an
        iterable of article ids per customer.
    sample_submission : pandas.DataFrame
        Template with hexadecimal ``customer_id`` strings. It is copied, not
        modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``sample_submission`` whose ``prediction`` column holds the
        space-separated article ids of each customer. Customers without an
        entry in ``predictions`` receive twelve ``"0"`` placeholders.
    """
    lookup = predictions.set_index("customer_id").prediction.to_dict()
    placeholder = " ".join(["0"] * 12)

    result = sample_submission.copy()
    formatted = []
    for customer_id in customer_hex_id_to_int(result.customer_id):
        if customer_id in lookup:
            # Left-pad to 10 characters instead of blindly prepending one "0",
            # which is only correct for ids whose integer form has 9 digits.
            # NOTE(review): assumes article ids are 10-character zero-padded strings -- confirm.
            formatted.append(" ".join(str(article).zfill(10) for article in lookup[customer_id]))
        else:
            formatted.append(placeholder)

    result["prediction"] = formatted
    return result

def save_and_submit(predictions, name=None):
    """Write predictions as a gzipped submission CSV and submit it.

    The file is written to ``BASE_PATH`` as ``<name>.csv.gz`` (``name`` defaults
    to ``'probing'``), using the notebook-level ``sample_submission`` frame as
    the template. Returns whatever ``submit`` returns for that file.
    """
    name = 'probing' if name is None else name

    submission = create_submission(predictions, sample_submission)
    target = BASE_PATH + f'{name}.csv.gz'
    submission.to_csv(target, index=False)
    return submit(target)

def candidates_to_predictions(candidates, max_items=12):
    """Collapse candidate rows into one prediction list per customer.

    Parameters
    ----------
    candidates : pandas.DataFrame
        Must contain the columns ``week``, ``customer_id`` and ``article_id``.
        Row order determines which articles are kept for each customer.
    max_items : int, optional
        Maximum number of articles kept per customer (default 12).

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id`` and ``prediction``; ``prediction`` is the list
        of the first ``max_items`` distinct articles of that customer.
    """
    # A (customer, article) pair may be proposed more than once; keep its first occurrence
    unique_pairs = candidates.drop(columns='week').drop_duplicates(['customer_id', 'article_id'])
    top_per_customer = unique_pairs.groupby('customer_id').head(max_items)
    return (
        top_per_customer
        .groupby('customer_id', as_index=False)
        .article_id.apply(list)
        .rename(columns={'article_id': 'prediction'})
    )

In [216]:
# Root data directory, relative to the notebook location
BASE_PATH = '../../../data/'
# Alternative data directory (a sample), currently disabled:
# DATA_PATH = BASE_PATH + 'sample_0.05/'
DATA_PATH = BASE_PATH + 'parquet/'

# Make sure the same data preprocessing as in the Radek notebook has been performed
# (see 02 FE/DataProcessingRadek.ipynb).
transactions = pd.read_parquet(DATA_PATH + 'transactions_train.parquet')
customers = pd.read_parquet(DATA_PATH + 'customers.parquet')
articles = pd.read_parquet(DATA_PATH + 'articles.parquet')
# Template used by create_submission() for the customer list and column layout
sample_submission = pd.read_csv(BASE_PATH + 'original/sample_submission.csv')

# NOTE(review): add_relative_week is not defined in this notebook (presumably it comes from the
# candidate_generation star import); later cells rely on a `week` column in transactions -- confirm.
transactions = add_relative_week(transactions, 105)

In [239]:
# Active users: customers with at least one transaction in a week greater than 94 (104 - 10)
active_users = transactions[transactions.week > 104 - 10].customer_id.unique()
# Cold users: every customer in the customers table that is not active
cold_users = list(set(customers.customer_id) - set(active_users))
num_users = len(customers)
num_active_users = len(active_users)

print(f'active users: {num_active_users} ({num_active_users/num_users:.2%} of all users)')

active users: 437365 (31.88% of all users)


In [225]:
# Twelve most popular articles of week 104, counting each (customer, article) pair once.
# Computed here so this cell also runs on a fresh kernel; the submission cell below
# evaluates the same expression.
top12_articles = (
    transactions[transactions.week == 104]
    .drop_duplicates(['customer_id', 'article_id'])
    .article_id.value_counts()
    .head(12)
    .index.values
)
top12_articles

array([924243001, 918522001, 924243002, 923758001, 866731001, 915529003,
       909370001, 915529005, 751471001, 918292001, 762846027, 448509014],
      dtype=int32)

In [257]:
# Popularity baseline: give every active user the 12 articles bought by the most
# distinct customers in week 104. (Counting raw transaction rows instead, i.e.
# without drop_duplicates, is the alternative that was tried.)
top12_articles = (
    transactions[transactions.week == 104]
    .drop_duplicates(['customer_id', 'article_id'])
    .article_id
    .value_counts()
    .head(12)
    .index
    .values
)
predictions = create_uniform_prediction(active_users, top12_articles)
save_and_submit(predictions)

In [342]:
# Load an existing submission file, keep only its cold-user rows and submit that subset;
# create_submission() fills every other customer with placeholders.
predictions = submission_to_df(BASE_PATH + 'sub05-12f.csv.gz')
predictions = predictions[predictions.customer_id.isin(cold_users)]
save_and_submit(predictions)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62.0M/62.0M [00:22<00:00, 2.92MB/s]


(0.00278, 0.0029)

In [340]:
# Repurchase + popularity candidates for the cold users.
# NOTE: despite its name, active_customers holds the cold users here.
active_customers = pd.Series(cold_users, name='customer_id')
bask = baskets(None, 105, active_customers, True)
c = (
    pd.concat([
        candidates_repurchase(bask, transactions, 5, False),
        candidates_popularity(bask, transactions, 12, 1),
    ])
    .drop(columns='week')
    .drop_duplicates(['customer_id', 'article_id'])
)
# Keep the first 12 distinct articles per customer and collect them into a list
predictions = (
    c.groupby('customer_id')
    .head(12)
    .groupby('customer_id', as_index=False)
    .article_id
    .apply(list)
    .rename(columns={'article_id': 'prediction'})
)
save_and_submit(predictions)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 55.6M/55.6M [00:21<00:00, 2.66MB/s]


(0.01958, 0.01907)

In [339]:
# Candidate experiment for the cold users; the commented-out entries in the list
# below are alternative candidate generators that were toggled by hand.
# NOTE(review): cold_users is passed as a plain list here, while the cell above wraps it
# in a pd.Series before calling baskets() -- confirm which form baskets() expects.
bask = baskets(None, 105, cold_users, True)
# Only referenced by the commented-out candidates_customer_feature calls below
training_data = transactions[transactions.week > 104-10]
c = pd.concat([
    # candidates_repurchase(bask, transactions, 4, False),
    candidates_article_feature(bask, transactions, articles, 'prod_name', 6, 1, 2, 6, True),
    # candidates_article_feature(bask, transactions, articles, 'department_name', 6, 1, 2, 3, True),
    # candidates_article_feature(bask, transactions, articles, 'colour_group_name', 6, 1, 2, 3, True),
    # candidates_customer_feature(bask, training_data, customers, "age", 6, 1),
    # candidates_customer_feature(bask, training_data, customers, "postal_code", 6, 1),
    candidates_popularity(bask, transactions, 12, 1)
])
# Deduplicate and keep at most 12 articles per customer, then submit
predictions = candidates_to_predictions(c)
save_and_submit(predictions)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62.0M/62.0M [00:20<00:00, 3.15MB/s]


(0.00278, 0.0029)

In [316]:
# For each test week, print the ratio between buyers who were active in the preceding
# 10 weeks and buyers who were not.
# The set of all customer ids does not depend on the week, so build it once.
all_customers = set(customers.customer_id)

for test_week in range(80, 105):
    # ac: customers with a transaction in weeks test_week-10 .. test_week-1
    ac = set(transactions[(transactions.week > test_week - 11) & (transactions.week < test_week)].customer_id.unique())
    # cc: all remaining customers
    cc = all_customers - ac
    # tc: customers who bought something in the test week itself
    tc = set(transactions[transactions.week == test_week].customer_id.unique())

    print(test_week, len(ac & tc) / len(cc & tc))

80 1.3688211665504764
81 1.3169539799130565
82 1.3882005899705014
83 1.4423333333333332
84 1.5796911274856162
85 1.5861463015937882
86 1.4963843521817348
87 1.3839402887139107
88 1.4631043979444083
89 1.682159640517271
90 1.776995473812774
91 1.6862302483069977
92 1.8294872420354198
93 2.176618945182108
94 2.3193273640532865
95 2.391275727388423
96 2.4031924327519953
97 2.442338072669826
98 2.4129800237161363
99 2.6057209514632045
100 2.456573896353167
101 2.2783088235294118
102 2.136510300322661
103 2.0630741748894184
104 1.9904629790185537
