In [1]:
import os
os.environ["PYTHONIOENCODING"] = "utf8"
import sys

import pandas as pd
import numpy as np
from numpy import random as np_rnd
import random as rnd
import shutil
import gc
import datetime
from collections import defaultdict, Counter
from multiprocessing import Pool, cpu_count
import time
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def create_submission(df):
    """Convert a prediction frame into the two-column submission layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that exposes ``session``, ``type`` and ``prediction`` columns
        once ``reset_index()`` has been applied. ``type`` holds the codes
        that ``CFG.contentType_mapper`` translates into type names.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``session_type`` (``"<session>_<type name>"``) and
        ``labels`` (the former ``prediction`` column). The caller's frame is
        not modified because ``reset_index()`` returns a new object.
    """
    df = df.reset_index()
    df["type"] = df["type"].map(CFG.contentType_mapper)
    # The mapper already yields plural names ("clicks", "carts", "orders"),
    # so no extra "s" is appended; adding one produced keys such as
    # "123_clickss" that disagree with the keys built for the final submission.
    df["session_type"] = df["session"].astype("str") + "_" + df["type"].astype("str")
    df = df[["session_type", "prediction"]].rename({"prediction": "labels"}, axis=1)
    return df
    

In [3]:
class CFG:
    """Shared notebook constants."""

    # Lookup from integer event-type code to the type name used in
    # submission keys.
    contentType_mapper = pd.Series({0: "clicks", 1: "carts", 2: "orders"})
    # One weight per event type. Not referenced by the visible cells;
    # presumably the evaluation weights for clicks/carts/orders - TODO confirm.
    target_weight = (0.1, 0.3, 0.6)


In [4]:
# Fraction of sessions to process. Kept for reference only: nothing in the
# visible cells reads it, and the full test set is used.
fraction_of_sessions_to_use = 1

test = pd.read_parquet('./data/test.parquet')

# No subsampling happens: `subset_of_test` and `subsets` are further names for
# the very same DataFrame object as `test`, not copies.
subset_of_test = test
subsets = subset_of_test

# Index the rows by session id (a one-level MultiIndex named "session").
# The `session` column itself is kept.
subsets.index = pd.MultiIndex.from_frame(subsets[['session']])

# Distinct session ids, in order of first appearance.
sessions = subsets['session'].unique()

In [5]:
session_types = ['clicks', 'carts', 'orders']


def _recency_weights(ts):
    """Return one weight per event of a session: log1p(1 / (age + 1)).

    ``age`` is the gap between the session's largest ``ts`` and the event's
    own ``ts``, divided by 3600. The event with the largest ``ts`` therefore
    gets log1p(1), and events further from it get smaller weights.
    """
    ages = ((ts.max() - ts) / 3600).values
    return [np.log1p(1 / (age + 1)) for age in ages]


# The index level is also called "session" (it is set where the data is
# loaded), so it is dropped before grouping on the `session` column.
# One grouped view is shared by the three per-session aggregations.
_events_by_session = subsets.reset_index(drop=True).groupby('session')

# Per session, in stored row order: item ids, event-type codes, and weights.
test_session_AIDs = _events_by_session['aid'].apply(list)
test_session_types = _events_by_session['type'].apply(list)
test_session_time_interval = _events_by_session['ts'].apply(_recency_weights)

del _events_by_session

In [6]:
# Drop two of the names bound to the loaded frame and run a collection pass.
# `subsets` still refers to the same DataFrame, so the data itself stays alive.
del test, subset_of_test
gc.collect()

0

In [7]:
# Maximum number of recommended items kept per session.
n_aids = 20

# CORE HYPER-PARAMETER 1 - weight by event types
type_weight_multipliers = {0: 0.1, 1: 0.6, 2: 0.3}


def _rank_session_history(aids, recency_weights, event_types):
    """Score the items of one session and return the top ones as text.

    Each event adds ``recency weight * type weight`` to its item's score.
    Events are visited in reverse stored order, so among items with equal
    scores the one appearing later in the session is ranked first
    (``Counter.most_common`` keeps first-encountered order for ties).

    Returns
    -------
    tuple of (str, str)
        Space-joined item ids and space-joined scores rounded to 5 decimals,
        both limited to the ``n_aids`` best items.
    """
    scores = Counter()
    for aid, recency, event_type in zip(
        reversed(aids), reversed(recency_weights), reversed(event_types)
    ):
        scores[aid] += recency * type_weight_multipliers[event_type]
    top_aids, top_scores = zip(*scores.most_common(n_aids))
    rec_text = " ".join(map(str, top_aids))
    score_text = " ".join(map(str, np.round(top_scores, 5)))
    return rec_text, score_text


# Rerank - based on each session's own history of aids
output_columns = {
    "session": [],
    "type": [],
    "rec": [],
    "score": [],
}
event_type_ids = [0, 1, 2]
n_event_types = len(event_type_ids)

session_stream = zip(
    test_session_AIDs.index,
    test_session_AIDs.values,
    test_session_types.values,
    test_session_time_interval.values,
)
for session_id, session_aids, session_event_types, session_weights in tqdm(
    session_stream, total=len(test_session_AIDs)
):
    rec_text, score_text = _rank_session_history(
        session_aids, session_weights, session_event_types
    )

    # The same ranked list is emitted once per event type (0, 1, 2).
    output_columns["session"].extend([session_id] * n_event_types)
    output_columns["type"].extend(event_type_ids)
    output_columns["rec"].extend([rec_text] * n_event_types)
    output_columns["score"].extend([score_text] * n_event_types)

output = pd.DataFrame(output_columns).set_index(["session", "type"])

100%|█████████████████████████████████████████████████████████████████████| 1671803/1671803 [01:26<00:00, 19429.89it/s]


In [8]:
# Inspect the reranked result: indexed by (session, type), with the
# space-joined `rec` and `score` strings as columns.
output

Unnamed: 0_level_0,Unnamed: 1_level_0,rec,score
session,type,Unnamed: 2_level_1,Unnamed: 3_level_1
12899779,0,59625,0.06931
12899779,1,59625,0.06931
12899779,2,59625,0.06931
12899780,0,1142000 736515 973453 582732,0.13654 0.06905 0.06868 0.06799
12899780,1,1142000 736515 973453 582732,0.13654 0.06905 0.06868 0.06799
...,...,...,...
14571580,1,202353,0.06931
14571580,2,202353,0.06931
14571581,0,1100210,0.06931
14571581,1,1100210,0.06931


In [9]:
# Persist the raw recommendations and scores, with `session` and `type`
# turned back into ordinary columns.
output.reset_index().to_parquet("./raw_output_rerank.parquet")

In [10]:
# Submission key for every (session, type) row: "<session>_<type name>",
# e.g. "12899779_clicks".
output["session_type"] = [
    f"{session_id}_{CFG.contentType_mapper[type_id]}"
    for session_id, type_id in output.index
]

# Start from the sample submission so its row set and order are kept, then
# overwrite `labels` for every key produced above.
submission_rerank = pd.read_csv("./data/sample_submission.csv").set_index("session_type")
submission_rerank.loc[output["session_type"].values, "labels"] = output["rec"].values

# Restore `session_type` as a regular column before writing the CSV.
submission_rerank = submission_rerank.reset_index()
submission_rerank.to_csv("./submission_rerank.csv", index=False)