# Ranking Task
This notebook explores ideas for the final ranking task, which either consumes the predictions of the first-level task (*i.e.*, binary classification) or solves the problem directly with an **end-to-end** architecture.

In [1]:
# Import packages
import os 
import pickle

import wandb 

from metadata import * 
from fe import *

In [2]:
# Variable definitions
# `model_name` and `infer_version` select the Wandb artifact `{model_name}_infer:v{infer_version}`
# that holds the dumped inference outputs.
model_name = 'lgbm'
infer_version = 0
# Stem of the dumped prediction file to load, i.e. `{pred_month}.pkl`
# (presumably the month being predicted -- TODO confirm)
pred_month = 25

In [None]:
# Pull dumped outputs from Wandb
# Start a run in the 'Esun' project so the artifact usage is tracked by this run.
ranking = wandb.init(project='Esun',
                     job_type='bi-level-ranking')   # Modeling ranking as a bi-level optimization problem
# Declare the inference artifact (type 'output') as an input of this run ...
output = ranking.use_artifact(f'{model_name}_infer:v{infer_version}', type='output')
# ... and download it; `output_dir` is the local directory returned by the download.
output_dir = output.download()

In [4]:
# Prepare predicting results for downstream ranking
# NOTE(review): the original author suspected the dumping path was corrupt -- unresolved, TODO confirm
# NOTE(review): pickle.load can execute arbitrary code; only load artifacts from a trusted source
with open(os.path.join(output_dir, f"{pred_month}.pkl"), 'rb') as f:
    pred_result = pickle.load(f)

In [5]:
# Move `shop_tag` out of the index into a regular column so that `chid` alone indexes the rows.
# The reset is guarded and rebinds the name (no `inplace=True`) so this cell can be re-run:
# an unguarded reset raises KeyError on the second run because the level is already gone.
if 'shop_tag' in pred_result.index.names:
    pred_result = pred_result.reset_index(level='shop_tag')
pred_result.head()

Unnamed: 0_level_0,shop_tag,y_pred
chid,Unnamed: 1_level_1,Unnamed: 2_level_1
10000000,2,0.214484
10000000,6,0.180116
10000000,10,0.627519
10000000,12,0.27628
10000000,13,0.302098


In [6]:
# Map the position of each legitimate shop_tag inside LEG_SHOP_TAGS (indexed 0-15)
# to the slot of that tag in the transaction amount vector (indexed 0-48),
# which is simply `shop_tag - 1`.
shop_tag_map = {
    position: legit_tag - 1
    for position, legit_tag in enumerate(LEG_SHOP_TAGS)
}
shop_tag_map

{0: 1,
 1: 5,
 2: 9,
 3: 11,
 4: 12,
 5: 14,
 6: 17,
 7: 18,
 8: 20,
 9: 21,
 10: 24,
 11: 25,
 12: 35,
 13: 36,
 14: 38,
 15: 47}

In [21]:
def get_final_ranks(pred, t_range, use_history_k=False,
                    raw_data_path="./data/raw/raw_data.parquet"):
    """Pick and order the final top-3 shop tags for every customer.

    For each `chid`, the highest-scoring candidates of its prediction vector are
    re-ordered by the customer's average transaction amount (largest first) and
    the first three legitimate shop tags become `top1`..`top3`. If fewer than
    three tags come out of the candidates, the remaining slots are filled with
    the most probable tags that have not been selected yet. The rationale for
    filling with the *most* probable leftovers: if the customer really buys
    fewer than three tags the filler does not change the score, and if the
    customer buys three the most probable leftover is the best guess.

    Relies on the notebook-level `shop_tag_map` and `LEG_SHOP_TAGS`, and on the
    helpers `get_avg_txn_amt_per_basket` / `get_avg_shop_tags_per_month`.

    Parameters:
        pred: dict mapping chid to a vector of predicted scores; position `i`
            of the vector is translated to a shop tag through `shop_tag_map[i]`
        t_range: time range forwarded unchanged to the averaging helpers
        use_history_k: if True, the number of candidates per customer is the
            rounded average number of shop tags per month taken from history;
            if False (default), exactly the three most probable tags are used
        raw_data_path: parquet file holding the raw transactions

    Return:
        final_ranks: pd.DataFrame with columns `chid`, `top1`, `top2`, `top3`;
            a slot is missing (NaN) only when a customer has fewer than three
            distinct legitimate tags available

    Raises:
        ValueError: if `use_history_k` is True and the historical average of a
            customer cannot be converted to an integer (e.g., NaN)
    """
    n_top = 3
    df = pd.read_parquet(raw_data_path,
                         columns=['dt', 'chid', 'shop_tag', 'txn_amt'])
    avg_txn_amt = get_avg_txn_amt_per_basket(df, t_range)
    # The per-month average is only needed when candidates are sized from history
    avg_shop_tags = (
        get_avg_shop_tags_per_month(df[['dt', 'chid', 'shop_tag']], t_range)
        if use_history_k else None
    )

    records = []
    for chid, pred_vec in tqdm(pred.items()):
        txn_amt = avg_txn_amt[chid]
        # Positions in `pred_vec` (not real shop tags), most probable first
        ranked_idx = np.argsort(pred_vec)[::-1]

        if use_history_k:
            try:
                n_cand = max(int(round(avg_shop_tags[chid], 0)), 0)
            except (TypeError, ValueError) as err:
                raise ValueError(
                    f"Invalid average number of shop tags for chid {chid}: "
                    f"{avg_shop_tags[chid]!r}"
                ) from err
        else:
            n_cand = n_top

        # shop_tag_map[idx] == orig_shop_tag - 1, i.e., the slot in `txn_amt`
        cand_amt = {shop_tag_map[idx]: txn_amt[shop_tag_map[idx]]
                    for idx in ranked_idx[:n_cand]}
        # Stable sort: ties keep the probability order of the candidates
        cand_by_amt = sorted(cand_amt, key=cand_amt.get, reverse=True)

        top_tags = []
        for slot in cand_by_amt:
            if slot + 1 in LEG_SHOP_TAGS:
                top_tags.append(slot + 1)
            if len(top_tags) == n_top:
                break

        if len(top_tags) < n_top:
            # Fill with the most probable tags outside the candidate set,
            # never repeating a tag that has already been selected
            for idx in ranked_idx[n_cand:]:
                shop_tag_ = shop_tag_map[idx] + 1
                if shop_tag_ in LEG_SHOP_TAGS and shop_tag_ not in top_tags:
                    top_tags.append(shop_tag_)
                if len(top_tags) == n_top:
                    break

        record = {'chid': chid}
        for rank, shop_tag_ in enumerate(top_tags, start=1):
            record[f'top{rank}'] = shop_tag_
        records.append(record)

    # Building from records keeps rows aligned even if a customer lacks a slot
    final_ranks = pd.DataFrame(records,
                               columns=['chid', 'top1', 'top2', 'top3'])

    return final_ranks

In [22]:
# Gather the predicted scores of every customer into one vector keyed by chid.
# NOTE(review): get_final_ranks translates vector positions into shop tags, so the rows of
# each chid are presumably ordered consistently with LEG_SHOP_TAGS -- TODO confirm
pred_vecs = {
    chid: chid_rows['y_pred'].values
    for chid, chid_rows in tqdm(pred_result.groupby(pred_result.index))
}
t_range = (0, 25)
final_ranks = get_final_ranks(pred_vecs, t_range)
final_ranks.to_csv("./lgbm_naive_ranking.csv", index=False)

100%|██████████| 500000/500000 [00:04<00:00, 111294.67it/s]
