In [1]:
import os
import gc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
from scipy.optimize import curve_fit
from joblib import dump, load


import sys
sys.path.insert(0, '../')

from utils.preprocess import *
from utils.evaluate import mapk, apk



# Only the columns listed here are read from each precomputed CSV.
rp_cols = [
    'customer_id', 'article_id',
    'ppd_min', 'ppd_max', 'rcp',
    'denom_customer', 'num_customer',
    'idxgrp_idx_prdtyp',
]
tmp_cols = ['t_dat', 'customer_id', 'article_id', 'quotient', 'value', 'y']

rp_cid_col_df = pd.read_csv('./rp_cid_col_df.csv', usecols=rp_cols)
tmp = pd.read_csv('./tmp.csv', usecols=tmp_cols)

# Provenance of ./validation_df.csv -- kept for reference, not executed here.
# The file was produced once with the recipe below and is simply reloaded.
# folder = '../../../h-and-m-recommender/data/'
# art_df = pd.read_csv(os.path.join(folder, "articles.csv"))
# cus_df = pd.read_csv(os.path.join(folder, "customers.csv"))
# trans_df = pd.read_csv(os.path.join(folder, "transactions_train.csv"))
# trans_df = make_weeknum_col(trans_df)
# week_num 0 is latest week of the transaction
# _, val_df = split_train_valid(trans_df, 0)
# val_df['t_dat'] = pd.to_datetime(val_df.t_dat)
# validation_df = valid2submission(val_df)
# validation_df.to_csv('./validation_df.csv', index=False)

validation_df = pd.read_csv('./validation_df.csv')



In [2]:
# One row per article: the last row of each `article_id` group, restricted to
# the columns listed below. Original row labels are preserved by `tail`.
aid_cols = ['article_id', 'rcp', 'idxgrp_idx_prdtyp', 'denom_customer', 'num_customer']
rows_by_article = rp_cid_col_df.groupby('article_id')
rp_aid = rows_by_article[aid_cols].tail(1)



In [3]:
# Flag articles whose `rcp` lies strictly inside the Tukey fences
# (Q1 - 1.5*IQR, Q3 + 1.5*IQR) and that have at least MIN_DENOM_CUSTOMER
# in `denom_customer`.
IQR_FACTOR = 1.5          # fence width as a multiple of the inter-quartile range
MIN_DENOM_CUSTOMER = 30   # lower limit on `denom_customer` for a "normal" article

# Series.quantile skips NaN, whereas np.percentile would return NaN bounds as
# soon as a single `rcp` is missing and silently flag every article as False.
# Both default to linear interpolation, so results match when no NaN exists.
q1, q3 = rp_aid['rcp'].quantile([0.25, 0.75])
iqr = q3 - q1

l_bound = q1 - (iqr * IQR_FACTOR)
r_bound = q3 + (iqr * IQR_FACTOR)

is_rcp_normal = (
    rp_aid['rcp'].gt(l_bound)
    & rp_aid['rcp'].lt(r_bound)
    & rp_aid['denom_customer'].ge(MIN_DENOM_CUSTOMER)
)
normal_idx = rp_aid.index[is_rcp_normal]

# `assign` builds a new frame instead of writing into the object returned by
# groupby(...).tail(1), which avoids SettingWithCopyWarning and keeps the cell
# idempotent on re-run.
# NOTE(review): `rcp_normal` is a boolean flag, yet later cells fill its NaNs
# with mean/min/zero and multiply `value` by it (which is why the recorded
# "rcp_normal min" and "rcp_normal zero" scores are identical). If the intent
# was "rcp restricted to normal articles", this should be
# rp_aid['rcp'].where(is_rcp_normal) -- confirm before changing, since it
# alters the recorded results.
rp_aid = rp_aid.assign(rcp_normal=is_rcp_normal)

In [4]:
# Left join: every row of `tmp` is kept; rows whose article_id has no match in
# `rp_aid` get NaN in the columns coming from `rp_aid`.
new_tmp = tmp.merge(rp_aid, how='left', on='article_id')

In [5]:
# For each source column, materialise three NaN-filled variants named
# `<source>_mean`, `<source>_min` and `<source>_zero`.
for _source in ('rcp', 'rcp_normal'):
    _column = new_tmp[_source]
    _fill_values = {
        'mean': _column.mean(),
        'min': _column.min(),
        'zero': 0,
    }
    for _suffix, _fill in _fill_values.items():
        new_tmp[f'{_source}_{_suffix}'] = _column.fillna(_fill)



In [6]:
# Build one score column per (source column, fill strategy) pair by weighting
# `value` with the corresponding filled column.
variant_keys = [
    (source, strategy)
    for source in ['rcp', 'rcp_normal']
    for strategy in ['mean', 'min', 'zero']
]
for col, fill_val in variant_keys:
    weight = new_tmp[f'{col}_{fill_val}']
    new_tmp[f'new_val_{col}_{fill_val}'] = new_tmp['value'] * weight
        

In [8]:
import warnings
warnings.filterwarnings("ignore")

# Grid search over (rcp column, NaN fill strategy, score cut-off quantile).
# Each variant builds a space-separated prediction string per customer and is
# scored with mapk against `validation_df`; rows are collected in `results`
# as [col, fill_val, cut_off, metric].
TOP_K = 12  # dense-rank threshold per customer and the `k` passed to mapk

results = []
for col in ['rcp', 'rcp_normal']:
    for fill_val in ['mean', 'min', 'zero']:
        new_value_col = f'new_val_{col}_{fill_val}'

        for cut_off in [0.0, 0.1, 0.25]:
            # Keep only candidates at or above the `cut_off` quantile of this
            # score column; slice just the columns used below.
            cut_value = new_tmp[new_value_col].quantile(cut_off)
            cut_new_tmp = new_tmp.loc[
                new_tmp[new_value_col] >= cut_value,
                ['customer_id', 'article_id', new_value_col],
            ]

            # The rank is held in its own Series rather than written into the
            # slice, so nothing is assigned onto a view of `new_tmp`.
            # Dense ranking keeps ties, so a customer can retain more than
            # TOP_K rows here -- presumably truncated by mapk(k=TOP_K); confirm.
            score_rank = (
                cut_new_tmp.groupby('customer_id')[new_value_col]
                           .rank(method='dense', ascending=False)
            )
            cut_new_tmp = cut_new_tmp.loc[score_rank <= TOP_K]

            # Best-scoring article first within each customer.
            purchase_df = cut_new_tmp.sort_values(
                ['customer_id', new_value_col], ascending=False
            )

            # '0' prefix: presumably restores the leading zero lost when
            # article_id was parsed as a number -- confirm against the raw data.
            article_codes = '0' + purchase_df['article_id'].astype(str)

            # ' '.join replaces string concatenation through builtin `sum`
            # followed by a strip; the resulting strings are identical.
            purchase_df = (
                article_codes.groupby(purchase_df['customer_id'])
                             .agg(' '.join)
                             .rename('prediction')
                             .reset_index()
            )

            # Right join keeps every validation customer; those without any
            # candidate get an empty prediction string.
            merged = pd.merge(purchase_df, validation_df, on='customer_id', how='right')
            # Plain assignment: `merged['prediction'].fillna('', inplace=True)`
            # acts on a temporary and does nothing under pandas Copy-on-Write.
            merged['prediction'] = merged['prediction'].fillna('')

            metric = mapk(
                merged['article_id'].map(str.split),
                merged['prediction'].map(str.split),
                k=TOP_K,
            )
            results.append([col, fill_val, cut_off, metric])
            print(col, fill_val, cut_off, metric)

            # Release the per-iteration frames before the next variant.
            del merged, purchase_df, cut_new_tmp, score_rank, article_codes
            gc.collect()
            
            

rcp mean 0.0 0.02308062332794654
rcp mean 0.1 0.023078293595172068
rcp mean 0.25 0.023059898694558104
rcp min 0.0 0.02299729543487087
rcp min 0.1 0.0229949657020964
rcp min 0.25 0.02297632919956508
rcp zero 0.0 0.022704523458897598
rcp zero 0.1 0.022591492782203612
rcp zero 0.25 0.022578524720847513
rcp_normal mean 0.0 0.020040188741149826
rcp_normal mean 0.1 0.019655375268956363
rcp_normal mean 0.25 0.01964251075127913
rcp_normal min 0.0 0.019543588706985273
rcp_normal min 0.1 0.01904355244777194
rcp_normal min 0.25 0.01902906574579249
rcp_normal zero 0.0 0.019543588706985273
rcp_normal zero 0.1 0.01904355244777194
rcp_normal zero 0.25 0.01902906574579249


In [9]:
# Tabulate the collected runs: one row per [col, fill_val, cut_off, metric] entry.
result_columns = ['col', 'fill_val', 'cut_off', 'map@12']
results_df = pd.DataFrame(results)
results_df.columns = result_columns

In [10]:
# Best-scoring configuration first.
results_df.sort_values(by='map@12', ascending=False)

Unnamed: 0,col,fill_val,cut_off,map@12
0,rcp,mean,0.0,0.023081
1,rcp,mean,0.1,0.023078
2,rcp,mean,0.25,0.02306
3,rcp,min,0.0,0.022997
4,rcp,min,0.1,0.022995
5,rcp,min,0.25,0.022976
6,rcp,zero,0.0,0.022705
7,rcp,zero,0.1,0.022591
8,rcp,zero,0.25,0.022579
9,rcp_normal,mean,0.0,0.02004
