# Baselines
To validate the skill of models I create in the future, I need to implement a relatively reasonable or even strong baseline. Then, I can continually compare, tune and improve dataflow and pipeline cycles, including data processing, feature engineering and model architecture design.

In [1]:
import os
import gc
import pickle
from tqdm import tqdm
from random import sample
import matplotlib.pyplot as plt 
import math
import json
from time import process_time

import pandas as pd 
import numpy as np 

from metadata import *
from fe import *
from utils.evaluator import Evaluator

In [None]:
# Variable definitions 

## Baseline 1 - Naive History Behavior

In [None]:
# Prepare data 
df = pd.read_parquet("./raw_data.parquet", ['dt', 'chid', 'shop_tag', 'txn_cnt', 'txn_amt'])
df = df[df['shop_tag'].isin(LEG_SHOP_TAG)]
df.reset_index(drop=True, inplace=True)
submission_template = pd.read_csv("./chid_target.csv")
print(f"#Customers to predict {len(submission_template)}")

In [None]:
# Sum transaction amount based on each consumption category per customer
txn_amt_sum = df.groupby(by=['chid', 'shop_tag']).agg({'txn_amt': 'sum'})
display(txn_amt_sum.head())
txn_amt_sum.reset_index(level='shop_tag', inplace=True)

In [None]:
# Use shop_tag with top3 frequency as the prediction of those
# who haven't spent on legitimate shop tags
leg_shop_tag_mode = txn_amt_sum['shop_tag'].value_counts().index[:3]
submission = {k: [] for k in ['chid', 'top1', 'top2', 'top3']}
# pd.DataFrame(columns=['chid', 'top1', 'top2', 'top3'])
for chid in tqdm(submission_template['chid']):
    chid_txn_amt_sum = txn_amt_sum[txn_amt_sum.index == chid]
    if len(chid_txn_amt_sum) == 0:
        shop_tag_top3 = leg_shop_tag_mode
    else:
        chid_txn_top3 = chid_txn_amt_sum.nlargest(3, columns='txn_amt')
        shop_tag_top3 = chid_txn_top3['shop_tag'].values
        if len(shop_tag_top3) > 3:
            shop_tag_top3 = shop_tag_top3[:3]
        elif len(shop_tag_top3) < 3:
            shop_tag_top3 = np.pad(shop_tag_top3, 
                                   pad_width=(0, 3-len(shop_tag_top3)),
                                   mode='edge')
    submission['chid'].append(chid)
    submission['top1'].append(shop_tag_top3[0])
    submission['top2'].append(shop_tag_top3[1])
    submission['top3'].append(shop_tag_top3[2])
    
submission = pd.DataFrame.from_dict(submission)
submission.to_csv("baseline1.csv", index=False)

## Baseline 2 - Weighted Amount

In [None]:
# Prepare data 
df = pd.read_parquet("./raw_data.parquet", ['dt', 'chid', 'shop_tag', 'txn_cnt', 'txn_amt'])
submission_template = pd.read_csv("./chid_target.csv")
print(f"#Customers to predict {len(submission_template)}")

In [None]:
def n_consumes(gp):
    '''Return number of samples in each group grouped by 'shop_tag'.
    '''
    return len(gp)

def weighted_amt(df, txn_amt_mean2, n_consumes_thres):
    '''Calculate weighted amount for each legitimate shop_tag.
    
    Parameters:
        df: pd.DataFrame, information related to average transaction amount
            and number of total consumption of each shop_tag
        txn_amt_mean2: float, average over means of transaction amount of 
                       all the legitimate shop_tags
        n_consumes_thres: float, threshold of number of consumption
    
    Return:
        w_amt: pd.Series, weighted amount of each shop_tag
    '''
    v = df['n_consumes'] 
    R = df['txn_amt_mean']
    w_amt = ((v/(v+n_consumes_thres) * R) + 
            (n_consumes_thres/(n_consumes_thres+v) * txn_amt_mean2))

    return w_amt
 
df_ = df.groupby(by=['shop_tag']).agg({'txn_amt': [np.mean], 'txn_cnt': ['count']})   # 'count' can be replaced
                                                                                      # by n_consumes
df_.columns = ['txn_amt_mean', 'n_consumes']
txn_amt_mean2 = df_['txn_amt_mean'].mean()   # Take mean of mean of transaction amount
n_consumes_thres = df_.quantile(0.8)['n_consumes']   # Threshold indicating minimum number of consumptions 
                                                     # needed for shop_tag to be taken in 
df_ = df_[df_['n_consumes'] >= n_consumes_thres]
df_['weighted_amt'] = weighted_amt(df_, txn_amt_mean2, n_consumes_thres)
df_.sort_values(by=['weighted_amt'], ascending=False, inplace=True)

In [None]:
shop_tags_top3 = [shop_tag for shop_tag in df_.index if shop_tag in LEG_SHOP_TAG][:3]
submission = np.tile(shop_tags_top3, reps=(len(submission_template), 1))
submission = pd.DataFrame(submission, columns=[f'top{k}' for k in range(1, 4)])
submission.insert(0, column='chid', value=submission_template['chid'])
submission.to_csv("./baseline2.csv", index=False)

## Baseline 3 - TIFU-KNN Based 
### Concept
1. Get predicting client representation using the concept of TIFU-KNN.
2. Select the most potential `shop_tag`s that each client is willing to consume.
    * Fixed threshold of #`shop_tag`s
    * Client-specific threshold of #`shop_tag`s based on average #`shop_tag`s consumed per month 
    * Train separate classifiers

In [2]:
def get_pif(purch_map, t1, t2):
    '''Return personalized item frequency computed from the given time
    interval.
    
    Parameters:
        purch_map: ndarray, purchasing map indicating purchasing the 
                   shop_tag or not (i.e., represented by 0/1)
        t1: int, time lower bound
        t2: int, time upper bound
    
    Return:
        pif: ndarray, personalized item frequency vector
    '''
    # Align with array index
    purch_map = purch_map[t1:t2]
    pif = np.sum(purch_map)
    return pif

class CliVecGenerator:
    def __init__(self, purch_map_path, t1, t2, 
                 gp_size, decay_wt_g, decay_wt_b):
        with open(purch_map_path, 'rb') as f:
            self.purch_maps = pickle.load(f)
        self.t1 = t1
        self.t2 = t2  
        self.gp_size = gp_size
        self.decay_wt_g = decay_wt_g
        self.decay_wt_b = decay_wt_b
        self._setup()
        
    def get_client_vec(self, chid):
        '''Return the client vector represented by fusing repeated purchase
        pattern and collaborative one.

        Parameters:
            chid: int, client identifier

        Return:
            client_vec: ndarray, client vector representation
        '''
        purch_map = self.purch_maps[chid][self.t1:self.t2]
        if self.first_gp_size != 0:
            first_gp = purch_map[:self.first_gp_size]
            first_gp = first_gp * self.wt_g[0]
            first_gp = np.einsum('ij, i->j', first_gp, self.wt_b[self.first_gp_size:])
            
        normal_gps = np.reshape(purch_map[self.first_gp_size:], 
                                self.normal_gp_shape)   
        normal_gps = np.einsum('ijk, i->jk', normal_gps, self.normal_gp_wt)
        normal_gps = np.einsum('ij, i->j', normal_gps, self.wt_b)
        client_vec = normal_gps if self.first_gp_size == 0 else first_gp + normal_gps
    
        return client_vec#np.expand_dims(client_vec, axis=0)
    
    def _setup(self):
        self.n_baskets = self.t2 - self.t1   # See one month as one basket
                                             # time interval is like [t1, t2)
        self.n_gps = math.ceil(self.n_baskets / self.gp_size)
        self.wt_g = [pow(self.decay_wt_g, p) for p in range(self.n_gps-1, -1, -1)]
        self.wt_b = [pow(self.decay_wt_b, p) for p in range(self.gp_size-1, -1, -1)]
        
        self.first_gp_size = self.n_baskets % self.gp_size
        if self.first_gp_size == 0:
            # If each group has the same size
            self.normal_gp_shape = (self.n_gps, self.gp_size, -1)
            self.normal_gp_wt = self.wt_g
        else:
            self.normal_gp_shape = (self.n_gps-1, self.gp_size, -1)   # Ignore the first gp
            self.normal_gp_wt = self.wt_g[1:]   # Ignore the first gp

In [4]:
def cv(params):
    '''Do cross-validation and return the performance.
    
    Parameters:
        params: dict, hyperparameters used in the current process
    
    Return:
        NDCGs: list, NDCGs of different folds
    '''
    NDCGs = []
    for t_interval in [(0, 23)]:#[(0, 20), (0, 21), (0, 22)]:
        cli_vecs = get_cli_vecs(t1=t_interval[0], 
                                t2=t_interval[1], 
                                gp_size=params['gp_size'],
                                decay_wt_g=params['decay_wt_g'], 
                                decay_wt_b=params['decay_wt_b'])
        
        pred_vecs = get_pred_vecs(cli_vecs=cli_vecs, 
                                  n_neighbor_candidates=params['n_neighbor_candidates'],
                                  sim_measure=params['sim_measure'],
                                  k=params['k'],
                                  alpha=params['alpha'])
        
        t_range = (t_interval[0]+1, t_interval[1]+1)   # Algin with original 'dt' values
        final_ranks_pred = get_final_ranks(pred_vecs, t_range=t_range)
        evaluator = Evaluator(data_path="./data/raw/raw_data.parquet", 
                              pred=final_ranks_pred, 
                              t_next=t_interval[1]+1)
        NDCGs.append(evaluator.evaluate())
    
    return final_ranks_pred, NDCGs

def get_cli_vecs(t1, t2, gp_size, decay_wt_g, decay_wt_b):
    cli_vec_generator = CliVecGenerator("./data/processed/purch_maps.pkl", 
                                        t1=t1, 
                                        t2=t2, 
                                        gp_size=gp_size, 
                                        decay_wt_g=decay_wt_g, 
                                        decay_wt_b=decay_wt_b)
    cli_vecs = {}
    for chid in tqdm(cli_vec_generator.purch_maps.keys()):
        cli_vecs[chid] = cli_vec_generator.get_client_vec(chid)
    
    return cli_vecs

def get_pred_vecs(cli_vecs, n_neighbor_candidates, sim_measure, k, alpha):
    pred = {}
    cli_map = np.array([v for v in cli_vecs.values()])
    
    for chid, target_vec in tqdm(cli_vecs.items()):
        sim_map = {}
        un = np.zeros(N_SHOP_TAGS)
        neighbor_candidates = sample(range(N_CLIENTS), n_neighbor_candidates)
        
#         neighbor_candidates = sample(cli_vecs.keys(), n_neighbor_candidates)
        
#         print(neighbor_candidates)
#         t1 = process_time()
        neighbor_mat = cli_map[neighbor_candidates]
#         neighbor_mat = [cli_vecs[chid_] for chid_ in neighbor_candidates 
#                         if chid != chid_]
#         neighbor_mat = np.array(neighbor_mat)
#         print(neighbor_mat.shape)
        
#         t2 = process_time()
#         print(f"Neighbor matrix {t2-t1}sec")

#         t1 = process_time()
        
        if sim_measure == 'cos':
            dot_sim = np.matmul(neighbor_mat, target_vec)
            target_norm = np.linalg.norm(target_vec)
            neighbor_norm = np.linalg.norm(neighbor_mat, axis=1)
            sim_vec = dot_sim / (target_norm * neighbor_norm) 
        elif sim_measure == 'ed':
            vec_sub = neighbor_mat - target_vec
            sim_vec = np.linalg.norm(vec_sub, axis=1)
#         t2 = process_time()
#         print(f"Sim measure {t2-t1}sec")
        
#         t1 = process_time()
        
        sim_map = {chid_: sim for chid_, sim in zip(neighbor_candidates, sim_vec)}
        sim_map = dict(sorted(sim_map.items(), 
                              key=lambda item: item[1], 
                              reverse=True))
        neighbors = list(sim_map.keys())[:k]
        
#         t2 = process_time()
#         print(f"Take topk {t2-t1}sec")
        
#         t1 = process_time()
        
        for n in neighbors:
            un += cli_vecs[n+int(1e7)]
        un = un / k
        pred[chid] = alpha*target_vec + (1-alpha)*un
        
#         t2 = process_time()
#         print(f"Final pred {t2-t1}sec")
        del sim_map, un, neighbor_candidates, neighbor_mat, neighbors
    
    return pred

def get_final_ranks(pred, t_range):
    df = pd.read_parquet("./data/raw/raw_data.parquet", 
                         columns=['dt', 'chid', 'shop_tag', 'txn_amt'])
    avg_shop_tags = get_avg_shop_tags_per_month(df[['dt', 'chid', 'shop_tag']], t_range)
    avg_txn_amt = get_avg_txn_amt_per_basket(df, t_range)
    
    final_ranks = {col: [] for col in ['chid', 'top1', 'top2', 'top3']}
    for chid, pred_vec in tqdm(pred.items()):
        shop_tag_top3 = {}
        k = round(avg_shop_tags[chid], 0)
        txn_amt = avg_txn_amt[chid]
        shop_tags_ranked = np.argsort(pred_vec)[::-1]   # Notice that this is idx list
        try:
            shop_tags_topk = shop_tags_ranked[:int(k)]
        except:
            print(k)
            break
        txn_amt_topk = {shop_tag: txn_amt[shop_tag] for shop_tag in shop_tags_topk}
        shop_tags_topk_ranked = dict(
                                    sorted(txn_amt_topk.items(), 
                                           key=lambda x: x[1], 
                                           reverse=True)
                                ).keys()
        for shop_tag in shop_tags_topk_ranked:
            if shop_tag+1 in LEG_SHOP_TAGS:
                shop_tag_top3[f'top{len(shop_tag_top3)+1}'] = shop_tag+1
            if len(shop_tag_top3) == 3:
                # If top3 shop tags have been captured so far
                break
        if len(shop_tag_top3) < 3:
            for shop_tag in shop_tags_ranked[int(k):]:
                if shop_tag+1 in LEG_SHOP_TAGS:
                    shop_tag_top3[f'top{len(shop_tag_top3)+1}'] = shop_tag+1
                if len(shop_tag_top3) == 3:
                    # If top3 shop tags have been captured so far
                    break
        shop_tag_top3['chid'] = chid
        for k, v in shop_tag_top3.items():
            final_ranks[k].append(v)

        del shop_tag_top3, txn_amt, shop_tags_ranked, \
            shop_tags_topk, txn_amt_topk, shop_tags_topk_ranked
    
    final_ranks = pd.DataFrame(final_ranks)
    
    return final_ranks

In [None]:
{'gp_size': 3, 'decay_wt_g': 0.8, 'decay_wt_b': 0.9, 'alpha': 0.9, 'sim_measure': 'cos', 'k': 100, 'n_neighbor_candidates': 250}

In [None]:
from itertools import product
param_sets = {
    'gp_size': [3, 6],
    'decay_wt_g': [0.8, 0.5],
    'decay_wt_b': [0.9, 0.6],
    'alpha': [0.9, 0.6, 0.3],
    'sim_measure': ['cos', 'ed'],
    'k': [100, 500],
    'n_neighbor_candidates': [250, 1000],
}

perf = []
grid = list(product(*param_sets.values()))

for params in grid:
    params = {k: v for k, v in zip(param_sets.keys(), params)}
    if params['k'] > params['n_neighbor_candidates']:
        continue
    elif params in done:
        continue
    ndcg = cv(params)
    print(f"{params}: {ndcg}")
    perf.append((params, ndcg))

with open("./perf2.pkl", 'wb') as f:
    pickle.dump(perf, f)

  2%|▏         | 11614/500000 [00:00<00:08, 56338.82it/s]

In [39]:
param, perf_best = {}, 0
for p in perf:
    if np.mean(p[1]) > perf_best:
        param = p[0]
        perf_best = np.mean(p[1])

In [5]:
final_ranks, ndcg = cv({'gp_size': 3, 'decay_wt_g': 0.5, 'decay_wt_b': 0.9, 'alpha': 0.9, 'sim_measure': 'cos', 'k': 100, 'n_neighbor_candidates': 1000})

100%|██████████| 500000/500000 [00:08<00:00, 57165.45it/s]
  sim_vec = dot_sim / (target_norm * neighbor_norm)
100%|██████████| 500000/500000 [07:58<00:00, 1044.50it/s]
100%|██████████| 500000/500000 [00:06<00:00, 78528.43it/s]
100%|██████████| 500000/500000 [00:05<00:00, 91615.27it/s]


In [7]:
final_ranks

Unnamed: 0_level_0,top1,top2,top3
chid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10000000,10,12,37
10000001,48,37,15
10000002,22,37,15
10000003,37,22,10
10000004,37,12,36
...,...,...,...
10499995,10,37,2
10499996,15,48,36
10499997,48,37,15
10499998,10,39,19


In [8]:
with open("./data/gt/t_24.pkl", 'rb') as f:
    gt = pickle.load(f)

In [9]:
gt

{10000000: array([11660.80755943,  9156.08603284,     0.        ]),
 10000001: array([10223.42143207,  7060.14289389,  4435.60049348]),
 10000002: array([0., 0., 0.]),
 10000003: array([0., 0., 0.]),
 10000004: array([0., 0., 0.]),
 10000005: array([10062.52376493,  6221.65630882,     0.        ]),
 10000006: array([9965.79521444,    0.        ,    0.        ]),
 10000007: array([0., 0., 0.]),
 10000008: array([18212.19008094,  7240.64634237,  4443.37985181]),
 10000009: array([3238.34042857,    0.        ,    0.        ]),
 10000010: array([7278.54850205, 5773.03648466, 2497.66593316]),
 10000011: array([0., 0., 0.]),
 10000012: array([98684.47845571,  3511.20972929,     0.        ]),
 10000013: array([4110.60911278, 3046.42381253,    0.        ]),
 10000014: array([8973.26263599, 8019.90666639, 5773.03648466]),
 10000015: array([11916.86925452,  8887.81278758,     0.        ]),
 10000016: array([3115.51743225,    0.        ,    0.        ]),
 10000017: array([797.16566313,   0.      

In [None]:
exp_version = 3
cfg = {
    't1': 0,
    't2': 23,
    'gp_size': 3,   # Take 3 to represent seasonal effect under the premise 
                    # that time point represents 'month'
    # Changes across groups (seasons) may be dramatic, 
    # so take faster decaying (smaller wt)
    'decay_wt_g': 0.7,
    'decay_wt_b': 0.9,
    'alpha': 0.7,
    'sim_measure': 'cos',   # Similarity measurement
    'k': 50,   # #Nearest neighbors
    'n_neighbor_candidates': 250   # #Candidate neighbors to consider
}
# with open(f"./exp/tifu-knn/version{v}/cfg.json", 'w') as f:
#     json.dump(cfg, f)

In [None]:
cli_vec_generator = CliVecGenerator("./data/processed/purch_maps.pkl", 
                                    t1=cfg['t1'], 
                                    t2=cfg['t2'], 
                                    gp_size=cfg['gp_size'], 
                                    decay_wt_g=cfg['decay_wt_g'], 
                                    decay_wt_b=cfg['decay_wt_b'])
cli_vecs = {}
for chid in tqdm(cli_vec_generator.purch_maps.keys()):
    cli_vecs[chid] = cli_vec_generator.get_client_vec(chid)

In [None]:
cosine_sim = lambda v1, v2: np.dot(v1, v2) / (np.linalg.norm(v1)*np.linalg.norm(v2))
euclidean_sim = lambda v1, v2: np.linalg.norm(v1 - v2)

k = cfg['k']
alpha = cfg['alpha']

pred = {}
for chid, target_vec in tqdm(cli_vecs.items()):
#     target_vec = np.squeeze(target_vec)
    sim_map = {}
    un = np.zeros(N_SHOP_TAGS)
    neighbor_candidates = sample(cli_vecs.keys(), cfg['n_neighbor_candidates'])
    
#     neighbor_mat = [np.expand_dims(cli_vecs[chid_], axis=0) for chid_ in neighbor_candidates 
#                     if chid != chid_]
    neighbor_mat = [cli_vecs[chid_] for chid_ in neighbor_candidates 
                    if chid != chid_]
    neighbor_mat = np.array(neighbor_mat)
#     neighbor_mat = np.array(neighbor_mat)
#     neighbor_mat = np.vstack(neighbor_mat)
#     neighbor_mat = np.concatenate(neighbor_mat, axis=0)
    
    if cfg['sim_measure'] == 'cos':
        dot_sim = np.matmul(neighbor_mat, target_vec)
        target_norm = np.linalg.norm(target_vec)
        neighbor_norm = np.linalg.norm(neighbor_mat, axis=1)
        sim_vec = dot_sim / (target_norm * neighbor_norm) 
        sim_map = {chid_: sim for chid_, sim in zip(neighbor_candidates, sim_vec)}
    elif cfg['sim_measure'] == 'ed':
        pass
    
#     for chid_  in neighbor_candidates:
#         if chid == chid_:
#             continue
#         sim_map[chid_] = cosine_sim(target_vec, cli_vecs[chid_])
     
#     t2 = process_time()
#     print(f"Sim measure takes {t2-t1}s.")
    
    sim_map = dict(sorted(sim_map.items(), 
                          key=lambda item: item[1], 
                          reverse=True))
    neighbors = list(sim_map.keys())[:k]
    for n in neighbors:
        un += cli_vecs[n]#np.squeeze(cli_vecs[n])
    un = un / k
    pred[chid] = alpha*target_vec + (1-alpha)*un
    del sim_map, un, neighbor_candidates, neighbor_mat, neighbors

# with open("./tifu_knn_v2.pkl", 'wb') as f:
#     pickle.dump(pred, f)

In [None]:
with open("./tmp.pkl", 'rb') as f:
    pred = pickle.load(f)

In [None]:
# Prepare data
df = pd.read_parquet("./data/raw/raw_data.parquet", columns=['dt', 'chid', 'shop_tag', 'txn_amt'])
# with open("./tifu_knn_v2.pkl", 'rb') as f:
#     pred =  pickle.load(f)


In [None]:
# Generate the final ranks
k = None   # Fixed number indicating that top-k potential items will be considered 
t_range = (1, 24)   # Set time interval to generate features
                    # Before coming up with val scheme, set to full
                    # And observe performance using leaderboard (no cv)
                    # tmp solution (bad)

avg_shop_tags = get_avg_shop_tags_per_month(df[['dt', 'chid', 'shop_tag']], t_range)
avg_txn_amt = get_avg_txn_amt_per_basket(df, t_range)

In [None]:
# 
submission_template = pd.read_csv("./data/raw/chid_target.csv")
submission = {col: [] for col in submission_template.columns}
for chid, pred_vec in tqdm(pred.items()):
    shop_tag_top3 = {}
    k = round(avg_shop_tags[chid], 0)
    txn_amt = avg_txn_amt[chid]
    shop_tags_ranked = np.argsort(pred_vec)[::-1]   # Notice that this is idx list
    try:
        shop_tags_topk = shop_tags_ranked[:int(k)]
    except:
        print(k)
        break
    txn_amt_topk = {shop_tag: txn_amt[shop_tag] for shop_tag in shop_tags_topk}
    shop_tags_topk_ranked = dict(
                                sorted(txn_amt_topk.items(), 
                                       key=lambda x: x[1], 
                                       reverse=True)
                            ).keys()
    for shop_tag in shop_tags_topk_ranked:
        if shop_tag+1 in LEG_SHOP_TAGS:
            shop_tag_top3[f'top{len(shop_tag_top3)+1}'] = shop_tag+1
        if len(shop_tag_top3) == 3:
            # If top3 shop tags have been captured so far
            break
    if len(shop_tag_top3) < 3:
        for shop_tag in shop_tags_ranked[int(k):]:
            if shop_tag+1 in LEG_SHOP_TAGS:
                shop_tag_top3[f'top{len(shop_tag_top3)+1}'] = shop_tag+1
            if len(shop_tag_top3) == 3:
                # If top3 shop tags have been captured so far
                break
    shop_tag_top3['chid'] = chid
    for k, v in shop_tag_top3.items():
        submission[k].append(v)

    del shop_tag_top3, txn_amt, shop_tags_ranked, \
        shop_tags_topk, txn_amt_topk, shop_tags_topk_ranked
    
pred = pd.DataFrame(submission)
# submission.to_csv("baseline3_v2.csv", index=False)

In [None]:
evaluator = Evaluator(data_path="./data/raw/raw_data.parquet", 
                      pred=pred, 
                      t_next=24)

In [None]:
NDCG_avg = evaluator.evaluate()
NDCG_avg

In [None]:
# Hyperparameter tuning
params = {
    'gp_size': 3,   # Take 3 to represent seasonal effect under the premise 
                    # that time point represents 'month'
    # Changes across groups (seasons) may be dramatic, 
    # so take faster decaying (smaller wt)
    'decay_wt_g': 0.7,
    'decay_wt_b': 0.9,
    'alpha': 0.7,
    'sim_measure': 'cos',   # Similarity measurement
    'k': 50,   # #Nearest neighbors
    'n_neighbor_candidates': 250   # #Candidate neighbors to consider
}