# Alternating Least Square (ALS)

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from interaction_table import orders_weigher, InteractionTable
from process_data import preprocess_orders_and_clicks, additional_filtration_orders_and_clicks
from h3_index import H3Index

In [None]:
#!pip install fastparquet
h3index = H3Index('../data/h3_to_chains.pkl')

In [3]:
# !mkdir -p ../data/moscow_slice
# preprocess_orders_and_clicks(
#     path_to_orders="../data/orders",
#     path_to_clicks="../data/clicks",
#     save_path="../data/moscow_slice",
# )

In [4]:
def get_clicks():
    path = '../data/clicks/'
    clicks = pd.read_parquet(f'{path}/clicks.parquet')
    return clicks

def get_orders():
    path = '../data/moscow_slice/'
    orders = pd.read_parquet(f'{path}/orders.parquet')
    orders = orders.rename(columns={"customer_id": "user_id"})
    clicks = pd.read_parquet(f'{path}/clicks.parquet')
    #regs = pd.read_pickle('../data/CITIES_MAPPING.pkl')
    #regs = [v for k, v in regs.items() if v > 2]
    regs = [1] # moscow
    orders, _ = additional_filtration_orders_and_clicks(orders, clicks, regs_to_filter=regs)
    return orders

In [5]:
interactions = InteractionTable(get_orders(), None)

Orders weighter: use user avg orders per chain as weight
            user_id      chain_id        weight
count  3.105842e+06  3.105842e+06  3.105842e+06
mean   3.666711e+07  3.211560e+04  1.755639e+00
std    2.148127e+07  1.517040e+04  8.204565e+01
min    0.000000e+00  9.000000e+00  1.000000e+00
25%    1.143679e+07  2.714700e+04  1.000000e+00
50%    3.991122e+07  3.006000e+04  1.000000e+00
75%    5.176011e+07  4.451900e+04  2.000000e+00
max    7.213893e+07  7.325500e+04  1.444470e+05
Orders df weighted: size=3105842, uniq_users=1394011, uniq_chains=7269


In [6]:
interactions.sparse_interaction_matrix

<7269x1394011 sparse matrix of type '<class 'numpy.int64'>'
	with 3105842 stored elements in Compressed Sparse Row format>

In [7]:
test = interactions.interaction_df[['user_id', 'weight']]
test = test.groupby('user_id').sum()
test = test.reset_index()[['user_id', 'weight']]
user_with_few_interactions = set(test[test['weight'] <= 2].user_id.unique())
ncf_interactions = interactions.interaction_df
ncf_interactions = ncf_interactions.query('user_id not in @user_with_few_interactions')
ncf_interactions.to_parquet('../data/moscow_slice/ncf_orders.parquet')
print("ncf_interactions:", len(ncf_interactions))

ncf_valid_users = set(ncf_interactions['user_id'].unique())
print("ncf_uniq_users:", len(ncf_valid_users))
ncf_valid_chains = set(ncf_interactions['chain_id'].unique())
print("ncf_uniq_chains:", len(ncf_valid_chains))

ncf_interactions: 2142482
ncf_uniq_users: 598092
ncf_uniq_chains: 7231


In [53]:
val_df = pd.read_pickle('../data/test_VALID.pkl')
val_df = val_df[['customer_id', 'h3', 'chain_id']]
val_df = val_df.rename(columns={"customer_id": "user_id"})
val_df.user_id = val_df.user_id.astype(int)
print("initial:")
print("df, uniq_users, uniq_chains:", len(val_df), len(val_df.user_id.unique()), len(val_df.chain_id.unique()))

val_df = val_df.query('h3 in @h3index.valid')
print()
print("after invalid h3 filtering:")
print("df, uniq_users, uniq_chains:", len(val_df), len(val_df.user_id.unique()), len(val_df.chain_id.unique()))

val_df = val_df.query('user_id in @ncf_valid_users')
print()
print("after invalid users filtering:")
print("df, uniq_users, uniq_chains:", len(val_df), len(val_df.user_id.unique()), len(val_df.chain_id.unique()))

val_df = val_df.query('chain_id in @ncf_valid_chains')
print()
print("after invalid chains filtering:")
print("df, uniq_users, uniq_chains:", len(val_df), len(val_df.user_id.unique()), len(val_df.chain_id.unique()))

val_df = val_df.drop_duplicates()
print()
print("after dropping duplicates:")
print("df, uniq_users, uniq_chains:", len(val_df), len(val_df.user_id.unique()), len(val_df.chain_id.unique()))

initial:
df, uniq_users, uniq_chains: 2300001 1253198 19810

after invalid h3 filtering:
df, uniq_users, uniq_chains: 2293762 1249258 19788

after invalid users filtering:
df, uniq_users, uniq_chains: 377666 145478 8991

after invalid chains filtering:
df, uniq_users, uniq_chains: 276678 126067 5077

after dropping duplicates:
df, uniq_users, uniq_chains: 202992 126067 5077


In [54]:
%%time
val_df["valid_chain"] = val_df["h3"].map(h3index.h3_to_chains)
val_df = val_df.explode("valid_chain")
val_df = val_df.query('valid_chain in @ncf_valid_chains')
val_df["h3"] = val_df["h3"].map(h3index.h3_to_index)
val_df = val_df.rename(columns={"chain_id": "test_chain_id"})
val_df = val_df.rename(columns={"valid_chain": "chain_id"})
val_df.head()

CPU times: user 1min 22s, sys: 1min 20s, total: 2min 42s
Wall time: 3min 25s


Unnamed: 0,user_id,h3,test_chain_id,chain_id
0,386249,81446,29454,9
0,386249,81446,29454,57364
0,386249,81446,29454,37
0,386249,81446,29454,41
0,386249,81446,29454,42


In [64]:
print("df, uniq_users, uniq_test_chains, uniq_chains:",
      len(val_df), len(val_df.user_id.unique()),
      len(val_df.test_chain_id.unique()), len(val_df.chain_id.unique()))

df, uniq_users, uniq_test_chains, uniq_chains: 128502135 126067 5077 7163


In [56]:
%%time
path = '../data/moscow_slice/'
val_df.to_parquet(f'{path}/ncf_val_df.parquet')

## Сколько данных тестовой выборки отсеивается в зависимости от interactions

### Clicks + orders: full

initial:
df, uniq_users, uniq_chains: 2300001 1253198 19810

after invalid h3 filtering:
df, uniq_users, uniq_chains: 2293762 1249258 19788

after invalid users filtering:
df, uniq_users, uniq_chains: 1987082 1044374 19453

after invalid chains filtering:
df, uniq_users, uniq_chains: 1984220 1043382 19118


### Orders: full

initial:
df, uniq_users, uniq_chains: 2300001 1253198 19810

after invalid h3 filtering:
df, uniq_users, uniq_chains: 2293762 1249258 19788

after invalid users filtering:
df, uniq_users, uniq_chains: 1860055 952741 19285

after invalid chains filtering:
df, uniq_users, uniq_chains: 1856314 951570 18664

### Clicks + orders: processed full

initial:
df, uniq_users, uniq_chains: 2300001 1253198 19810

after invalid h3 filtering:
df, uniq_users, uniq_chains: 2293762 1249258 19788

after invalid users filtering:
df, uniq_users, uniq_chains: 851440 415267 17272

after invalid chains filtering:
df, uniq_users, uniq_chains: 692249 369776 16446

### Orders: processed moscow

initial:
df, uniq_users, uniq_chains: 2300001 1253198 19810

after invalid h3 filtering:
df, uniq_users, uniq_chains: 2293762 1249258 19788

after invalid users filtering:
df, uniq_users, uniq_chains: 483559 212160 11404

after invalid chains filtering:
df, uniq_users, uniq_chains: 341373 172590 5297

### Orders: processed saint-peterburg
initial:
df, uniq_users, uniq_chains: 2300001 1253198 19810

after invalid h3 filtering:
df, uniq_users, uniq_chains: 2293762 1249258 19788

after invalid users filtering:
df, uniq_users, uniq_chains: 164699 62373 5801

after invalid chains filtering:
df, uniq_users, uniq_chains: 87146 46513 1384

### Orders: processed other regions

initial:
df, uniq_users, uniq_chains: 2300001 1253198 19810

after invalid h3 filtering:
df, uniq_users, uniq_chains: 2293762 1249258 19788

after invalid users filtering:
df, uniq_users, uniq_chains: 376063 178911 13700

after invalid chains filtering:
df, uniq_users, uniq_chains: 240902 136697 9381

# Выводы:
* 20% тестовых юзеров нет ни в clicks, ни в orders (cold start);
* 75% тестовых юзеров есть в orders, т.е clicks можно не рассматривать (всего 5%);
* только 30% (!!!!) тестовых юзеров остается после вызова processed_data;
* 48% -- москва, 39% -- регионы, 13% -- спб в orders после вызова processed_data;
* также есть сделать val.drop_duplicates, то отсортируется порядка 30% строк!

### Если h3 пользователя неизвестен, то можно брать следующий в иерархии h3 (более крупный)

In [None]:
val_df = pd.pivot_table(val_df,
                        values=['chain_id'],
                        index=['user_id', 'h3'],
                        aggfunc={'chain_id': set})
val_df = val_df.reset_index()

In [21]:
def predict(model, user_id, h3, thr=0.9, top_k=10, filter_already_liked_items=True):
    user_index = interactions.user_index[user_id]
    valid_chains = h3index.h3_to_chains[h3]
    filter_items = [v for k, v in interactions.chain_index.items() if k not in valid_chains]
    top = model.recommend(user_index,
                          interactions.sparse_interaction_matrix.T,
                          N=top_k,
                          filter_already_liked_items=filter_already_liked_items,
                          filter_items=filter_items)
    top = [interactions.r_chain_index[x] for x, score in top if score > thr]
    return top

def old_items(interactions_df, user_id):
    return set(interactions_df[interactions_df['user_id'] == user_id]['chain_id'].unique())

In [22]:
def metric(y_true, y_pred, y_old, at1=10, at2=30, average=True):
    """
    new_prec@10 + new_prec@30 + 1/2 *(prec_@10 + prec@30)
    """
    scores_new = []
    scores_all = []
    scores_total = []
    for t, p, o in zip(y_true, y_pred, y_old):
        t = list(t)
        p = list(p)
        o = o if isinstance(o, (set, list)) else []
        
        prec1 = len(set(t[:at1]) & set(p[:at1])) / at1
        prec2 = len(set(t[:at2]) & set(p[:at2])) / at2
        new_prec1 = len((set(p[:at1]) - set(o)) & set(t[:at1])) / at1
        new_prec2 = len((set(p[:at2]) - set(o)) & set(t[:at2])) / at2

        scores_total.append(new_prec1 + new_prec2 + 0.5 * (prec1 + prec2))
        scores_new.append(new_prec1 + new_prec2)
        scores_all.append(prec1 + prec2)

    return (np.mean(scores_total) if average else scores_total,
            np.mean(scores_new) if average else scores_new,
            np.mean(scores_all) if average else scores_all)

In [None]:
# !pip install implicit
import implicit

def hyper_params(val_df, factors=60, thr=0.7, top_k=30, filter_liked=True):
    print('factors: ', factors, ', thr: ', thr, ', top_k: ', top_k, ', filter_liked: ', filter_liked)
    model = implicit.als.AlternatingLeastSquares(factors=factors)
    model.fit(interactions.sparse_interaction_matrix)
    val = val_df
    val['pred_chains'] = val.apply(lambda x: predict(model, x.user_id, x.h3, thr, top_k, filter_liked), axis=1)
    val['old_chains'] = val.apply(lambda x: old_items(interactions.interaction_df, x.user_id), axis=1)
    scores = metric(val['chain_id'], val['pred_chains'], val['old_chains'])
    print('total, new, all = ', scores)
    print()

In [None]:
hyper_params(val_df, factors=60, thr=0.7, top_k=30, filter_liked=True)

factors:  60 , thr:  0.7 , top_k:  30 , filter_liked:  True

total, new, all =  (0.02605082142811052, 0.00023745918670228555, 0.05162672448281647)

In [None]:
for factors in [30, 40, 50, 60, 70]:
    for thr in [0.7, 0.75, 0.8, 0.85, 0.9]:
        for top_k in [5, 10, 20, 30]:
            for filter_liked in [True, False]:
               hyper_params(val_df, factors, thr, top_k, filter_liked) 

In [98]:
path = '../data/moscow_slice/'
mp_chain_to_index = pd.read_pickle(f'{path}/chain_to_index.pkl')
mp_user_to_index = pd.read_pickle(f'{path}/user_to_index.pkl')
mp_index_to_chain = {v:k for k, v in mp_chain_to_index.items()}
mp_index_to_user = {v:k for k, v in mp_user_to_index.items()}
mp_index_to_h3 = h3index.r_h3_to_index

In [99]:
path = '../data/moscow_slice/'
p_val_df = pd.read_parquet(f'{path}/processed_val_df.parquet')
# p_val_df = p_val_df.rename(columns={"h3_id": "chain_id"})
# p_val_df = p_val_df.rename(columns={"test_chain_id": "h3_id"})
# p_val_df = p_val_df.rename(columns={"chain_id": "test_chain_id"})
print("df, uniq_users, uniq_pred_chains, uniq_test_chains:",
      len(p_val_df), len(p_val_df.user_id.unique()), len(p_val_df.h3_id.unique()),
      len(p_val_df.pred_chain_id.unique()), len(p_val_df.test_chain_id.unique()))
p_val_df.head()

df, uniq_users, uniq_pred_chains, uniq_test_chains: 6066221 126067 12933 1611 5077


Unnamed: 0,user_id,h3_id,pred_chain_id,test_chain_id
0,110673,80785,856,1242
1,110673,80785,856,1320
2,223173,77538,856,1932
3,1844,80371,856,748
4,1844,80371,856,955


In [100]:
%%time
p_val_df["user_id"] = p_val_df["user_id"].map(mp_index_to_user)
p_val_df["h3_id"] = p_val_df["h3_id"].map(mp_index_to_h3)
p_val_df["pred_chain_id"] = p_val_df["pred_chain_id"].map(mp_index_to_chain)
p_val_df["test_chain_id"] = p_val_df["test_chain_id"].map(mp_index_to_chain)
p_val_df = p_val_df.rename(columns={"test_chain_id": "chain_id", "h3_id": "h3"})
print("df, uniq_users, uniq_pred_chains, uniq_test_chains:",
      len(p_val_df), len(p_val_df.user_id.unique()),
      len(p_val_df.pred_chain_id.unique()), len(p_val_df.chain_id.unique()))
p_val_df.head()

df, uniq_users, uniq_pred_chains, uniq_test_chains: 6066221 126067 1611 5077
CPU times: user 985 ms, sys: 237 ms, total: 1.22 s
Wall time: 1.34 s


Unnamed: 0,user_id,h3,pred_chain_id,chain_id
0,8194061,8911aa7966bffff,28720,31698
1,8194061,8911aa7966bffff,28720,32449
2,33560324,8911aa70957ffff,28720,36316
3,56082,8911aa78c6bffff,28720,27490
4,56082,8911aa78c6bffff,28720,29454


In [101]:
%%time
p_val_df = pd.pivot_table(p_val_df,
                        values=['chain_id', 'pred_chain_id'],
                        index=['user_id', 'h3'],
                        aggfunc={'chain_id': set, 'pred_chain_id': set})
p_val_df = p_val_df.reset_index()
p_val_df = p_val_df.rename(columns={"pred_chain_id": "pred_chains", "chain_id": "chains"})
p_val_df.head()

CPU times: user 6.05 s, sys: 427 ms, total: 6.48 s
Wall time: 6.67 s


Unnamed: 0,user_id,h3,chains,pred_chains
0,0,89118108b43ffff,{28720},"{72544, 38978, 57987, 806, 19998, 25352, 43081..."
1,0,89118134503ffff,{28720},"{30244, 30245, 30246, 1929, 34987, 28720, 3787..."
2,0,89118134513ffff,{28720},"{30244, 30245, 30246, 1929, 29259, 34987, 1502..."
3,0,89118134517ffff,{28720},"{30244, 30245, 30246, 1929, 29259, 34987, 1502..."
4,0,8911813456bffff,{28720},"{30244, 30245, 30246, 1929, 34987, 28720, 3787..."


In [104]:
%%time
p_val_df['old_chains'] = p_val_df.apply(lambda x: old_items(ncf_interactions, x.user_id), axis=1)
p_val_df.head()

CPU times: user 4min 33s, sys: 11.9 s, total: 4min 45s
Wall time: 4min 54s


Unnamed: 0,user_id,h3,chains,pred_chains,old_chains
0,0,89118108b43ffff,{28720},"{72544, 38978, 57987, 806, 19998, 25352, 43081...","{32322, 28720, 31057, 35152, 28276}"
1,0,89118134503ffff,{28720},"{30244, 30245, 30246, 1929, 34987, 28720, 3787...","{32322, 28720, 31057, 35152, 28276}"
2,0,89118134513ffff,{28720},"{30244, 30245, 30246, 1929, 29259, 34987, 1502...","{32322, 28720, 31057, 35152, 28276}"
3,0,89118134517ffff,{28720},"{30244, 30245, 30246, 1929, 29259, 34987, 1502...","{32322, 28720, 31057, 35152, 28276}"
4,0,8911813456bffff,{28720},"{30244, 30245, 30246, 1929, 34987, 28720, 3787...","{32322, 28720, 31057, 35152, 28276}"


In [106]:
scores = metric(p_val_df['chains'], p_val_df['pred_chains'], p_val_df['old_chains'])
print('total, new, all = ', scores)

total, new, all =  (0.047243359557960045, 0.01741889938024804, 0.05964892035542401)


total, new, all =  (0.05414295764939652, 0.018852887627577093, 0.07058014004363886)

In [108]:
p_val_df.head(10000)

Unnamed: 0,user_id,h3,chains,pred_chains,old_chains
0,0,89118108b43ffff,{28720},"{72544, 38978, 57987, 806, 19998, 25352, 43081...","{32322, 28720, 31057, 35152, 28276}"
1,0,89118134503ffff,{28720},"{30244, 30245, 30246, 1929, 34987, 28720, 3787...","{32322, 28720, 31057, 35152, 28276}"
2,0,89118134513ffff,{28720},"{30244, 30245, 30246, 1929, 29259, 34987, 1502...","{32322, 28720, 31057, 35152, 28276}"
3,0,89118134517ffff,{28720},"{30244, 30245, 30246, 1929, 29259, 34987, 1502...","{32322, 28720, 31057, 35152, 28276}"
4,0,8911813456bffff,{28720},"{30244, 30245, 30246, 1929, 34987, 28720, 3787...","{32322, 28720, 31057, 35152, 28276}"
...,...,...,...,...,...
9995,167962,8911aa72233ffff,{1929},"{2305, 13698, 25352, 1929, 777, 649, 27147, 31...","{1929, 30396}"
9996,167981,8911aa70e8fffff,"{32449, 31118}","{2305, 13698, 25352, 1929, 777, 649, 29454, 15...","{28720, 36432, 28795}"
9997,167988,8911aa794a7ffff,{44145},"{2305, 13698, 25352, 1929, 777, 649, 13, 29454...","{13, 29454, 32049, 44145, 9435, 52895}"
9998,168037,8911aa6208bffff,"{28720, 52625, 828, 32449}","{2305, 13698, 25352, 1929, 777, 649, 13, 29454...","{30112, 15275, 29580, 23824, 28720, 31698, 48274}"


In [109]:
p_val_df[['h3', 'pred_chains']].head()

Unnamed: 0,h3,pred_chains
0,89118108b43ffff,"{72544, 38978, 57987, 806, 19998, 25352, 43081..."
1,89118134503ffff,"{30244, 30245, 30246, 1929, 34987, 28720, 3787..."
2,89118134513ffff,"{30244, 30245, 30246, 1929, 29259, 34987, 1502..."
3,89118134517ffff,"{30244, 30245, 30246, 1929, 29259, 34987, 1502..."
4,8911813456bffff,"{30244, 30245, 30246, 1929, 34987, 28720, 3787..."


In [113]:
%%time
p_val_df.apply(lambda x: sum([t for t in x.pred_chains if t not in h3index.h3_to_chains[x.h3]]), axis=1).sum()

CPU times: user 41.5 s, sys: 175 ms, total: 41.6 s
Wall time: 42 s


0