# Alternating Least Squares (ALS)

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from interaction_table import orders_weigher, InteractionTable
from h3_index import H3Index

In [2]:
#!pip install fastparquet
# Build the H3 lookup used throughout the notebook. Later cells read
# `h3index.valid` (to filter validation rows) and `h3index.h3_to_chains`
# (to restrict recommendations to the chains listed for an H3 cell).
# NOTE(review): the pickle presumably stores an h3 -> chains mapping — confirm in h3_index.
h3index = H3Index('../data/raw/h3_to_chains.pkl')

In [3]:
def get_clicks():
    """Read the click log stored at ``../data/click.parquet``.

    Returns:
        The DataFrame produced by ``pd.read_parquet`` for that file.
    """
    return pd.read_parquet('../data/click.parquet')

def get_orders():
    """Read the orders stored at ``../data/orders.parquet``.

    Returns:
        The orders DataFrame with its ``customer_id`` column renamed to
        ``user_id``.
    """
    orders = pd.read_parquet('../data/orders.parquet')
    return orders.rename(columns={"customer_id": "user_id"})

In [4]:
def orders_weigher_without_normalization(orders_df):
    """Apply ``orders_weigher`` to *orders_df* with its second argument ``False``.

    Single-argument adapter so the weigher can be passed around as a callback.
    NOTE(review): going by this wrapper's name, the flag switches weight
    normalization off — confirm against ``interaction_table.orders_weigher``.
    """
    normalize = False
    return orders_weigher(orders_df, normalize)

def orders_weigher_with_normalization(orders_df):
    """Apply ``orders_weigher`` to *orders_df* with its second argument ``True``.

    Single-argument adapter so the weigher can be passed around as a callback.
    NOTE(review): going by this wrapper's name, the flag switches weight
    normalization on — confirm against ``interaction_table.orders_weigher``.
    """
    normalize = True
    return orders_weigher(orders_df, normalize)

In [22]:
# Build the interaction table from orders only, weighted without normalization.
# NOTE(review): the two `None` positionals presumably fill the click loader /
# click weigher slots, which are unused here — confirm against InteractionTable.
interactions = InteractionTable(
    None,
    get_orders,
    None,
    orders_weigher_without_normalization,
    alpha=0,
    test_slice=100_000,
)

Orders df loaded: size=15797147,  uniq_users=3385197,  uniq_chains=23890
Orders weighter: use user avg orders per chain as weight
            user_id      chain_id        weight
count  8.011387e+06  8.011387e+06  8.011387e+06
mean   4.043872e+07  3.385702e+04  1.971837e+00
std    2.132381e+07  1.560430e+04  6.286035e+01
min    0.000000e+00  9.000000e+00  1.000000e+00
25%    2.925719e+07  2.780800e+04  1.000000e+00
50%    4.542840e+07  3.105700e+04  1.000000e+00
75%    5.577489e+07  4.385200e+04  2.000000e+00
max    7.213902e+07  7.332400e+04  1.690410e+05
Orders df weighted: size=8011387, uniq_users=3385197, uniq_chains=23890
Interaction df len for test:  236862


In [23]:
# Load the validation set, keep only the columns used below and align the
# user column name with the interaction table (`customer_id` -> `user_id`).
val_df = (
    pd.read_pickle('../data/raw/test_VALID.pkl')[['customer_id', 'h3', 'chain_id']]
    .rename(columns={"customer_id": "user_id"})
)
val_df['user_id'] = val_df['user_id'].astype(int)
print(len(val_df))

# Drop rows the model cannot score: H3 cells, users or chains that are missing
# from the lookups. The remaining row count is printed after each filter.
for _row_filter in (
    'h3 in @h3index.valid',
    'user_id in @interactions.user_index',
    'chain_id in @interactions.chain_index',
):
    val_df = val_df.query(_row_filter)
    print(len(val_df))

# One row per (user_id, h3) pair, holding the set of chain ids seen for it.
val_df = pd.pivot_table(
    val_df,
    values=['chain_id'],
    index=['user_id', 'h3'],
    aggfunc={'chain_id': set},
).reset_index()
val_df.head()

2300001
2293762
23954
18959


Unnamed: 0,user_id,h3,chain_id
0,392,891181b655bffff,{28795}
1,538,8911aa79667ffff,{39232}
2,600,8911aa44d53ffff,{2046}
3,1695,8911aa7a0a7ffff,{4363}
4,10053,8911aa45d6fffff,{28720}


### Если h3 пользователя неизвестен, то можно брать следующий в иерархии h3 (более крупный)

In [24]:
def predict(model, user_id, h3, thr=0.9, top_k=10, filter_already_liked_items=True):
    """Recommend chains for one user, restricted to the chains of one H3 cell.

    Args:
        model: Fitted recommender exposing ``recommend``; its result is
            iterated as ``(item_index, score)`` pairs.
        user_id: External user id, looked up in ``interactions.user_index``.
        h3: H3 cell id, looked up in ``h3index.h3_to_chains``.
        thr: Only recommendations with a score strictly above this are kept.
        top_k: Passed to ``model.recommend`` as ``N``.
        filter_already_liked_items: Forwarded unchanged to ``model.recommend``.

    Returns:
        List of external chain ids, in the order the model returned them.

    Raises:
        KeyError: If ``user_id`` or ``h3`` is missing from its lookup, assuming
            both are plain dicts (TODO confirm).
    """
    internal_user = interactions.user_index[user_id]
    cell_chains = h3index.h3_to_chains[h3]

    # Every known chain that is not listed for this cell is excluded up front.
    excluded = []
    for chain_id, chain_pos in interactions.chain_index.items():
        if chain_id not in cell_chains:
            excluded.append(chain_pos)

    # NOTE(review): the transposed matrix is presumably the user-by-item view
    # expected by this implicit version's `recommend` — confirm.
    candidates = model.recommend(
        internal_user,
        interactions.sparse_interaction_matrix.T,
        N=top_k,
        filter_already_liked_items=filter_already_liked_items,
        filter_items=excluded,
    )

    # Map internal item indices back to chain ids, dropping low scores.
    chosen = []
    for item_pos, score in candidates:
        if score > thr:
            chosen.append(interactions.r_chain_index[item_pos])
    return chosen

def old_items(user_id):
    """Return the set of chain ids recorded for *user_id* in the interaction table.

    Scans ``interactions.interaction_df`` for rows whose ``user_id`` matches
    and collects the distinct ``chain_id`` values; the set is empty when the
    user has no rows.
    """
    history = interactions.interaction_df
    user_rows = history[history['user_id'] == user_id]
    return set(user_rows['chain_id'].unique())

In [25]:
def metric(y_true, y_pred, y_old, at1=10, at2=30, average=True):
    """Score recommendations with a blend of precision and "new item" precision.

    Per sample the total score is
    ``new_prec@at1 + new_prec@at2 + 0.5 * (prec@at1 + prec@at2)``.
    Both the ground truth and the prediction are cut to their first ``at``
    elements before intersecting. "New" precision ignores predicted items
    that already appear in the sample's history.

    Args:
        y_true: Iterable of ground-truth item collections, one per sample.
        y_pred: Iterable of predicted item collections, one per sample.
        y_old: Iterable of per-sample histories; an entry that is neither a
            ``set`` nor a ``list`` is treated as an empty history.
        at1: First cutoff.
        at2: Second cutoff.
        average: When true return the mean over samples, otherwise the
            per-sample lists.

    Returns:
        Tuple ``(total, new, all)``, where ``new`` is the sum of the two
        new-precisions and ``all`` the sum of the two plain precisions.
    """
    totals, new_only, overall = [], [], []

    for truth, guess, seen in zip(y_true, y_pred, y_old):
        truth = list(truth)
        guess = list(guess)
        history = set(seen) if isinstance(seen, (set, list)) else set()

        # (precision, new-precision) for each of the two cutoffs.
        per_cutoff = []
        for k in (at1, at2):
            truth_k = set(truth[:k])
            guess_k = set(guess[:k])
            hit_rate = len(truth_k & guess_k) / k
            fresh_hit_rate = len((guess_k - history) & truth_k) / k
            per_cutoff.append((hit_rate, fresh_hit_rate))
        (prec1, new_prec1), (prec2, new_prec2) = per_cutoff

        totals.append(new_prec1 + new_prec2 + 0.5 * (prec1 + prec2))
        new_only.append(new_prec1 + new_prec2)
        overall.append(prec1 + prec2)

    if average:
        return np.mean(totals), np.mean(new_only), np.mean(overall)
    return totals, new_only, overall

In [26]:
# !pip install implicit
import implicit

def hyper_params(val_df, factors=60, thr=0.7, top_k=30, filter_liked=True):
    """Fit an ALS model with one configuration and print its validation scores.

    Args:
        val_df: Validation frame with ``user_id``, ``h3`` and ``chain_id``
            columns. It is modified in place: ``pred_chains`` and
            ``old_chains`` columns are written onto it (no copy is taken).
        factors: Number of latent factors for the ALS model.
        thr: Score threshold forwarded to ``predict``.
        top_k: Number of recommendations requested from ``predict``.
        filter_liked: Forwarded to ``predict`` as ``filter_already_liked_items``.

    Returns:
        None. The configuration and the ``(total, new, all)`` scores from
        ``metric`` are printed.
    """
    print('factors: ', factors, ', thr: ', thr, ', top_k: ', top_k, ', filter_liked: ', filter_liked)

    als = implicit.als.AlternatingLeastSquares(factors=factors, use_gpu=True, random_state=42)
    als.fit(interactions.sparse_interaction_matrix)

    def _recommend_for(row):
        # Recommendations for this row's user within this row's H3 cell.
        return predict(als, row.user_id, row.h3, thr, top_k, filter_liked)

    def _history_for(row):
        # Chains the user already has in the interaction table.
        return old_items(row.user_id)

    val_df['pred_chains'] = val_df.apply(_recommend_for, axis=1)
    val_df['old_chains'] = val_df.apply(_history_for, axis=1)

    scores = metric(val_df['chain_id'], val_df['pred_chains'], val_df['old_chains'])
    print('total, new, all = ', scores)
    print()

In [27]:
%%time
# Single baseline run with one configuration, timed by the cell magic above.
hyper_params(val_df, factors=60, thr=0.7, top_k=30, filter_liked=True)

factors:  60 , thr:  0.7 , top_k:  30 , filter_liked:  True


  0%|          | 0/15 [00:00<?, ?it/s]

total, new, all =  (0.023462411347517732, 0.0002836879432624114, 0.046357446808510644)

CPU times: user 56.6 s, sys: 27.8 ms, total: 56.6 s
Wall time: 56.6 s


factors:  60 , thr:  0.7 , top_k:  30 , filter_liked:  True

total, new, all =  (0.02605082142811052, 0.00023745918670228555, 0.05162672448281647)

In [None]:
# Exhaustive grid search over model size and post-processing settings.
# hyper_params fits a fresh ALS model on every call, so the model is re-fit
# for each of the 5 * 5 * 4 * 2 = 200 combinations, including those that
# differ only in thr / top_k / filter_liked.
factors_grid = [30, 40, 50, 60, 70]
thr_grid = [0.7, 0.75, 0.8, 0.85, 0.9]
top_k_grid = [5, 10, 20, 30]
filter_liked_grid = [True, False]

for factors in factors_grid:
    for thr in thr_grid:
        for top_k in top_k_grid:
            for filter_liked in filter_liked_grid:
                hyper_params(val_df, factors=factors, thr=thr,
                             top_k=top_k, filter_liked=filter_liked)