# Alternating Least Square (ALS)

https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-1-knn-item-based-collaborative-filtering-637969614ea

https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-2-alternating-least-square-als-matrix-4a76c58714a1

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from interaction_table import InteractionTable

In [2]:
#!pip install fastparquet

In [3]:
def get_clicks():
    path = '../data/clicks/click.parquet'
    return pd.read_parquet(path)

def get_orders():
    path = '../data/orders/orders.parquet'
    df = pd.read_parquet(path)
    df = df.rename(columns={"customer_id": "user_id"})
    return df

In [4]:
def orders_weigher(orders_df):
    """
    (chain, user) -> interaction weight
    """
    print('Orders weighter: use user avg orders per chain as weight')
    orders_df['weight'] = 1
    orders_df = orders_df[['user_id', 'chain_id', 'weight']]
    
    total_user_stats = orders_df.groupby(['user_id']).sum()
    total_user_stats = total_user_stats.reset_index()[['user_id', 'weight']]
    assert total_user_stats.weight.isnull().sum() == 0

    user_chain_stats = orders_df.groupby(['user_id', 'chain_id']).sum()
    user_chain_stats = user_chain_stats.reset_index()[['user_id', 'chain_id', 'weight']]
    assert sorted(total_user_stats.user_id.unique()) == sorted(user_chain_stats.user_id.unique())
    assert user_chain_stats.weight.isnull().sum() == 0
    
    user_chain_stats = user_chain_stats.merge(total_user_stats, left_on='user_id', right_on='user_id', suffixes=('_per_chain', '_total'))
    user_chain_stats['weight_per_chain'] /= user_chain_stats['weight_total']
    user_chain_stats = user_chain_stats.rename(columns={'user_id_per_chain': 'user_id', 'weight_per_chain': 'weight'})
    orders_df = user_chain_stats[['user_id', 'chain_id', 'weight']]
    assert len(orders_df) == len(user_chain_stats)
    assert orders_df.weight.isnull().sum() == 0
    print(orders_df.describe())
    
    # post-processing
    # orders_df['weight'] = 1 + np.log(1 + orders_df.weight) * 10
    return orders_df

In [5]:
interactions = InteractionTable(None, get_orders, None, orders_weigher, 0)

Orders df loaded: size=14862643,  uniq_users=3973431,  uniq_chains=23887
Orders weighter: use user avg orders per chain as weight
            user_id      chain_id        weight
count  8.499794e+06  8.499794e+06  8.499794e+06
mean   3.989632e+07  3.527085e+04  4.674738e-01
std    2.128882e+07  1.553233e+04  3.516978e-01
min    0.000000e+00  9.000000e+00  3.270710e-06
25%    2.897376e+07  2.872000e+04  1.666667e-01
50%    4.474124e+07  3.204900e+04  3.333333e-01
75%    5.502611e+07  4.660900e+04  1.000000e+00
max    7.213902e+07  7.332400e+04  1.000000e+00
Orders df weighted: size=8499794, uniq_users=3973431, uniq_chains=23887


### Если h3 пользователя неизвестен, то можно брать следующий в иерархии h3 (более крупный)

In [6]:
# !pip install implicit
import implicit

In [7]:
model = implicit.als.AlternatingLeastSquares(factors=50)



In [8]:
model.fit(interactions.sparse_interaction_matrix)

  0%|          | 0/15 [00:00<?, ?it/s]

In [33]:
class H3Index:
    def __init__(self, h3_to_chains_path):
        self.h3_to_chains = pd.read_pickle(h3_to_chains_path)
        self.valid = set([x for x in self.h3_to_chains.keys()])
    
    def filter_by(self, h3, chains):
        return self.h3_to_chains[h3].intersection(chains)

h3index = H3Index('../data/h3_to_chains.pkl')

In [66]:
val_df = pd.read_pickle('../data/test_VALID.pkl')
val_df = val_df[['customer_id', 'h3', 'chain_id']]
val_df = val_df.rename(columns={"customer_id": "user_id"})
val_df.user_id = val_df.user_id.astype(int)
val_df.head()

Unnamed: 0,user_id,h3,chain_id
0,386249,8911aa7aa1bffff,29454
1,45090350,8911aa71a57ffff,48274
2,59217420,8911aa4c867ffff,45822
3,37528887,89119ec140fffff,41891
4,28001897,8925716ed0bffff,42181


In [67]:
print(len(val_df))
val_df = val_df.query('h3 in @h3index.valid')
print(len(val_df))
val_df = val_df.query('user_id in @interactions.user_index')
print(len(val_df))
val_df = val_df.query('chain_id in @interactions.chain_index')
print(len(val_df))

2300001
2293762
1860055
1856314


In [68]:
val_df = pd.pivot_table(val_df,
                        values=['chain_id'],
                        index=['user_id', 'h3'],
                        aggfunc={'chain_id': set})
val_df = val_df.reset_index()
val_df.head()

Unnamed: 0,user_id,h3,chain_id
0,0,890b0638003ffff,{34646}
1,0,890b0638007ffff,{34646}
2,0,890b063800fffff,{34646}
3,0,890b0638023ffff,{34646}
4,0,890b0638027ffff,{34646}


In [136]:
from collections import defaultdict

test_df = val_df.head(10)

def predict(user_id, h3, thr=0.0, top_k=30, filter_already_liked_items=False):
    user_index = interactions.user_index[user_id]
    top = model.recommend(user_index,
                          interactions.sparse_interaction_matrix.T,
                          N=top_k,
                          filter_already_liked_items=filter_already_liked_items)
    top = [interactions.r_chain_index[x] for x, score in top if score > thr]
    return h3index.filter_by(h3, top)

def func(row):
    return predict(row.user_id, row.h3)

In [137]:
%%time
test_df['pred_chains'] = test_df.apply(lambda x: func(x), axis=1)

CPU times: user 39.2 ms, sys: 9.58 ms, total: 48.7 ms
Wall time: 14.6 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [138]:
%%time
val_df['pred_chains'] = val_df.apply(lambda x: func(x), axis=1)

CPU times: user 174 ms, sys: 57.4 ms, total: 231 ms
Wall time: 74.6 ms


In [61]:
34646 in h3index.h3_to_chains['890b0638003ffff']

True

In [13]:
def metric(y_true, y_pred, y_old, at1=10, at2=30, average=True):
    """
    new_prec@10 + new_prec@30 + 1/2 *(prec_@10 + prec@30)
    """
    scores = []
    for t, p, o in zip(y_true, y_pred, y_old):
        t = list(t)
        p = list(p)
        o = o if isinstance(o, (set, list)) else []
        
        prec1 = len(set(t[:at1]) & set(p[:at1])) / at1
        prec2 = len(set(t[:at2]) & set(p[:at2])) / at2
        new_prec1 = len((set(p[:at1]) - set(o)) & set(t[:at1])) / at1
        new_prec2 = len((set(p[:at2]) - set(o)) & set(t[:at2])) / at2

        scores.append(new_prec1 + new_prec2 + 0.5*(prec1 + prec2))

    return np.mean(scores) if average else scores

In [14]:
metric([{49344, 32449, 13698, 27490, 28720}],
       [{49344, 32449, 13698, 27490, 28720, 23376, 16277, 26878}],
       [{49344, 32449, 13698, 27490, 28720, 23376, 16277, 26878}])

0.3333333333333333