# LightFM

In [1]:
from lightfm import LightFM
import implicit

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from interaction_table import InteractionTable
from h3_index import H3Index

In [3]:
from process_data import preprocess_orders_and_clicks, additional_filtration_orders_and_clicks
from user_features import generate_user_features

In [4]:
# Load the pre-filtered order log and the user feature table from parquet files.
orders = pd.read_parquet("../data/orders_filtered.parquet")
# NOTE(review): this frame is later indexed with .loc using the keys of
# interactions.user_to_index, so it is presumably indexed by user id — confirm.
user_features = pd.read_parquet("../data/user_features.parquet")

In [5]:
# Build the interaction table from the orders frame; the second positional argument is None.
# NOTE(review): the meaning of the second argument, alpha=0 and test_slice=100000 is defined
# in interaction_table.InteractionTable, which is not visible here — confirm before changing.
interactions = InteractionTable(orders, None, alpha=0, test_slice=100000)

Orders weighter: use user avg orders per chain as weight
            user_id      chain_id        weight
count  3.106486e+06  3.106486e+06  3.106486e+06
mean   3.666636e+07  3.212015e+04  1.755490e+00
std    2.148159e+07  1.517362e+04  8.203714e+01
min    0.000000e+00  9.000000e+00  1.000000e+00
25%    1.143635e+07  2.714700e+04  1.000000e+00
50%    3.991074e+07  3.007500e+04  1.000000e+00
75%    5.175972e+07  4.451900e+04  2.000000e+00
max    7.213893e+07  7.332400e+04  1.444470e+05
Orders df weighted: size=3106486, uniq_users=1394062, uniq_chains=7792
Interaction df len for test:  222705


In [6]:
import scipy

In [7]:
# Take every feature and compare it with its column mean:
# 1 where the user's value is strictly above the mean, 0 otherwise.
# The rows are selected once, in the iteration order of interactions.user_to_index.
# NOTE(review): this assumes that iteration order matches the row indices the models use
# (user_to_index values) — confirm in InteractionTable.
selected_user_features = user_features.loc[list(interactions.user_to_index)]
user_features_sparse = scipy.sparse.csr_matrix(
    (selected_user_features - selected_user_features.mean() > 0).astype(int)
)

In [8]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
user_features_sparse

<100000x24 sparse matrix of type '<class 'numpy.int64'>'
	with 659292 stored elements in Compressed Sparse Row format>

In [9]:
# Optional dependency, kept for reference and not executed: !pip install fastparquet
# Build the H3 index from the h3 -> chains pickle file.
# NOTE(review): the file is presumably unpickled inside H3Index; unpickling can execute
# arbitrary code, so only load it from a trusted source.
h3index = H3Index('../data/raw/h3_to_chains.pkl')

In [10]:
# Load the validation split, keep the three columns used below and align the naming
# with the interaction table (customer_id -> user_id, cast to int).
val_df = (
    pd.read_pickle('../data/raw/test_VALID.pkl')[['customer_id', 'h3', 'chain_id']]
    .rename(columns={"customer_id": "user_id"})
)
val_df["user_id"] = val_df["user_id"].astype(int)
print("Initial validation dataset size:", len(val_df))

# Apply the filters one after another, reporting the remaining row count after each:
# known H3 cells, then users and chains present in the interaction table.
for report_label, column, known_values in (
    ("Filter h3 indices that not in h3_to_chain dict", "h3", h3index.valid),
    ("Filter users", "user_id", interactions.user_to_index),
    ("Filter chains", "chain_id", interactions.chain_to_index),
):
    val_df = val_df[val_df[column].isin(known_values)]
    print(report_label, len(val_df))

# Collapse to one row per (user_id, h3) holding the set of chain ids seen for that pair.
val_df = pd.pivot_table(
    val_df,
    values=['chain_id'],
    index=['user_id', 'h3'],
    aggfunc={'chain_id': set},
).reset_index()
val_df.head()

Initial validation dataset size: 2300001
Filter h3 indices that not in h3_to_chain dict 2293762
Filter users 31104
Filter chains 22398


Unnamed: 0,user_id,h3,chain_id
0,538,8911aa79667ffff,{39232}
1,600,8911aa44d53ffff,{2046}
2,1342,8911aa09b4bffff,{28720}
3,2058,8911aa7983bffff,{197}
4,3852,8911aa73473ffff,"{51927, 7647}"


In [11]:
from collections import defaultdict
from typing import List, Optional

class TopRecommender:
    """Popularity baseline: scores a chain by its number of orders in the accepted statuses.

    Attributes:
        status_id: order status codes that count towards popularity.
        chains_to_cnt: mapping chain_id -> order count, set by ``fit``;
            chains absent from the fitted data count as 0.
    """

    def __init__(self, status_id: Optional[List[int]] = None):
        """Store the accepted status codes.

        Args:
            status_id: status codes to keep; ``None`` selects the default ``[11, 18]``.
        """
        # A fresh list per instance: a list literal used as the default value would be
        # shared by every instance and could be mutated through ``self.status_id``.
        self.status_id = [11, 18] if status_id is None else status_id

    def fit(self, orders: pd.DataFrame):
        """Count orders per chain among rows whose status is accepted.

        Args:
            orders: frame with ``status_id``, ``chain_id`` and ``order_id`` columns.

        Returns:
            ``self``, so that construction and fitting can be chained.
        """
        accepted = orders[orders.status_id.isin(self.status_id)]
        counts = accepted.groupby("chain_id", sort=False)["order_id"].size().to_dict()
        # Kept as a defaultdict so direct lookups of unknown chains still yield 0.
        self.chains_to_cnt = defaultdict(int, counts)

        return self

    def predict(self, chain_ids: List[int]):
        """Return the order count of each chain, in input order (0 for unknown chains)."""
        # .get() instead of [] so that scoring unknown chains does not insert them
        # into the defaultdict as a side effect.
        return [self.chains_to_cnt.get(chain_id, 0) for chain_id in chain_ids]
    
# Fit the popularity baseline on the order log; fit() returns the fitted instance.
top_model = TopRecommender().fit(orders)

In [12]:
# LightFM with WARP loss, fitted on the transposed interaction matrix together with the
# binarised user features.
# random_state pins the initialisation, in line with the seeded ALS model.
# NOTE(review): with num_threads > 1 training may still vary slightly between runs —
# confirm against the LightFM documentation.
# NOTE(review): LightFM appears to add per-user identity features only when user_features
# is omitted; here each user is described by the feature columns alone — confirm intended.
light_model = LightFM(loss='warp', user_alpha=0.1, random_state=42)
light_model.fit(
    interactions.sparse_interaction_matrix.T,
    user_features=user_features_sparse,
    epochs=60,
    num_threads=2,
)

<lightfm.lightfm.LightFM at 0x7f0ba7b8d8b0>

In [14]:
# ALS model from `implicit`: 30 latent factors, CPU only, fixed seed.
# It is fitted on the interaction matrix as stored (the LightFM cell uses its transpose).
als_model = implicit.als.AlternatingLeastSquares(
    factors=30,
    use_gpu=False,
    random_state=42,
)
als_model.fit(interactions.sparse_interaction_matrix)

  0%|          | 0/15 [00:00<?, ?it/s]

In [15]:
def predict_light(model, user_id, h3, top_k=10):
    """Rank the chains available in an H3 cell for one user with a LightFM-style model.

    Relies on the notebook globals ``interactions``, ``h3index`` and
    ``user_features_sparse``.

    Args:
        model: object exposing ``predict(user_index, item_indices, user_features=...)``.
        user_id: raw user id; must be a key of ``interactions.user_to_index``.
        h3: H3 cell id; must be a key of ``h3index.h3_to_chains``.
        top_k: maximum number of chains to return.

    Returns:
        Up to ``top_k`` chain ids ordered by descending score (ties broken by
        descending chain index); ``[]`` when the cell has no chain known to
        ``interactions.chain_to_index``.

    Raises:
        KeyError: if ``user_id`` or ``h3`` is unknown.
    """
    user_index = interactions.user_to_index[user_id]
    # Set membership keeps the scan over chain_to_index linear in the number of chains.
    valid_chains = set(h3index.h3_to_chains[h3])
    valid_chain_index = [
        index
        for chain_id, index in interactions.chain_to_index.items()
        if chain_id in valid_chains
    ]
    if not valid_chain_index:
        # Nothing to score; do not call the model with an empty item list.
        return []
    pred = model.predict(user_index, valid_chain_index, user_features=user_features_sparse)
    ranked = sorted(zip(pred, valid_chain_index), reverse=True)[:top_k]
    return [interactions.index_to_chain[index] for _, index in ranked]

def predict_als(model, user_id, h3, thr=0.7, top_k=5, filter_already_liked_items=True):
    """Recommend chains available in an H3 cell for one user with an ALS-style model.

    Relies on the notebook globals ``interactions`` and ``h3index``.

    Args:
        model: object exposing ``recommend(user_index, user_items, N=...,
            filter_already_liked_items=..., filter_items=...)`` that yields
            ``(item_index, score)`` pairs.
        user_id: raw user id; must be a key of ``interactions.user_to_index``.
        h3: H3 cell id; must be a key of ``h3index.h3_to_chains``.
        thr: only recommendations with a score strictly above this value are kept.
        top_k: number of recommendations requested from the model.
        filter_already_liked_items: forwarded to ``model.recommend`` unchanged.

    Returns:
        Chain ids of the kept recommendations, in the order the model returned them.

    Raises:
        KeyError: if ``user_id`` or ``h3`` is unknown.
    """
    user_index = interactions.user_to_index[user_id]
    # Set membership keeps the scan over chain_to_index linear in the number of chains.
    valid_chains = set(h3index.h3_to_chains[h3])
    # Every chain that is not available in this cell is excluded from the recommendations.
    filter_items = [
        index
        for chain_id, index in interactions.chain_to_index.items()
        if chain_id not in valid_chains
    ]
    recommendations = model.recommend(
        user_index,
        interactions.sparse_interaction_matrix.T,
        N=top_k,
        filter_already_liked_items=filter_already_liked_items,
        filter_items=filter_items,
    )
    return [interactions.r_chain_index[index] for index, score in recommendations if score > thr]

def predict_top(model, user_id, h3, top_k=10):
    """Rank the chains available in an H3 cell with a non-personalised model.

    Relies on the notebook global ``h3index``.

    Args:
        model: object exposing ``predict(chain_ids)`` that returns one score per chain.
        user_id: not used; accepted so the signature matches the other ``predict_*`` helpers.
        h3: H3 cell id; must be a key of ``h3index.h3_to_chains``.
        top_k: maximum number of chains to return.

    Returns:
        Up to ``top_k`` chain ids ordered by descending score (ties broken by
        descending chain id).

    Raises:
        KeyError: if ``h3`` is unknown.
    """
    # Materialise once so that scoring and pairing walk the chains in the same order.
    valid_chains = list(h3index.h3_to_chains[h3])
    pred = model.predict(valid_chains)
    ranked = sorted(zip(pred, valid_chains), reverse=True)[:top_k]
    return [chain_id for _, chain_id in ranked]

def old_items(user_id):
    """Return the set of distinct chain ids recorded for ``user_id`` in ``interactions.interaction_df``."""
    frame = interactions.interaction_df
    user_chain_ids = frame.loc[frame['user_id'] == user_id, 'chain_id']
    return set(user_chain_ids.unique())

In [16]:
# Chain id -> chain name lookup, used to render recommendations as readable names.
# Rows without a chain_id are dropped before the cast to int. Everything is built as one
# method chain so that no column is assigned on a filtered slice (which would raise
# SettingWithCopyWarning).
# NOTE(review): pd.read_pickle can execute arbitrary code — only load trusted files.
chains = (
    pd.read_pickle("../data/raw/chains.pkl")
    .dropna(subset=["chain_id"])
    .astype({"chain_id": int})
)
chain_id_to_name = chains.set_index("chain_id")["chain_name"].to_dict()

In [28]:
# Empty results frame with the columns used for the model comparison.
res = pd.DataFrame(
    columns=[
        "user_id",
        "h3",
        "model",
        "old_items",
        "prediction",
    ]
)

In [29]:
# Side-by-side comparison of the three recommenders on a small seeded sample:
# up to 30 users and up to 10 H3 cells, both drawn from the validation frame.
TOP_K = 30       # number of recommendations requested from every model
MIN_HISTORY = 4  # keep users with strictly more distinct chains than this

# Both samples are loop-invariant, so they are drawn once.
sample_user_ids = val_df.user_id.sample(100, random_state=42).unique()[:30]
sample_h3_cells = val_df.h3.sample(100, random_state=4).unique()[:10]

records = []
for user_id in sample_user_ids:
    # The history depends on the user only, so it is computed once per user.
    user_history = old_items(user_id)
    if len(user_history) <= MIN_HISTORY:
        continue
    old_items_list = [chain_id_to_name[i] for i in user_history]

    for h3 in sample_h3_cells:
        predicted_ids = {
            "als": predict_als(
                als_model, user_id, h3, thr=0, top_k=TOP_K, filter_already_liked_items=False
            ),
            "lightfm": predict_light(light_model, user_id, h3, top_k=TOP_K),
            "top_rec": predict_top(top_model, user_id, h3, top_k=TOP_K),
        }
        predicted_names = {
            model_name: [chain_id_to_name[i] for i in chain_ids]
            for model_name, chain_ids in predicted_ids.items()
        }
        # Skip the pair when the models returned fewer than 3 * TOP_K names in total.
        if sum(len(names) for names in predicted_names.values()) < 3 * TOP_K:
            continue
        for model_name, names in predicted_names.items():
            records.append(
                {
                    "user_id": user_id,
                    "h3": h3,
                    "model": model_name,
                    "old_items": old_items_list,
                    "prediction": names,
                }
            )

# The frame is built in a single step instead of growing it row by row.
# Rows are stored newest-first (hence the reversal), so the latest pair comes first with
# its models ordered top_rec, lightfm, als.
# `res` is rebuilt from scratch, so re-running this cell does not duplicate rows.
res = pd.DataFrame(
    records[::-1],
    columns=["user_id", "h3", "model", "old_items", "prediction"],
)

In [31]:
# Number of distinct users present in the comparison table.
res.user_id.nunique()

10

In [30]:
# (rows, columns) of the comparison table.
res.shape

(300, 5)

In [33]:
# Persist the comparison table to a parquet file in the current working directory.
res.to_parquet("results_30.parquet")

In [83]:
# Reload a comparison table from disk, replacing the in-memory `res`.
# NOTE(review): this reads "results.parquet", whereas the cell above writes
# "results_30.parquet"; on a fresh top-to-bottom run `res` would come from a different
# (possibly missing) file — confirm which file is intended.
res = pd.read_parquet("results.parquet")

In [23]:
# Display settings: show every column, cap the number of displayed rows at 40.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 40)

In [32]:
# Preview the first rows of the comparison table.
res.head()

Unnamed: 0,user_id,h3,model,old_items,prediction
0,1723425,8911aa6ac47ffff,top_rec,"[PIZZASUSHIWOK, Крошка Картошка, Subway, Терем...","[Макдоналдс, KFC, Burger King, Кухня на районе..."
1,1723425,8911aa6ac47ffff,lightfm,"[PIZZASUSHIWOK, Крошка Картошка, Subway, Терем...","[Макдоналдс, KFC, Burger King, Domino’s Pizza,..."
2,1723425,8911aa6ac47ffff,als,"[PIZZASUSHIWOK, Крошка Картошка, Subway, Терем...","[PIZZASUSHIWOK, Subway, KFC, Макдоналдс, Терем..."
3,1723425,8911aa7ab0fffff,top_rec,"[PIZZASUSHIWOK, Крошка Картошка, Subway, Терем...","[Макдоналдс, KFC, Burger King, Кухня на районе..."
4,1723425,8911aa7ab0fffff,lightfm,"[PIZZASUSHIWOK, Крошка Картошка, Subway, Терем...","[Макдоналдс, KFC, Burger King, Кухня на районе..."
