# Applying the full pipeline

In [1]:
import pickle
from top_recommender import TopRecommender
from h3_index import H3Index
import pandas as pd
import numpy as np

In [2]:
# Load the artifacts that were pickled earlier: per-city model, interactions
# object and sparse user features, plus the shared top recommender.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load files that this project produced itself.
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as f:
        return pickle.load(f)


# Artifacts stored under the *_moscow file names.
lightfm_moscow = _load_pickle('lightfm_moscow.pkl')
interactions_moscow = _load_pickle('interactions_moscow.pkl')
user_features_sparse_moscow = _load_pickle('user_features_sparse_moscow.pkl')

# Artifacts stored under the *_piter file names.
lightfm_piter = _load_pickle('lightfm_piter.pkl')
interactions_piter = _load_pickle('interactions_piter.pkl')
user_features_sparse_piter = _load_pickle('user_features_sparse_piter.pkl')

# Artifacts stored under the *_other file names.
lightfm_other = _load_pickle('lightfm_other.pkl')
interactions_other = _load_pickle('interactions_other.pkl')
user_features_sparse_other = _load_pickle('user_features_sparse_other.pkl')

# Recommender used by predict_in_city for users the city model does not know.
top_rec = _load_pickle('top_rec.pkl')

In [3]:
# Build the H3 lookup from the pickled mapping. Later cells read
# `h3index.valid` (to filter validation rows) and `h3index.h3_to_chains`
# (to get the chains for a given H3 cell) from this object.
h3index = H3Index('../data/raw/h3_to_chains.pkl')

In [4]:
# Read the validation interactions and align the user column name with the
# one used by the rest of the notebook.
val_df = (
    pd.read_pickle('../data/raw/test_VALID.pkl')
    .rename(columns={"customer_id": "user_id"})
)
val_df["user_id"] = val_df["user_id"].astype(int)
print("Initial validation dataset size:", len(val_df))

# Keep only the rows whose H3 cell is listed in `h3index.valid`.
has_known_h3 = val_df["h3"].isin(h3index.valid)
val_df = val_df[has_known_h3]
print("Filter h3 indices that not in h3_to_chain dict", len(val_df))

# Collapse to one row per (user_id, h3, city_id) holding the set of chain ids.
val_df = pd.pivot_table(
    val_df,
    values=['chain_id'],
    index=['user_id', 'h3', 'city_id'],
    aggfunc={'chain_id': set},
).reset_index()
val_df.head()

Initial validation dataset size: 2300001
Filter h3 indices that not in h3_to_chain dict 2293762


Unnamed: 0,user_id,h3,city_id,chain_id
0,0,890b0638003ffff,49,{34646}
1,0,890b0638007ffff,49,{34646}
2,0,890b063800fffff,49,{34646}
3,0,890b0638023ffff,49,{34646}
4,0,890b0638027ffff,49,{34646}


In [5]:
def predict_in_city(lightfm, top_rec, user_id, h3, interactions, user_features_sparse,
                    top_k=10, h3_index=None):
    """Recommend up to `top_k` chains for `user_id` located in H3 cell `h3`.

    Users present in `interactions.user_to_index` are scored with `lightfm`
    over the chains that are both listed for the cell and present in
    `interactions.chain_to_index`. All other users, and known users for whom
    no such chain exists, are ranked with `top_rec` over every chain of the
    cell.

    Parameters
    ----------
    lightfm : object exposing `predict(user_index, item_indices, user_features=...)`.
    top_rec : object exposing `predict(chains)`, returning one score per chain.
    user_id : id looked up in `interactions.user_to_index`.
    h3 : key into `h3_to_chains`.
    interactions : object exposing `user_to_index`, `chain_to_index` and
        `index_to_chain` mappings.
    user_features_sparse : forwarded unchanged to `lightfm.predict`.
    top_k : maximum number of chains to return.
    h3_index : optional object exposing `h3_to_chains`; when omitted the
        notebook-level `h3index` is used, as before.

    Returns
    -------
    list
        Chain ids ordered by descending score (ties broken by the larger
        index / chain id). May contain fewer than `top_k` items.

    Any lookup error for an unknown `h3` propagates to the caller.
    """
    cells = h3index if h3_index is None else h3_index
    valid_chains = cells.h3_to_chains[h3]

    if user_id in interactions.user_to_index:
        chain_to_index = interactions.chain_to_index
        # Look up only the chains of this cell instead of scanning the whole
        # chain_to_index mapping; the ranking below does not depend on the
        # order in which the indices are collected.
        valid_chain_index = [
            chain_to_index[chain] for chain in set(valid_chains) if chain in chain_to_index
        ]
        # An empty item list is not a meaningful model query, so such users
        # fall through to the top_rec ranking below.
        if valid_chain_index:
            user_index = interactions.user_to_index[user_id]
            pred = lightfm.predict(user_index, valid_chain_index,
                                   user_features=user_features_sparse)
            ranked = sorted(zip(pred, valid_chain_index), reverse=True)[:top_k]
            return [interactions.index_to_chain[index] for _, index in ranked]

    pred = top_rec.predict(valid_chains)
    ranked = sorted(zip(pred, valid_chains), reverse=True)[:top_k]
    return [chain for _, chain in ranked]

def old_items_in_city(user_id, interactions):
    """Return the set of distinct chain ids recorded for `user_id`.

    Rows are taken from `interactions.interaction_df`, matched on its
    `user_id` column; an unknown user yields an empty set.
    """
    history = interactions.interaction_df
    user_chains = history.loc[history['user_id'] == user_id, 'chain_id']
    return set(user_chains.unique())

In [6]:
def predict(user_id, h3, city_id, top_k=10):
    """Recommend up to `top_k` chains using the artifacts of the user's city.

    `city_id` 1 selects the *_moscow artifacts, 2 selects the *_piter
    artifacts, and any other value selects the *_other artifacts. The actual
    ranking is delegated to `predict_in_city`.
    """
    if city_id == 1:
        city_artifacts = (lightfm_moscow, interactions_moscow, user_features_sparse_moscow)
    elif city_id == 2:
        city_artifacts = (lightfm_piter, interactions_piter, user_features_sparse_piter)
    else:
        city_artifacts = (lightfm_other, interactions_other, user_features_sparse_other)

    model, city_interactions, city_user_features = city_artifacts
    return predict_in_city(
        model,
        top_rec,
        user_id,
        h3,
        city_interactions,
        city_user_features,
        top_k=top_k,
    )
        
def old_items(user_id, city_id):
    """Return the chains already recorded for `user_id` in the city's interactions.

    `city_id` 1 uses `interactions_moscow`, 2 uses `interactions_piter`, and
    any other value uses `interactions_other`.
    """
    if city_id == 1:
        city_interactions = interactions_moscow
    elif city_id == 2:
        city_interactions = interactions_piter
    else:
        city_interactions = interactions_other
    return old_items_in_city(user_id, city_interactions)

In [7]:
def metric(y_true, y_pred, y_old, at1=10, at2=30, average=True):
    """Score recommendations with a blend of "new item" and plain precision.

    For every (truth, prediction, already-seen) triple the total score is
    new_prec@at1 + new_prec@at2 + 0.5 * (prec@at1 + prec@at2), where the
    "new" variants ignore predicted items that appear in the already-seen
    collection.

    Returns a 3-tuple (total, new, all). Each element is the mean over all
    rows when `average` is true, otherwise the list of per-row values.
    """
    def _precisions(truth, predicted, seen, k):
        # NOTE(review): the ground truth is cut to its first k items as well;
        # when it comes from an unordered set this keeps an arbitrary subset.
        # Confirm this matches the intended metric definition.
        truth_k = set(truth[:k])
        predicted_k = set(predicted[:k])
        plain = len(truth_k & predicted_k) / k
        novel = len((predicted_k - set(seen)) & truth_k) / k
        return plain, novel

    totals, new_only, plain_only = [], [], []
    for truth, predicted, seen in zip(y_true, y_pred, y_old):
        truth = list(truth)
        predicted = list(predicted)
        # Anything that is not a set or list (e.g. a missing value) counts as no history.
        if not isinstance(seen, (set, list)):
            seen = []

        prec1, new_prec1 = _precisions(truth, predicted, seen, at1)
        prec2, new_prec2 = _precisions(truth, predicted, seen, at2)

        totals.append(new_prec1 + new_prec2 + 0.5 * (prec1 + prec2))
        new_only.append(new_prec1 + new_prec2)
        plain_only.append(prec1 + prec2)

    per_row = (totals, new_only, plain_only)
    if average:
        return tuple(np.mean(values) for values in per_row)
    return per_row

In [8]:
def compute_score(val_df, frac=0.001, top_k=30):
    """Print the (total, new, all) metric for a sample of `val_df`.

    A fraction `frac` of the rows is drawn with a fixed random state of 42.
    For each sampled row `predict` supplies up to `top_k` recommended chains
    and `old_items` supplies the user's recorded chains; both are compared
    with the row's `chain_id` value via `metric`. Nothing is returned.
    """
    def _recommend(row):
        return predict(row.user_id, row.h3, row.city_id, top_k)

    def _history(row):
        return old_items(row.user_id, row.city_id)

    sample = val_df.sample(frac=frac, random_state=42)
    sample['pred_chains'] = sample.apply(_recommend, axis=1)
    sample['old_chains'] = sample.apply(_history, axis=1)

    scores = metric(sample['chain_id'], sample['pred_chains'], sample['old_chains'])
    print('total, new, all = ', scores)
    print()

In [10]:
%%time
# Score every row of the validation frame (frac=1. samples all rows).
# The recorded run of this cell took about 17 minutes of wall time.
compute_score(val_df, frac=1.)

total, new, all =  (0.1321230585029217, 0.08749163535664047, 0.08926284629256238)

CPU times: user 17min 5s, sys: 1.06 s, total: 17min 6s
Wall time: 17min 6s
