In [1]:
import os

# Cap every BLAS / OpenMP style thread pool at a single thread. These variables
# are set here, before the numeric libraries are imported in the next cell.
for _thread_var in (
    'OPENBLAS_NUM_THREADS',
    'OMP_NUM_THREADS',
    'MKL_NUM_THREADS',
    'VECLIB_MAXIMUM_THREADS',
    'NUMEXPR_NUM_THREADS',
):
    os.environ[_thread_var] = '1'

In [2]:
import pandas as pd
import numpy as np
import implicit
import matplotlib.pyplot as plt


from implicit.nearest_neighbours import bm25_weight
from scipy.sparse import csr_matrix
from implicit.evaluation import train_test_split, ndcg_at_k, precision_at_k

%matplotlib inline

In [3]:
# Load the raw orders table from the CSV next to the notebook.
df = pd.read_csv('data_made_restaurants.csv')

In [4]:
# Remove the leftover CSV index column. Rebinding (instead of inplace=True) with
# errors='ignore' keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Discard rows whose customer_id is -10 and renumber the remaining rows from 0.
# NOTE(review): -10 looks like a placeholder for an unknown customer — confirm.
df = df[df['customer_id'] != -10].reset_index(drop=True)

In [5]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,customer_id,order_id,user_latitude,user_longitude,date,city_id,chain_id,vendor_id,target,total_value,...,vendor_latitude,vendor_longitude,online_payment,accepting_cash,min_delivery_value,takeaway_support,citymobil_support,default_product_group_id,product_group_ids,cuisine_ids
0,15955880,207845807,55.7815,37.5307,2020-08-01,1,140718,343852,1,575,...,55.778137,37.6024,1,0,1000,1,0,30.0,[30],
1,62512097,207855295,55.6472,37.4682,2020-08-01,1,140718,343852,1,1360,...,55.778137,37.6024,1,0,1000,1,0,30.0,[30],
2,64977556,207871966,55.8649,37.5014,2020-08-01,1,140718,343852,1,560,...,55.778137,37.6024,1,0,1000,1,0,30.0,[30],
3,81281415,207960541,55.8711,37.5105,2020-08-01,1,140718,343852,1,1130,...,55.778137,37.6024,1,0,1000,1,0,30.0,[30],
4,72045218,208028305,55.8166,37.5899,2020-08-01,1,140718,343852,1,745,...,55.778137,37.6024,1,0,1000,1,0,30.0,[30],


In [6]:
# Parse the order dates once and derive the calendar features from them.
order_dates = pd.to_datetime(df['date'])
df['date'] = order_dates

# Whole weeks elapsed since the earliest order in the data (0-based).
days_since_start = (order_dates - order_dates.min()).dt.days
df['week'] = days_since_start // 7

# pandas convention: Monday == 0 ... Sunday == 6.
df['day_of_week'] = order_dates.dt.dayofweek

In [7]:
def split_by_folds(df, *, first_train_week=4, n_folds=3, train_weeks=4):
    """Partition the rows of ``df`` into time-based folds using its ``week`` column.

    The layout is a sliding window. With the defaults it is identical to the
    previously hard-coded split:

    * ``previous`` - every row before ``first_train_week`` (weeks 0-3);
    * ``train_i``  - ``train_weeks`` consecutive weeks starting at
      ``first_train_week + i - 1`` (weeks 4-7, 5-8, 6-9);
    * ``val_i``    - the single week right after ``train_i`` (weeks 8, 9, 10),
      restricted to customers that do NOT appear in ``train_i``;
    * ``test``     - every row from the week after the last validation week
      onwards (week 11+).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``week`` and ``customer_id``.
    first_train_week : int, keyword-only
        First week of the first training window.
    n_folds : int, keyword-only
        Number of train/validation pairs to produce.
    train_weeks : int, keyword-only
        Length of every training window, in weeks.

    Returns
    -------
    dict[str, list]
        Maps fold name to the list of ``df`` index labels in that fold, in the
        key order ``previous``, ``test``, ``train_1``, ``val_1``, ...
    """
    week = df['week']
    # First week that is not used by any train/validation window.
    test_start_week = first_train_week + n_folds + train_weeks

    folds = {
        'previous': df[week <= first_train_week - 1].index.tolist(),
        'test': df[week >= test_start_week].index.tolist(),
    }

    for fold_num in range(1, n_folds + 1):
        window_start = first_train_week + fold_num - 1
        val_week = window_start + train_weeks

        fold_train = df[(week >= window_start) & (week < val_week)]
        folds[f'train_{fold_num}'] = fold_train.index.tolist()

        # Validation keeps only customers unseen in this fold's training window.
        # The original author marked this exclusion as an open question.
        unseen_customer = ~df['customer_id'].isin(fold_train['customer_id'])
        fold_val = df[(week == val_week) & unseen_customer]
        folds[f'val_{fold_num}'] = fold_val.index.tolist()

    return folds

In [8]:
# Build the fold-name -> row-index mapping for the whole frame.
folds = split_by_folds(df)

In [22]:
# Settings read by create_item_user_matrix and compute_als_decomposition.
params = dict(
    n_components=100,                 # first argument of AlternatingLeastSquares (factor count)
    bm25_params=dict(K1=100, B=0.8),  # unpacked as keyword arguments into bm25_weight
    seed=42,                          # passed to np.random.seed before the ALS fit
    scale=100,                        # multiplier applied to the BM25-weighted matrix
    num_threads=2,                    # ALS `num_threads`
    num_iter=10,                      # ALS `iterations`
    regularization=1,                 # ALS `regularization`
)

def create_item_user_matrix(df, params):
    """Build a BM25-weighted, scaled item x user sparse matrix from interactions.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``customer_id``, ``chain_id`` and ``target``. Repeated
        (customer, chain) pairs are summed by the sparse-matrix constructor.
    params : dict
        ``params['bm25_params']`` is forwarded as keyword arguments to
        ``bm25_weight``; ``params['scale']`` multiplies the weighted matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        Rows are chains and columns are customers, both numbered by the rank of
        their id among the sorted unique ids found in ``df``.
    """
    # np.unique returns the sorted unique ids and, for each row, the position
    # of its id in that sorted array. This is the same dense numbering the old
    # dict lookup produced, but without the int32 cast that silently wrapped
    # ids above 2**31 - 1, and without copying or mutating the frame.
    customers, customer_idx = np.unique(df['customer_id'].to_numpy(), return_inverse=True)
    chains, chain_idx = np.unique(df['chain_id'].to_numpy(), return_inverse=True)

    user_item = csr_matrix(
        (df['target'].to_numpy(), (customer_idx, chain_idx)),
        shape=(len(customers), len(chains)),
    )

    # Weight customer x chain, then transpose so items become the rows.
    weighted = bm25_weight(user_item, **params['bm25_params'])
    return weighted.T.tocsr() * params['scale']

def compute_als_decomposition(matrix, params):
    """Fit an implicit ALS model on ``matrix`` and return the fitted model.

    Parameters
    ----------
    matrix : scipy sparse matrix
        Passed unchanged to ``AlternatingLeastSquares.fit``.
    params : dict
        Reads ``seed``, ``n_components``, ``num_threads``, ``num_iter`` and
        ``regularization``.

    Returns
    -------
    The fitted ``implicit.als.AlternatingLeastSquares`` instance.
    """
    # Seed NumPy's global RNG before the model is created.
    # NOTE(review): confirm the installed implicit version draws its initial
    # factors from the global RNG; otherwise this seed does not fix the result.
    np.random.seed(params['seed'])

    model = implicit.als.AlternatingLeastSquares(
        factors=params['n_components'],
        regularization=params['regularization'],
        iterations=params['num_iter'],
        num_threads=params['num_threads'],
    )
    model.fit(matrix)
    return model

In [23]:
# Rows of the 'previous' fold, collapsed to one row per (customer, chain) pair
# that carries the maximum target seen for that pair.
prev_orders = df.loc[folds['previous']]
prev_df = (
    prev_orders
    .groupby(['customer_id', 'chain_id'])['target']
    .max()
    .reset_index()
)
prev_matrix = create_item_user_matrix(prev_df, params)

# Random split of the stored interactions with 0.7 as the train share.
# NOTE(review): no seed is supplied here, so the split may differ between
# runs — confirm whether that is intended.
train_m, val_m = train_test_split(prev_matrix, 0.7)

In [16]:
# Fit the ALS model on the training part of the split.
implicit_als = compute_als_decomposition(train_m, params)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [28]:
# Precision@15 on the held-out interactions. Both matrices are transposed
# because create_item_user_matrix returns them item x user.
precision_at_k(implicit_als, train_m.T, val_m.T, K=15)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=583036.0), HTML(value='')))




0.08738984933288518

In [26]:
# NDCG on the same held-out interactions. No K is passed, so the evaluator's
# default cutoff is used, unlike the precision cell, which uses K=15.
# NOTE(review): confirm the differing cutoffs are intended.
ndcg_at_k(implicit_als, train_m.T, val_m.T)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=583036.0), HTML(value='')))




0.04810541178749741