In [1]:
import pandas as pd
import utils.utils as utls
import numpy as np
import implicit
from scipy.sparse import csr_matrix, coo_matrix
from baseline import recommend_popular
from implicit.evaluation import mean_average_precision_at_k

In [2]:
dfu = pd.read_csv('../../data/processed_data/customers.csv')
dfi = pd.read_csv('../../data/processed_data/articles.csv', dtype={'article_id': str})
df = pd.read_csv('../../data/processed_data/transactions.csv', dtype={'article_id': str}, parse_dates=['t_dat'])

In [3]:
df = df[df['t_dat'] > '2020-08-21']
df.shape

(1190911, 8)

In [4]:
df['t_dat'].max()

Timestamp('2020-09-22 00:00:00')

In [5]:
ALL_USERS = dfu['customer_id'].unique().tolist()
ALL_ITEMS = dfi['article_id'].unique().tolist()

user_ids = dict(list(enumerate(ALL_USERS)))
item_ids = dict(list(enumerate(ALL_ITEMS)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

df['user_id'] = df['customer_id'].map(user_map)
df['item_id'] = df['article_id'].map(item_map)

del dfu, dfi

In [6]:
row = df['user_id'].values
col = df['item_id'].values
data = np.ones(df.shape[0])
coo_train = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
coo_train
    

<1362281x104547 sparse matrix of type '<class 'numpy.float64'>'
	with 1190911 stored elements in COOrdinate format>

In [14]:
def to_user_item_coo(df):
    """ Turn a dataframe with transactions into a COO sparse items x users matrix"""
    row = df['user_id'].values
    col = df['item_id'].values
    data = np.ones(df.shape[0])
    coo = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
    return coo


def split_data(df, validation_days=7):
    """ Split a pandas dataframe into training and validation data, using <<validation_days>>
    """
    validation_cut = df['t_dat'].max() - pd.Timedelta(validation_days)

    df_train = df[df['t_dat'] < validation_cut]
    df_val = df[df['t_dat'] >= validation_cut]
    return df_train, df_val

def get_val_matrices(df, validation_days=7):
    """ Split into training and validation and create various matrices
        
        Returns a dictionary with the following keys:
            coo_train: training data in COO sparse format and as (users x items)
            csr_train: training data in CSR sparse format and as (users x items)
            csr_val:  validation data in CSR sparse format and as (users x items)
    
    """
    df_train, df_val = split_data(df, validation_days=validation_days)
    coo_train = to_user_item_coo(df_train)
    coo_val = to_user_item_coo(df_val)

    csr_train = coo_train.tocsr()
    csr_val = coo_val.tocsr()
    
    return {'coo_train': coo_train,
            'csr_train': csr_train,
            'csr_val': csr_val
          }


def validate(matrices, factors=200, iterations=20, regularization=0.01, show_progress=True):
    """ Train an ALS model with <<factors>> (embeddings dimension) 
    for <<iterations>> over matrices and validate with MAP@12
    """
    coo_train, csr_train, csr_val = matrices['coo_train'], matrices['csr_train'], matrices['csr_val']
    
    model = implicit.als.AlternatingLeastSquares(factors=factors, 
                                                 iterations=iterations, 
                                                 regularization=regularization, 
                                                 random_state=42)
    model.fit(coo_train, show_progress=show_progress)
    
    # The MAPK by implicit doesn't allow to calculate allowing repeated items, which is the case.
    # TODO: change MAP@12 to a library that allows repeated items in prediction
    map12 = mean_average_precision_at_k(model, csr_train, csr_val, K=12, show_progress=show_progress, num_threads=4)
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> MAP@12: {map12:6.5f}")
    return map12


In [12]:
matrices = get_val_matrices(df)

In [15]:
%%time
best_map12 = 0
for factors in [40, 50, 60, 100, 200, 500, 1000]:
    for iterations in [3, 12, 14, 15, 20]:
        for regularization in [0.01]:
            map12 = validate(matrices, factors, iterations, regularization, show_progress=False)
            if map12 > best_map12:
                best_map12 = map12
                best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
                print(f"Best MAP@12 found. Updating: {best_params}")



Factors:  40 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00470
Best MAP@12 found. Updating: {'factors': 40, 'iterations': 3, 'regularization': 0.01}




Factors:  40 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00545
Best MAP@12 found. Updating: {'factors': 40, 'iterations': 12, 'regularization': 0.01}




Factors:  40 - Iterations: 14 - Regularization: 0.010 ==> MAP@12: 0.00542




Factors:  40 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00543




Factors:  40 - Iterations: 20 - Regularization: 0.010 ==> MAP@12: 0.00542




Factors:  50 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00499




Factors:  50 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00570
Best MAP@12 found. Updating: {'factors': 50, 'iterations': 12, 'regularization': 0.01}




Factors:  50 - Iterations: 14 - Regularization: 0.010 ==> MAP@12: 0.00573
Best MAP@12 found. Updating: {'factors': 50, 'iterations': 14, 'regularization': 0.01}




Factors:  50 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00573




Factors:  50 - Iterations: 20 - Regularization: 0.010 ==> MAP@12: 0.00554




Factors:  60 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00478




Factors:  60 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00558




Factors:  60 - Iterations: 14 - Regularization: 0.010 ==> MAP@12: 0.00564




Factors:  60 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00573
Best MAP@12 found. Updating: {'factors': 60, 'iterations': 15, 'regularization': 0.01}




Factors:  60 - Iterations: 20 - Regularization: 0.010 ==> MAP@12: 0.00582
Best MAP@12 found. Updating: {'factors': 60, 'iterations': 20, 'regularization': 0.01}




Factors: 100 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00562




Factors: 100 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00622
Best MAP@12 found. Updating: {'factors': 100, 'iterations': 12, 'regularization': 0.01}




Factors: 100 - Iterations: 14 - Regularization: 0.010 ==> MAP@12: 0.00633
Best MAP@12 found. Updating: {'factors': 100, 'iterations': 14, 'regularization': 0.01}




Factors: 100 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00632




Factors: 100 - Iterations: 20 - Regularization: 0.010 ==> MAP@12: 0.00622




Factors: 200 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00671
Best MAP@12 found. Updating: {'factors': 200, 'iterations': 3, 'regularization': 0.01}




Factors: 200 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00663




Factors: 200 - Iterations: 14 - Regularization: 0.010 ==> MAP@12: 0.00663




Factors: 200 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00659




Factors: 200 - Iterations: 20 - Regularization: 0.010 ==> MAP@12: 0.00649




Factors: 500 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00689
Best MAP@12 found. Updating: {'factors': 500, 'iterations': 3, 'regularization': 0.01}




Factors: 500 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00586




Factors: 500 - Iterations: 14 - Regularization: 0.010 ==> MAP@12: 0.00586




Factors: 500 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00576




Factors: 500 - Iterations: 20 - Regularization: 0.010 ==> MAP@12: 0.00572




Factors: 1000 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00688




Factors: 1000 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00612




Factors: 1000 - Iterations: 14 - Regularization: 0.010 ==> MAP@12: 0.00608




Factors: 1000 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00610




Factors: 1000 - Iterations: 20 - Regularization: 0.010 ==> MAP@12: 0.00607
CPU times: user 17h 58min 41s, sys: 16min 24s, total: 18h 15min 5s
Wall time: 2h 27min 59s


In [16]:
del matrices

In [8]:
coo_train = to_user_item_coo(df)
csr_train = coo_train.tocsr()

In [11]:
def train(coo_train, factors=200, iterations=15, regularization=0.01, show_progress=True):
    model = implicit.als.AlternatingLeastSquares(factors=factors, 
                                                 iterations=iterations, 
                                                 regularization=regularization, 
                                                 random_state=42)
    model.fit(coo_train, show_progress=show_progress)
    return model

In [9]:
best_params = {'factors': 500, 'iterations': 3, 'regularization': 0.01}

In [12]:
model = train(coo_train, **best_params)



  0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
def submit(model, csr_train, submission_name="../../data/submissions/als_submission.csv"):
    preds = []
    batch_size = 2000
    to_generate = np.arange(len(ALL_USERS))
    for startidx in range(0, len(to_generate), batch_size):
        batch = to_generate[startidx : startidx + batch_size]
        ids, _ = model.recommend(batch, csr_train[batch], N=12, filter_already_liked_items=False)
        for i, userid in enumerate(batch):
            customer_id = user_ids[userid]
            user_items = ids[i]
            article_ids = [item_ids[item_id] for item_id in user_items]
            preds.append(customer_id, ' '.join(article_ids))

    df_preds = pd.DataFrame(preds, columns=['customer_id', 'prediction'])
    df_preds.to_csv(submission_name, index=False)
    
    display(df_preds.head())
    print(df_preds.shape)
    
    return df_preds

In [17]:
%%time
df_preds = submit(model, csr_train);

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601043 858856005 779781015 869331002 748355...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,112679048 111609001 111593001 111586001 111565...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,794321007 794321011 886737001 866610001 765743...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,112679048 111609001 111593001 111586001 111565...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,112679048 111609001 111593001 111586001 111565...


(1362281, 2)
CPU times: user 2h 32min, sys: 9min 48s, total: 2h 41min 49s
Wall time: 21min 13s


In [19]:
sub = pd.read_csv('../../data/sample_submission.csv')

In [21]:
sub = sub[['customer_id']].merge(df_preds, on='customer_id', how='left')

In [23]:
sub = sub.fillna("112679048 111609001 111593001 111586001 111565003 111565001 110065011 110065002 110065001 108775051 108775044 108775015")

In [30]:
sub.to_csv("../../data/submissions/als_submission.csv", index=False)

In [29]:
sub["prediction"] = sub["prediction"].apply(lambda x: ' '.join(map(lambda y: "0" + y, x.split(sep=" "))))