In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRanker
from recpack.algorithms import ItemKNN
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem
from recpack.preprocessing.preprocessors import DataFramePreprocessor

# Relative path of the directory the CSV inputs (transactions, customers) are read from.
DATA_PATH = '../../data'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the transaction log, asking pandas to parse 't_dat' as a datetime column.
df = pd.read_csv(f'{DATA_PATH}/transactions_train.csv', parse_dates=['t_dat'])

# 'ts' is the int64 representation of the date floor-divided by 10**9 and stored
# as int32 (i.e. whole seconds, assuming nanosecond datetime resolution — TODO confirm).
df['ts'] = pd.to_datetime(df['t_dat']).astype(np.int64).floordiv(10**9).astype(np.int32)

# Keep only the columns the rest of the notebook works with.
df = df.loc[:, ['article_id', 'customer_id', 'ts', 't_dat']]

In [3]:
# Only the customer ids are needed, so read just that column instead of
# loading the whole customers table and deleting it afterwards.
CUSTOMERS = (
    pd.read_csv(f'{DATA_PATH}/customers.csv', usecols=['customer_id'])['customer_id']
    .unique()
    .tolist()
)

In [4]:
# Engineered feature 1: recency-weighted item popularity.
# Every purchase contributes 1 / weeks_ago, where weeks_ago is 1 for purchases
# made 0-6 days before the most recent transaction date, 2 for 7-13 days, etc.
last_date = df['t_dat'].max()
weeks_ago = (last_date - df['t_dat']).dt.days // 7 + 1

# Summing the per-purchase weights per article is the same quantity as summing
# (weekly purchase count / weeks_ago) over the weeks, but it needs no
# Python-level groupby.apply and never adds a temporary column to df.
popularity = (
    (1.0 / weeks_ago)
    .groupby(df['article_id'])
    .sum()
    .reset_index(name='article_popularity')
)

# get 12 most popular items
most_popular_articles = popularity.nlargest(12, 'article_popularity')['article_id'].tolist()

# The per-row helper series is as long as df; release it once it is no longer needed.
del weeks_ago

# Candidate Generation

In [5]:
# Column roles shared by the preprocessor and both filters.
id_columns = {'item_ix': 'article_id', 'user_ix': 'customer_id'}

proc = DataFramePreprocessor(timestamp_ix='ts', **id_columns)

# Filters are registered in the original order: the per-item threshold (5)
# first, then the per-user threshold (50).
for interaction_filter in (
    MinUsersPerItem(5, **id_columns),
    MinItemsPerUser(50, **id_columns),
):
    proc.add_filter(interaction_filter)

# Only the id and timestamp columns are handed to the preprocessor.
interaction_matrix = proc.process(df.loc[:, ['article_id', 'customer_id', 'ts']])

100%|██████████| 15795409/15795409 [00:09<00:00, 1707298.73it/s]
100%|██████████| 15795409/15795409 [00:08<00:00, 1795973.70it/s]


In [6]:
# Fit an item-based KNN model (cosine similarity, K=20) on the filtered
# interaction matrix; its predictions are used as candidates below.
knn = ItemKNN(K=20, similarity='cosine')
knn.fit(interaction_matrix)

2022-11-12 11:26:10,475 - base - recpack - INFO - Fitting ItemKNN complete - Took 60.7s


In [7]:
# Score the interaction matrix with the fitted model. The result is consumed
# below as a sparse matrix (its indptr / indices / data arrays are read).
predictions = knn.predict(interaction_matrix)

In [8]:
def top_n_idx_sparse(matrix, n):
    """Return the column indices of the ``n`` largest stored values in each row.

    Adapted from
    https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix

    Args:
        matrix: Sparse matrix in CSR layout, i.e. exposing ``indptr``,
            ``indices`` and ``data``. Only explicitly stored entries are
            considered.
        n: Maximum number of indices to return per row.

    Returns:
        A list with one entry per row. Each entry is a list of at most ``n``
        column indices ordered from the highest stored value to the lowest.
        Rows with fewer than ``n`` stored entries yield all of them; rows
        without stored entries (or any row when ``n <= 0``) yield ``[]``.
    """
    top_n_idx = []
    for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
        n_row_pick = min(n, ri - le)
        if n_row_pick <= 0:
            # Guard explicitly: slicing with [-0:] would return the whole row.
            top_n_idx.append([])
            continue
        row_data = matrix.data[le:ri]
        # argpartition selects the top entries in linear time but leaves them unordered.
        top_local = np.argpartition(row_data, -n_row_pick)[-n_row_pick:]
        # Rank the selected entries best-first so the result can be used as an ordered list.
        top_local = top_local[np.argsort(row_data[top_local], kind='stable')[::-1]]
        top_n_idx.append(matrix.indices[le + top_local].tolist())
    return top_n_idx

In [9]:
# get top 12 item indices for each user
# (row i of `predictions` is treated as internal user id i when mapping back below)
pred_indices = top_n_idx_sparse(predictions, 12)

In [10]:
# Default every known customer to the most-popular list. All entries share the
# same list object, so a customer's value must be replaced, never mutated in place.
prediction_dict = dict.fromkeys(CUSTOMERS, most_popular_articles)

# interaction_matrix._df holds both the internal ids (uid / iid) and the
# original ids; build lookups from internal id back to customer / article id.
id_frame = interaction_matrix._df
uid_cid_map = id_frame[["uid", "customer_id"]].drop_duplicates().set_index("uid")["customer_id"].to_dict()
iid_aid_map = id_frame[["iid", "article_id"]].drop_duplicates().set_index("iid")["article_id"].to_dict()

In [11]:
# Replace the popularity default with the KNN candidates for every user that
# is present in the interaction matrix (row position i == internal user id).
for i, row in enumerate(pred_indices):
    user_predictions = [iid_aid_map[iid] for iid in row]
    if len(user_predictions) < 12:
        # Pad with the most popular articles that are not already in the list.
        # list.extend() returns None, so build the padded list first and
        # truncate it in a separate step.
        already_predicted = set(user_predictions)
        user_predictions += [aid for aid in most_popular_articles if aid not in already_predicted]
        user_predictions = user_predictions[:12]
    prediction_dict[uid_cid_map[i]] = user_predictions

In [12]:
# Build the submission frame: one row per customer, with the predicted article
# ids joined by single spaces. Each id is prefixed with "0" (presumably to
# restore the leading zero lost when article_id is read as an integer — verify).
customer_ids = list(prediction_dict)
prediction_strings = [
    " ".join("0" + str(article) for article in articles)
    for articles in prediction_dict.values()
]
df_pred = pd.DataFrame({'customer_id': customer_ids, 'prediction': prediction_strings})

In [13]:
# Preview the first rows of the submission frame as a sanity check.
df_pred.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0751471001 0448509014 0610776002 03...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0688537005 0559630026 0559616014 0811925005 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0751471001 0448509014 0610776002 03...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0751471001 0448509014 0610776002 03...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0751471001 0448509014 0610776002 03...


In [14]:
# Write the submission without the index column.
# NOTE(review): with pandas' default compression='infer', the '.zip' suffix should
# make this a zip archive rather than a plain CSV file — confirm.
df_pred.to_csv('submission.zip', index=False)