In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem
from recpack.algorithms import ItemKNN, TARSItemKNN, Popularity
from recpack.matrix import InteractionMatrix

# import utils file from previous lecture
import sys
sys.path.append('../lecture4')
from utils import DATA_PATH, customer_hex_id_to_int

In [2]:
# Only the transaction log is loaded; the customer and article tables are not
# read by the cells below, so their loaders stay commented out for reference.
transactions_path = f'{DATA_PATH}/transactions_train.parquet'
transactions = pd.read_parquet(transactions_path)
# customers = pd.read_parquet(f'{DATA_PATH}/customers.parquet')
# articles = pd.read_parquet(f'{DATA_PATH}/articles.parquet')

In [3]:
# The test week is the one right after the last week present in the data.
last_observed_week = transactions.week.max()
test_week = last_observed_week + 1

# Keep only the most recent weeks of transactions.
# NOTE(review): the strict `>` keeps week values test_week-19 .. test_week-1,
# i.e. 19 distinct values if `week` is a consecutive integer index — confirm
# this matches the "20weeks" suffix used in the submission file names.
recent_weeks_mask = transactions.week > test_week - 20
transactions = transactions[recent_weeks_mask]

# Preprocessing + set up scenario

In [4]:
# Column names shared by the preprocessor and both filters.
ITEM_COL, USER_COL = 'article_id', 'customer_id'

proc = DataFramePreprocessor(item_ix=ITEM_COL, user_ix=USER_COL, timestamp_ix='week')

# Filters are registered in this order: first the per-item minimum,
# then the per-user minimum (both with a threshold of 20).
for min_count_filter in (MinUsersPerItem, MinItemsPerUser):
    proc.add_filter(min_count_filter(20, item_ix=ITEM_COL, user_ix=USER_COL))

interaction_matrix = proc.process(transactions)

  0%|          | 0/2056597 [00:00<?, ?it/s]

  0%|          | 0/2056597 [00:00<?, ?it/s]

# Generating baseline

### Most popular articles

In [5]:
# Fit the popularity baseline and keep the internal item ids of the first 12
# entries of its sorted (item id, score) list.
popularity = Popularity(K=12)
popularity.fit(interaction_matrix)
top_scored_items = popularity.sorted_scores_[:12]
popular_item_ids = [item_id for item_id, _score in top_scored_items]

2022-11-22 11:43:09,786 - base - recpack - INFO - Fitting Popularity complete - Took 0.217s


In [6]:
# Map the internal item ids (iid) of the popular items back to article ids.
# Build the iid -> article_id lookup once instead of running a boolean scan
# over the whole interaction frame for every popular item. Keeping the first
# row per iid matches the previous `.values[0]` selection.
iid_to_article_id = (
    interaction_matrix._df[['iid', 'article_id']]
    .drop_duplicates(subset='iid')
    .set_index('iid')['article_id']
    .to_dict()
)
popular_article_ids = [iid_to_article_id[pop_iid] for pop_iid in popular_item_ids]

# Item similarity

#### helpers

In [7]:
def top_n_idx_sparse(matrix: csr_matrix, n: int) -> list[list]:
    """Return, for each row of a CSR matrix, the column indices of its top n values.

    source: https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix

    Only explicitly stored entries of the matrix are considered.

    Args:
        matrix: sparse matrix in CSR format.
        n: maximum number of column indices to return per row.

    Returns:
        One list per row holding column indices ordered from the highest to the
        lowest stored value. A row with fewer than ``n`` stored entries yields
        all of them; a row without stored entries (or ``n <= 0``) yields ``[]``.
    """
    top_n_idx = []
    for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
        n_row_pick = min(n, ri - le)
        if n_row_pick <= 0:
            # Nothing to pick: argpartition would raise on an empty slice, and
            # a `[-0:]` slice would select the whole row instead of nothing.
            top_n_idx.append([])
            continue
        row_data = matrix.data[le:ri]
        # argpartition isolates the n largest values but leaves them unordered...
        candidates = np.argpartition(row_data, -n_row_pick)[-n_row_pick:]
        # ...so rank just those candidates by descending value.
        ranked = candidates[np.argsort(row_data[candidates])[::-1]]
        top_n_idx.append(matrix.indices[le + ranked].tolist())
    return top_n_idx

def get_top_k_similar_articles_per_user(prediction_matrix: csr_matrix, interaction_matrix: InteractionMatrix, k: int) -> pd.DataFrame:
    """Build a long-format frame of the top k predicted articles for every user.

    Row ``i`` of ``prediction_matrix`` is treated as the user with internal id
    ``i``; internal user/item ids are translated back to the original
    ``customer_id`` / ``article_id`` values via ``interaction_matrix._df``.

    Returns:
        DataFrame with columns ``customer_id`` and ``article_id``, one row per
        (user, recommended article) pair.
    """
    # Lookup tables from recpack's internal ids to the original ids.
    id_frame = interaction_matrix._df
    customer_by_uid = id_frame[["uid", "customer_id"]].drop_duplicates().set_index("uid").to_dict()["customer_id"]
    article_by_iid = id_frame[["iid", "article_id"]].drop_duplicates().set_index("iid").to_dict()["article_id"]

    customer_column = []
    article_column = []

    # One entry per matrix row: the column indices of that row's top k scores.
    for uid, item_indices in enumerate(top_n_idx_sparse(prediction_matrix, k)):
        recommended_articles = [article_by_iid[iid] for iid in item_indices]
        # Repeat the customer id once per recommended article to keep both columns aligned.
        customer_column += [customer_by_uid[uid]] * len(recommended_articles)
        article_column += recommended_articles

    assert len(customer_column) == len(article_column), "lengths of lists should be equal"
    return pd.DataFrame({"customer_id": customer_column, "article_id": article_column})

### ItemKNN

In [8]:
# ItemKNN configured with cosine similarity.
# NOTE(review): K=80 is presumably the number of neighbours kept per item, and
# normalize_X / normalize_sim presumably toggle input / similarity normalisation
# — confirm against the recpack documentation.
knn = ItemKNN(K=80, normalize_X=False, normalize_sim=True, similarity='cosine')
knn.fit(interaction_matrix)
# Predict from the same interaction matrix the model was fitted on.
prediction_matrix_knn = knn.predict(interaction_matrix)

2022-11-22 11:43:12,323 - base - recpack - INFO - Fitting ItemKNN complete - Took 2.45s


In [9]:
# Long-format frame (customer_id, article_id) with up to 12 ItemKNN recommendations per user.
similarity_recommendations_knn = get_top_k_similar_articles_per_user(prediction_matrix_knn, interaction_matrix, k=12)

### TARSItemKNN

In [10]:
# TARSItemKNN configured with cosine similarity.
# NOTE(review): fit_decay / predict_decay presumably control how strongly older
# interactions are down-weighted at fit / predict time, and K=720 the
# neighbourhood size — confirm against the recpack documentation.
tknn = TARSItemKNN(K=720, fit_decay=0.05, predict_decay=1/3, similarity='cosine')
tknn.fit(interaction_matrix)
# Predict from the same interaction matrix the model was fitted on.
prediction_matrix_tknn = tknn.predict(interaction_matrix)

2022-11-22 11:43:41,946 - base - recpack - INFO - Fitting TARSItemKNN complete - Took 21.5s


In [11]:
# Long-format frame (customer_id, article_id) with up to 12 TARSItemKNN recommendations per user.
similarity_recommendations_tknn = get_top_k_similar_articles_per_user(prediction_matrix_tknn, interaction_matrix, k=12)

# Calculate predictions

In [12]:
def _articles_by_customer(recommendations: pd.DataFrame) -> dict:
    """Collapse a (customer_id, article_id) frame into {customer_id: [article_id, ...]}."""
    grouped_articles = recommendations.groupby('customer_id')['article_id']
    return grouped_articles.apply(list).to_dict()


c_id2predicted_article_ids_knn = _articles_by_customer(similarity_recommendations_knn)
c_id2predicted_article_ids_tknn = _articles_by_customer(similarity_recommendations_tknn)

# Create submission

### Popular items only

In [13]:
# Start from the sample submission; its customer_id column drives the predictions below.
sample_submission_path = f'{DATA_PATH}/sample_submission.csv'
sub = pd.read_csv(sample_submission_path)

In [14]:
# Every customer in the submission receives the same list: the first 12 popular articles.
preds = [
    popular_article_ids[:12]
    for _c_id in customer_hex_id_to_int(sub.customer_id)
]

In [15]:
# Format each prediction list as space-separated ids, each prefixed with '0'.
# The strings get their own name so `preds` keeps holding lists of ids:
# re-running this cell then yields the same result instead of re-joining the
# characters of already formatted strings.
prediction_strings = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
# Bracket assignment always writes the column; attribute assignment only does
# so when the column already exists.
sub['prediction'] = prediction_strings

In [16]:
sub_name = 'submission_Popularity_baseline_20weeks'
# Write the submission without the index column.
submission_path = f'{DATA_PATH}/subs/{sub_name}.csv.gz'
sub.to_csv(submission_path, index=False)

### ItemKNN

In [17]:
# Reload the sample submission so this section starts from an unmodified frame.
sample_submission_path = f'{DATA_PATH}/sample_submission.csv'
sub = pd.read_csv(sample_submission_path)

In [18]:
# ItemKNN recommendations per customer, padded with the popular articles for
# customers that have fewer than 12 (or no) ItemKNN recommendations.
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids_knn.get(c_id, [])
    # A popular article may already be among the personalised recommendations;
    # dict.fromkeys drops such repeats while keeping first-occurrence order, so
    # none of the 12 slots is spent on a duplicate.
    pred = list(dict.fromkeys(pred + popular_article_ids))
    preds.append(pred[:12])

In [19]:
# Format each prediction list as space-separated ids, each prefixed with '0'.
# The strings get their own name so `preds` keeps holding lists of ids:
# re-running this cell then yields the same result instead of re-joining the
# characters of already formatted strings.
prediction_strings = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
# Bracket assignment always writes the column; attribute assignment only does
# so when the column already exists.
sub['prediction'] = prediction_strings

In [20]:
sub_name = 'submission_ItemKNN_baseline_20weeks'
# Write the submission without the index column.
submission_path = f'{DATA_PATH}/subs/{sub_name}.csv.gz'
sub.to_csv(submission_path, index=False)

### TARSItemKNN

In [21]:
# Reload the sample submission so this section starts from an unmodified frame.
sample_submission_path = f'{DATA_PATH}/sample_submission.csv'
sub = pd.read_csv(sample_submission_path)

In [22]:
# TARSItemKNN recommendations per customer, padded with the popular articles
# for customers that have fewer than 12 (or no) TARSItemKNN recommendations.
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids_tknn.get(c_id, [])
    # A popular article may already be among the personalised recommendations;
    # dict.fromkeys drops such repeats while keeping first-occurrence order, so
    # none of the 12 slots is spent on a duplicate.
    pred = list(dict.fromkeys(pred + popular_article_ids))
    preds.append(pred[:12])

In [23]:
# Format each prediction list as space-separated ids, each prefixed with '0'.
# The strings get their own name so `preds` keeps holding lists of ids:
# re-running this cell then yields the same result instead of re-joining the
# characters of already formatted strings.
prediction_strings = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
# Bracket assignment always writes the column; attribute assignment only does
# so when the column already exists.
sub['prediction'] = prediction_strings

In [24]:
sub_name = 'submission_TARSItemKNN_baseline_20weeks'
# Write the submission without the index column.
submission_path = f'{DATA_PATH}/subs/{sub_name}.csv.gz'
sub.to_csv(submission_path, index=False)