# Baseline Submission
This notebook generates three simple baselines based on the last 20 weeks of data and makes submission files for them to upload to Kaggle.

## Summary
There is no candidate generation or ranking; we simply generate items and use them directly in the submission.

The baselines used are:
- **`Popularity`**
  - Simple popularity based on purchase count
  - All customers get the same 12 most popular items recommended
- **`ItemKNN`**
  - trained with parameters found in `comparison-KNN.ipynb`
  - filled with `Popularity` if there's not enough data for a customer
- **`TARSItemKNN`**
  - trained with parameters found in `comparison-KNN.ipynb`
  - filled with `Popularity` if there's not enough data for a customer

Results:

  | Baseline      | Private Score | Public Score  |
  |:-------------:|:-------------:|:-------------:|
  | `Popularity`  | 0.00322       | 0.00308       |
  | `ItemKNN`     | 0.00360       | 0.00348       |
  | `TARSItemKNN` | 0.00453       | 0.00413       |


---

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem
from recpack.algorithms import ItemKNN, TARSItemKNN, Popularity
from recpack.matrix import InteractionMatrix

from helpers.utils import DATA_PATH, customer_hex_id_to_int
from helpers.evaluation import apk
from candidates import top_n_idx_sparse, get_top_k_similar_articles_per_user

In [2]:
# Load the training transactions from the parquet file stored under DATA_PATH.
transactions_path = f'{DATA_PATH}/transactions_train.parquet'
transactions = pd.read_parquet(transactions_path)

In [3]:
# The week to predict is the one right after the last week present in the data.
test_week = transactions['week'].max() + 1

# Keep only the most recent transactions.
# NOTE(review): the strict `>` keeps weeks test_week-19 .. test_week-1, i.e. 19
# distinct values if `week` is a consecutive integer index, while the notebook
# text says "last 20 weeks" - confirm which one is intended.
oldest_excluded_week = test_week - 20
is_recent = transactions['week'] > oldest_excluded_week
transactions = transactions[is_recent]

# Preprocessing + Scenario Setup

In [4]:
# Column names shared by the preprocessor and both filters.
id_columns = dict(item_ix='article_id', user_ix='customer_id')

proc = DataFramePreprocessor(timestamp_ix='week', **id_columns)
# Filters are registered in order: MinUsersPerItem first, then MinItemsPerUser,
# both with a threshold of 10.
for min_count_filter in (MinUsersPerItem, MinItemsPerUser):
    proc.add_filter(min_count_filter(10, **id_columns))

# Convert the filtered transactions into the interaction matrix used by all baselines.
interaction_matrix = proc.process(transactions)

  0%|          | 0/3971573 [00:00<?, ?it/s]

  0%|          | 0/3971573 [00:00<?, ?it/s]

# Generating baseline

### Most popular articles

In [5]:
# Fit the popularity baseline on the interaction matrix.
popularity = Popularity(K=12)
popularity.fit(interaction_matrix)

# `sorted_scores_` is unpacked as (item id, score) pairs; keep the ids of the
# first 12 entries. These are the matrix-internal item ids, not article ids.
top_scored_items = popularity.sorted_scores_[:12]
popular_item_ids = [item_id for item_id, _score in top_scored_items]

2022-12-22 16:51:22,073 - base - recpack - INFO - Fitting Popularity complete - Took 0.423s


# Item similarity

### ItemKNN

In [6]:
# ItemKNN hyperparameters (the notebook summary says they come from comparison-KNN.ipynb).
knn_params = dict(K=90, normalize_X=False, normalize_sim=True, similarity='cosine')

knn = ItemKNN(**knn_params)
knn.fit(interaction_matrix)
# Score items for every user using the same interactions the model was fitted on.
prediction_matrix_knn = knn.predict(interaction_matrix)

2022-12-22 16:51:25,563 - base - recpack - INFO - Fitting ItemKNN complete - Took 3.47s


In [7]:
# Reduce the ItemKNN prediction matrix to 12 recommended articles per user.
similarity_recommendations_knn = get_top_k_similar_articles_per_user(
    prediction_matrix_knn,
    interaction_matrix,
    k=12,
)

### TARSItemKNN

In [8]:
# TARSItemKNN hyperparameters (the notebook summary says they come from comparison-KNN.ipynb).
tknn_params = dict(K=580, fit_decay=0.1, predict_decay=1/3, similarity='cosine')

tknn = TARSItemKNN(**tknn_params)
tknn.fit(interaction_matrix)
# Score items for every user using the same interactions the model was fitted on.
prediction_matrix_tknn = tknn.predict(interaction_matrix)

2022-12-22 16:52:12,240 - base - recpack - INFO - Fitting TARSItemKNN complete - Took 18.4s


In [9]:
# Reduce the TARSItemKNN prediction matrix to 12 recommended articles per user.
similarity_recommendations_tknn = get_top_k_similar_articles_per_user(
    prediction_matrix_tknn,
    interaction_matrix,
    k=12,
)

# Calculate predictions

In [10]:
# popularity
popular_article_ids = [interaction_matrix._df[interaction_matrix._df['iid'] == pop_iid]['article_id'].values[0] for pop_iid in popular_item_ids]
# ItemKNN
c_id2predicted_article_ids_knn = similarity_recommendations_knn.groupby('customer_id')['article_id'].apply(list).to_dict()
# TARSItemKNN
c_id2predicted_article_ids_tknn = similarity_recommendations_tknn.groupby('customer_id')['article_id'].apply(list).to_dict()

# Create submission

### Popular items only

In [11]:
# Start from the sample submission, which provides the customers to predict for.
sample_submission_path = f'{DATA_PATH}/sample_submission.csv'
sub = pd.read_csv(sample_submission_path)

In [12]:
# Every customer in the submission gets the same popularity list; the slice
# creates a separate list of at most 12 articles for each customer.
preds = [
    popular_article_ids[:12]
    for _ in customer_hex_id_to_int(sub.customer_id)
]

In [13]:
# Render each prediction list as one space-separated string, prefixing every
# article id with '0' (assumes the ids lost exactly one leading zero when stored
# as numbers - TODO confirm).
# The strings get their own name instead of overwriting `preds`, so re-running
# this cell does not re-format already formatted strings.
pred_strings = [
    ' '.join('0' + str(article_id) for article_id in article_ids)
    for article_ids in preds
]
sub.prediction = pred_strings

In [14]:
# Save the popularity submission; pandas infers gzip compression from the '.gz' suffix.
sub_name = 'submission_Popularity_baseline'
submission_path = f'{DATA_PATH}/subs/{sub_name}.csv.gz'
sub.to_csv(submission_path, index=False)

### ItemKNN

In [15]:
# Reload a fresh sample submission for the ItemKNN predictions.
sample_submission_path = f'{DATA_PATH}/sample_submission.csv'
sub = pd.read_csv(sample_submission_path)

In [16]:
# One prediction list per customer in the submission: the customer's ItemKNN
# recommendations first, padded with the globally popular articles. Customers
# without ItemKNN recommendations get the popular articles only.
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids_knn.get(c_id, [])
    # dict.fromkeys drops repeated article ids while keeping first-seen order, so
    # a popular article that was already recommended does not use a second slot.
    pred = list(dict.fromkeys(pred + popular_article_ids))
    preds.append(pred[:12])

In [17]:
# Render each prediction list as one space-separated string, prefixing every
# article id with '0' (assumes the ids lost exactly one leading zero when stored
# as numbers - TODO confirm).
# The strings get their own name instead of overwriting `preds`, so re-running
# this cell does not re-format already formatted strings.
pred_strings = [
    ' '.join('0' + str(article_id) for article_id in article_ids)
    for article_ids in preds
]
sub.prediction = pred_strings

In [18]:
# Save the ItemKNN submission; pandas infers gzip compression from the '.gz' suffix.
sub_name = 'submission_ItemKNN_baseline'
submission_path = f'{DATA_PATH}/subs/{sub_name}.csv.gz'
sub.to_csv(submission_path, index=False)

### TARSItemKNN

In [19]:
# Reload a fresh sample submission for the TARSItemKNN predictions.
sample_submission_path = f'{DATA_PATH}/sample_submission.csv'
sub = pd.read_csv(sample_submission_path)

In [20]:
# One prediction list per customer in the submission: the customer's TARSItemKNN
# recommendations first, padded with the globally popular articles. Customers
# without TARSItemKNN recommendations get the popular articles only.
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids_tknn.get(c_id, [])
    # dict.fromkeys drops repeated article ids while keeping first-seen order, so
    # a popular article that was already recommended does not use a second slot.
    pred = list(dict.fromkeys(pred + popular_article_ids))
    preds.append(pred[:12])

In [21]:
# Render each prediction list as one space-separated string, prefixing every
# article id with '0' (assumes the ids lost exactly one leading zero when stored
# as numbers - TODO confirm).
# The strings get their own name instead of overwriting `preds`, so re-running
# this cell does not re-format already formatted strings.
pred_strings = [
    ' '.join('0' + str(article_id) for article_id in article_ids)
    for article_ids in preds
]
sub.prediction = pred_strings

In [22]:
# Save the TARSItemKNN submission; pandas infers gzip compression from the '.gz' suffix.
sub_name = 'submission_TARSItemKNN_baseline'
submission_path = f'{DATA_PATH}/subs/{sub_name}.csv.gz'
sub.to_csv(submission_path, index=False)