In [1]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import os

os.chdir('../../../data/')

# Overview

In this notebook I want to generate 100 candidates for each of strategies, to then calculate recall@100 to evaluate them. This is mostly a tweak from candidate generation notebook in lecture_3 folder.

In [2]:
articles_cols = [
    'article_id', 
    'product_code',
    'product_type_no', 
    'graphical_appearance_no', 
    'colour_group_code', 
    'perceived_colour_value_id',
    'perceived_colour_master_id', 
    'department_no', 
    'index_code',
    'index_group_no', 
    'section_no', 
    'garment_group_no'
    ]

In [3]:
# Read compressed data
transactions = pd.read_pickle('../data/compressed_data/transactions_train.pkl')
customers = pd.read_pickle('../data/compressed_data/customers.pkl')
articles = pd.read_pickle('../data/compressed_data/articles.pkl')[articles_cols]

# Calculate week, where 0 is first week of data and 104 is last week of data
transactions['week'] = 104 - (transactions.t_dat.max() - transactions.t_dat).dt.days // 7

print('First week num: ', transactions.week.min(), '\nLast week num: ', transactions.week.max(), '\n')

avg_age = np.mean(customers['age'].astype('float32'))
customers['age'].fillna(avg_age.astype(np.float16), inplace=True)

First week num:  0 
Last week num:  104 



In [4]:
test_week = transactions.week.max()

train_weeks = range(test_week - 10, test_week)

transactions_train = transactions[transactions.week.isin(train_weeks)]
transaction_test = transactions[transactions.week == test_week]

## Radek's candidates

Only bestsellers as it might be impossible to get 100 candidates in last purchase, as we are looking at weeks for now.

In [5]:
# bestseller
mean_price = transactions_train.groupby(['week', 'article_id'])['price'].mean()
sales = transactions_train \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(100).rename('bestseller_rank').astype('int8')
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

unique_transactions = transactions_train \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

test_set_transactions = unique_transactions\
    .drop_duplicates('customer_id')\
    .reset_index(drop=True)

test_set_transactions.week = test_week

candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

In [8]:
candidates_bestsellers_test_week.to_csv('../data/candidates_100/radek_bestsellers.csv', index=False)
bestsellers_previous_week.to_csv('../data/candidates_100/radek_bestsellers_previous_week.csv', index=False)

## Seasonal candidates -- items bought in previous years during similar period to test week (mid september)

In [9]:
seasonal_trans = transactions[(transactions.t_dat.dt.month == 9) & (transactions.t_dat.dt.year.isin([2019, 2018]))]
seasonal_trans['year'] = transactions['t_dat'].dt.year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seasonal_trans['year'] = transactions['t_dat'].dt.year


In [13]:
seasonal_candidates = seasonal_trans.groupby(['customer_id', 'year'])['article_id'].value_counts()\
    .groupby(['customer_id', 'year']).rank(method='dense', ascending=False) \
    .groupby(['customer_id', 'year']).head(100)\
    .reset_index()\
    .drop(columns=['count'])

seasonal_candidates.to_csv('../data/candidates_100/baskets_seasonal.csv')

seasonal_candidates.sample(5)

Unnamed: 0,customer_id,year,article_id
672597,573628,2018,391772001
801072,683469,2018,634013022
1177798,1006441,2019,772773002
562941,480060,2019,803315001
524164,446950,2018,372860001


In [14]:
best_seasonal_items = seasonal_trans.groupby(['year'])['article_id'].value_counts()\
    .groupby('year').rank(method='dense', ascending=False)\
    .groupby('year').head(100).rename('seasonal_rank').astype('int8')\
    .reset_index()

best_seasonal_items.to_csv('../data/candidates_100/best_seasonal.csv')

best_seasonal_items

Unnamed: 0,year,article_id,seasonal_rank
0,2018,539723005,1
1,2018,685687003,2
2,2018,685687002,3
3,2018,685687004,4
4,2018,685687001,5
...,...,...,...
195,2019,677930023,93
196,2019,563519008,94
197,2019,803757001,95
198,2019,715624011,96


## Diversity based -- select items from categories user has not interacted with

Merely a function to create candidates, for later use in evaluation notebooks.

In [9]:
# A function to create candidates based on the most popular items in the given category
def not_interacted_with_candidates(t, a, articles_col, k=2):

    # Get unique values of given category
    group_unique_values = a[articles_col].unique()
    group_df = pd.merge(t, a[['article_id', articles_col]])

    # Get k most popular articles in given category
    popular_by_group = group_df.groupby(articles_col)['article_id'].value_counts()\
        .groupby(articles_col).head(k).reset_index()
    popular_by_group = popular_by_group[['article_id', articles_col]]

    # Not interacted category for each customer
    not_interacted_with = group_df.groupby('customer_id')[articles_col].unique()\
        .apply(lambda x: np.setdiff1d(group_unique_values, x))\
        .explode().reset_index()
    
    # Join to create recommendation based on lack of interaction
    candidates = pd.merge(not_interacted_with, popular_by_group, on=articles_col)

    return candidates[['customer_id', 'article_id']]

In [138]:
def not_interacted_with_candidates_v2(t, a, articles_col, k=10):
    
    # Get unique values of given category
    group_unique_values = a[articles_col].unique()
    group_df = pd.merge(t, a[['article_id', articles_col]])

    # Not interacted category for each customer
    not_interacted_with = group_df\
        .groupby('customer_id')[articles_col]\
        .apply(lambda x: np.array(list(set(x))))\
        .apply(lambda x: np.setdiff1d(group_unique_values, x))
    
    # Get k most popular articles in given category
    items_popularity = group_df\
        .groupby(articles_col)['article_id']\
        .value_counts()\
        .groupby(articles_col)\
        .head(k)\
        .reset_index()

    # Rank items by popularity (number of purchases)
    items_popularity['not_interacted_rank'] = items_popularity['count']\
        .rank(method='dense', ascending=False)\
        .astype('int16')
    
    items_popularity = items_popularity\
        .filter(items=['article_id', articles_col, 'not_interacted_rank'])\
        .sort_values(by=['not_interacted_rank'])

    candidates = []

    # For each customer get k most popular articles in categories that customer did not interact with
    for cid in tqdm(not_interacted_with.index.values):
        groups = not_interacted_with.loc[cid]

        cid_candidates = items_popularity\
            [items_popularity[articles_col].isin(groups)]\
            .head(k)\
            .drop(columns=[articles_col])
        
        cid_candidates['customer_id'] = cid

        candidates.append(cid_candidates)

    return pd.concat(candidates)[['customer_id', 'article_id', 'not_interacted_rank']]

# Item similarities
 
Run this only once, because it takes a long time to calculate similarity loop as there is a lot of data.

In [26]:
# Normalize data for cosine similarities
df = articles.set_index('article_id')
df = df.drop(columns=['index_code'])
df = (df - df.mean()) / df.std()

# Calculate cosine similarities only for articles that are in transactions_train, to reduce the size of the matrix
articles_ids = transactions_train.article_id.unique() 
df = df[df.index.isin(articles_ids)]

In [41]:
# # For each item, get top 100 most similar items
X = df.to_numpy()
articles_arr = df.index.values
sims = {}
for i, row in tqdm(zip(articles_arr, X)):
    top_n_sim = cosine_similarity(row.reshape(1, -1), X).argsort()[:, -102:-1]
    article_ids = articles_arr[top_n_sim].reshape(-1)
    sims[i] = article_ids

38540it [03:08, 204.51it/s]


In [42]:
## This needed to be dony only once, so I commented it out after running, not to overwrite saved data 
# sims = {int(i):[int(x) for x in v] for i,v in sims.items()}
# with open('../data/item_similarities_100.json', 'w') as f:
#     json.dump(sims, f)

In [181]:
with open('../data/item_similarities_100.json', 'r') as f:
    sims = json.load(f)

## Items similar to purchased but not actually purchased

In [43]:
purchased_by_user = transactions_train.groupby('customer_id')['article_id'].apply(list)

In [48]:
result = []
for cid, bought in tqdm(purchased_by_user.items()):
    similar_to_bought = []
    for bought_item in bought:
        similar_to_bought += sims[bought_item]
    sim_not_bought = [i for i in similar_to_bought if i not in bought][:100]
    result.append(sim_not_bought)

439368it [00:35, 12485.72it/s]


In [49]:
sim_not_bought = pd.DataFrame({
    'customer_id':purchased_by_user.index.values,
    'sim_not_bought':result
}).explode('sim_not_bought')
sim_not_bought

Unnamed: 0,customer_id,sim_not_bought
0,0,629381012
0,0,568456018
0,0,495884013
0,0,584298026
0,0,557908018
...,...,...
439367,1371977,665477011
439367,1371977,628813001
439367,1371977,620573006
439367,1371977,658298002


In [52]:
sim_not_bought.to_csv('../data/candidates_100/similar_not_bought.csv')