In [1]:
import numpy as np
from helper_functions import recall, recall12, mean_recall, calculate_recall_per_customer_batch
from helper_functions import read_parquet_datasets
import pandas as pd
import time

# Load the three tables through the project helper.
transactions, customers, articles = read_parquet_datasets()

# Snapshot of the full history, taken before `transactions` is trimmed below.
transactions_copy = transactions.copy()

# Rows of the most recent week form the validation set (they also remain part
# of `transactions`); the week after it is the test week.
last_week = transactions.week.max()
validation = transactions[transactions.week == last_week]
test_week = last_week + 1

# Keep only the ten most recent weeks as the working frame.
transactions = transactions[transactions.week > last_week - 10]

# Generating candidates

### Last purchase candidates

In [6]:
# For every customer, map each week in which they purchased to the next week in
# which they purchased; their final purchase week maps to test_week.
c2weeks = transactions.groupby('customer_id')['week'].unique()
c2weeks2shifted_weeks = {}

for c_id, purchase_weeks in c2weeks.items():
    # unique() returns values in order of appearance, so sort explicitly instead
    # of relying on `transactions` being stored in chronological order.
    purchase_weeks = np.sort(purchase_weeks)
    shifted = dict(zip(purchase_weeks[:-1], purchase_weeks[1:]))
    shifted[purchase_weeks[-1]] = test_week
    c2weeks2shifted_weeks[c_id] = shifted

# Re-label every transaction with the customer's following purchase week, so
# each purchased article becomes a candidate for that later week.
candidates_last_purchase = transactions.copy()
weeks = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(transactions['customer_id'], transactions['week'])
]
candidates_last_purchase['week'] = weeks

### Create three features: age, index_group_name and mean purchase price of the customer.

In [7]:
# Feature 1: mean purchase price per customer.
mean_price_per_c = (
    transactions
    .groupby('customer_id')['price']
    .mean()
    .rename('mean_price_per_c')
    .reset_index()
)

# Feature 2: each customer's most frequently purchased index_group_name.
# Attach the index group of every purchased article first.
article_index_groups = articles[['article_id', 'index_group_name']]
temp_transactions = transactions.merge(article_index_groups, on='article_id', how='left')
# Count purchases per (customer, index_group_name) ...
c2index_group_name = (
    temp_transactions
    .groupby('customer_id')['index_group_name']
    .value_counts()
    .rename('count')
    .reset_index()
)
# ... and keep the single highest-count row for each customer.
c2index_group_name = (
    c2index_group_name
    .sort_values('count', ascending=False)
    .groupby('customer_id')
    .head(1)
    .rename(columns={'index_group_name': 'highest_count_ign_per_c'})
)
# Per-customer table holding both customer-level features.
transactions_with_2feat = (
    c2index_group_name[['customer_id', 'highest_count_ign_per_c']]
    .merge(mean_price_per_c, on='customer_id', how='left')
)

# Feature 3: mean age of the customers who bought each article, cast to int8
# (the cast discards the fractional part).
transactions_with_age = transactions.merge(customers[['customer_id', 'age']], on='customer_id', how='left')
mean_age_per_article = (
    transactions_with_age
    .groupby('article_id')['age']
    .mean()
    .rename('mean_age_per_a')
    .astype('int8')
    .reset_index()
)

# Combine everything on the transaction rows: customer features via
# customer_id, then the article mean age and the article's own
# index_group_name via article_id.
transactions_with_age_2feat = transactions_with_age.merge(transactions_with_2feat, on='customer_id', how='left')
transactions_with_3feat = (
    transactions_with_age_2feat
    .merge(mean_age_per_article, on='article_id', how='left')
    .merge(article_index_groups, on='article_id', how='left')
)

In [8]:
# Inspect the feature-enriched transactions; the notebook renders a truncated preview.
transactions_with_3feat

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,age,highest_count_ign_per_c,mean_price_per_c,mean_age_per_a,index_group_name
0,2020-07-15,272412481300040,778064028,0.008458,1,95,48,0,0.030597,32,0
1,2020-07-15,272412481300040,816592008,0.016932,1,95,48,0,0.030597,37,2
2,2020-07-15,272412481300040,621381021,0.033881,1,95,48,0,0.030597,35,2
3,2020-07-15,272412481300040,817477003,0.025407,1,95,48,0,0.030597,33,0
4,2020-07-15,272412481300040,899088002,0.025407,1,95,48,0,0.030597,35,2
...,...,...,...,...,...,...,...,...,...,...,...
2762867,2020-09-22,18439937050817258297,891591003,0.084729,2,104,49,0,0.040379,41,0
2762868,2020-09-22,18439937050817258297,869706005,0.084729,2,104,49,0,0.040379,39,0
2762869,2020-09-22,18440902715633436014,918894002,0.016932,1,104,18,0,0.028232,30,0
2762870,2020-09-22,18440902715633436014,761269001,0.016932,1,104,18,0,0.028232,33,0


### Generate personalized candidates for each customer based on three features: age, index_group_name and mean purchase price of the customer.

In [9]:
# Per-customer feature table (preferred index group, mean price) extended with
# the customer's age.
t_merged = transactions_with_2feat.merge(customers[['customer_id', 'age']], on='customer_id', how='left')

# One row per article: de-duplicate the article-level columns, order by
# purchase date (newest first) and keep the first row seen for each article.
article_columns = ['t_dat', 'sales_channel_id', 'article_id', 'price', 'index_group_name', 'week']
unique_articles = (
    transactions_with_3feat[article_columns]
    .drop_duplicates()
    .sort_values('t_dat', ascending=False)
    .drop_duplicates('article_id')
)

# Sample of the first 1000 customers, split into chunks of `chunk_size`.
unique_customers = mean_price_per_c['customer_id'].unique()[:1000]
chunk_size = 1000  # number of customers to process at once
chunks = [
    unique_customers[offset:offset + chunk_size]
    for offset in range(0, len(unique_customers), chunk_size)
]

# Number of candidates kept by the price filter and then by the age filter.
topX_price = 50
topX_age = 25

def get_candidates(customer_chunks, top_x_price=50, top_x_age=25):
    """Select personalised candidate articles for each chunk of customers.

    For every chunk, all rows of the notebook-level ``unique_articles`` frame
    are paired with every customer in the chunk, and the pairs are narrowed in
    three steps:

    1. keep articles whose ``index_group_name`` equals the customer's
       ``highest_count_ign_per_c``;
    2. within each ``(week, customer_id)`` group keep the ``top_x_price`` rows
       whose ``price`` is closest to the customer's ``mean_price_per_c``;
    3. within each ``(week, customer_id)`` group keep the ``top_x_age`` rows
       whose ``mean_age_per_a`` is closest to the customer's ``age``.

    Reads the notebook-level frames ``unique_articles``, ``t_merged`` and
    ``mean_age_per_article``.

    NOTE(review): ``week`` comes from ``unique_articles``, i.e. it is the week
    of the article row kept there, so the limits apply per (week, customer)
    pair rather than per customer -- confirm this is intended.

    Parameters
    ----------
    customer_chunks : iterable of array-like
        Each element is a collection of customer ids processed together.
    top_x_price : int, default 50
        Rows kept per group after ranking by price difference.
    top_x_age : int, default 25
        Rows kept per group after ranking by age difference.

    Returns
    -------
    pandas.DataFrame
        Concatenated candidates of all chunks, including ``price_rank`` and
        ``age_rank`` but without the intermediate ``price_difference`` and
        ``age_difference`` columns. An empty frame if ``customer_chunks``
        yields no chunks.
    """
    group_keys = ['week', 'customer_id']
    sort_prefix = ['customer_id', 'week']
    chunk_results = []

    for customer_chunk in customer_chunks:
        # Cartesian product: every article is a potential candidate for every
        # customer of the chunk.
        candidate_articles = pd.merge(
            unique_articles,
            pd.DataFrame({'customer_id': customer_chunk}),
            how='cross'
        )
        # Bring in the customer-level and article-level features used to filter.
        candidate_articles = pd.merge(candidate_articles, t_merged, on='customer_id', how='left')
        candidate_articles = pd.merge(candidate_articles, mean_age_per_article, on='article_id', how='left')

        # Keep only articles from the customer's preferred index group. The
        # explicit copy makes the column assignments below act on an
        # independent frame instead of a slice (no SettingWithCopyWarning).
        in_preferred_group = (
            candidate_articles['highest_count_ign_per_c'] == candidate_articles['index_group_name']
        )
        candidate_articles = candidate_articles[in_preferred_group].copy()

        # Rank by distance between the article price and the customer's mean price.
        candidate_articles['price_difference'] = (
            candidate_articles['mean_price_per_c'] - candidate_articles['price']
        ).abs()
        candidate_articles['price_rank'] = (
            candidate_articles
            .groupby(group_keys)['price_difference']
            .rank(ascending=True, method='min')
        )
        # Keep the top_x_price best-ranked rows of every group.
        top_candidates = (
            candidate_articles
            .sort_values(by=sort_prefix + ['price_rank'])
            .groupby(group_keys)
            .head(top_x_price)
            .copy()
        )

        # Rank the survivors by distance between customer age and article mean age.
        top_candidates['age_difference'] = (
            top_candidates['age'] - top_candidates['mean_age_per_a']
        ).abs()
        top_candidates['age_rank'] = (
            top_candidates
            .groupby(group_keys)['age_difference']
            .rank(ascending=True, method='min')
        )
        # Keep the top_x_age best-ranked rows of every group.
        top_candidates = (
            top_candidates
            .sort_values(by=sort_prefix + ['age_rank'])
            .groupby(group_keys)
            .head(top_x_age)
        )

        chunk_results.append(top_candidates)

    # pd.concat raises on an empty list, so handle "no chunks" explicitly.
    if not chunk_results:
        return pd.DataFrame()

    result_candidates_3feat = pd.concat(chunk_results, ignore_index=True)
    return result_candidates_3feat.drop(columns=['price_difference', 'age_difference'])

### Calculate overall mean recall

In [10]:
# Evaluate the personalised candidates against the validation week, one batch
# of customers at a time, and average the per-batch recalls.
#
# NOTE(review): unique_articles keeps one row per article taken from its most
# recent purchase date, so a candidate whose week becomes the validation week
# after the +1 shift is an article that was not sold in the validation week.
# If calculate_recall_per_customer_batch matches candidates to `validation` on
# `week`, recall is 0 by construction -- confirm against the helper.
total_customers = mean_price_per_c['customer_id'].nunique()
print("Total customers:", total_customers)
batch_size = 1000

# Customer ids to evaluate; sliced into consecutive batches below.
batches = mean_price_per_c['customer_id'].unique()

# One mean-recall value per processed batch.
mean_recalls = []

for i in range(0, total_customers, batch_size):
    start = time.time()

    # Customers i .. i + batch_size - 1 (the last batch may be shorter).
    current_batch = batches[i:i + batch_size]

    # Candidates for this batch, passed as a single chunk.
    top_candidates_3feat = get_candidates([current_batch], topX_price, topX_age)

    # Copy of the candidates with `week` shifted forward by one.
    top_candidates_3feat_prev_week = top_candidates_3feat.assign(
        week=top_candidates_3feat['week'] + 1
    )

    # Recall of this batch's candidates against the validation transactions.
    mean_recalls.append(
        calculate_recall_per_customer_batch(
            validation, top_candidates_3feat_prev_week, current_batch, topX_age
        )
    )

    print(f'Batch {i} processed in {time.time() - start:.2f} seconds')

# Unweighted average over the batch means.
overall_mean_recall = np.mean(mean_recalls)
print("Overall Mean Recall:", overall_mean_recall)

Total customers: 437365
Batch 0 processed in 18.05 seconds
Batch 1000 processed in 18.25 seconds
Batch 2000 processed in 18.51 seconds
Batch 3000 processed in 18.13 seconds
Batch 4000 processed in 18.56 seconds
Batch 5000 processed in 17.71 seconds
Batch 6000 processed in 18.16 seconds
Batch 7000 processed in 17.79 seconds
Batch 8000 processed in 17.90 seconds
Batch 9000 processed in 18.16 seconds
Batch 10000 processed in 17.66 seconds
Batch 11000 processed in 18.12 seconds
Batch 12000 processed in 19.13 seconds
Batch 13000 processed in 18.34 seconds
Batch 14000 processed in 19.16 seconds
Batch 15000 processed in 19.26 seconds
Batch 16000 processed in 19.43 seconds
Batch 17000 processed in 19.50 seconds
Batch 18000 processed in 19.06 seconds
Batch 19000 processed in 19.38 seconds
Batch 20000 processed in 18.90 seconds
Batch 21000 processed in 19.78 seconds
Batch 22000 processed in 19.35 seconds
Batch 23000 processed in 19.01 seconds
Batch 24000 processed in 18.69 seconds
Batch 25000 pr

In [11]:
# Show the per-batch mean recall values collected by the evaluation loop.
print(mean_recalls)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,