In [52]:
import pandas as pd

In [53]:
train = pd.read_csv("dataset/split/fold_0/train.csv")

In [59]:
test = pd.read_csv("dataset/split/fold_0/test.csv")

In [54]:
rank = pd.read_csv("./cache/fold_0/ranking/rank.csv")

In [78]:
rank[rank['customer_id'].isin(train['customer_id'])]['customer_id'].iloc[0]

'001cc0d386cd17259b4c18407e3c8303afc9762ea87557c718642f8c5004ae7a'

In [40]:
trn = pd.read_csv("./cache/fold_0/recall/rank_train.csv")
tst = pd.read_csv("./cache/fold_0/recall/rank_test.csv")

In [42]:
from tqdm import tqdm

def recall_select(trn, target_cols: List[int]):
    # Create a dictionary to store the new dataframes
    new_dataframes = {cid: [] for cid in trn['customer_id'].unique()}

    for col in tqdm(target_cols, desc="selecting recall pipeline"):
        # Step 2: Create a new dataframe for each selected column where col == 1
        new_df = trn[trn[col] == 1][[col, f"{col}_score", "purchased", "customer_id", "article_id"]]
        
        # Store the new dataframe in the dictionary
        for cid, _df in new_df.groupby("customer_id"):
            new_dataframes[cid].append(_df.reset_index(drop=True))
    
    score_cols = [f"{col}_score" for col in target_cols]
    for cid in tqdm(new_dataframes, desc="Sorting by average recall score"):
        new_df = pd.concat(new_dataframes[cid]).sort_values(f"{col}_score", ascending=False).fillna(0.0)
        new_df['agg_score'] = new_df.apply(lambda x: sum(x[sc] for sc in score_cols), axis=1)
        new_dataframes[cid] = new_df.sort_values("agg_score", ascending=False)

    return new_dataframes

debug = recall_select(tst, ['also_buy', 'product'])

selecting recall pipeline: 100%|██████████| 2/2 [00:00<00:00,  2.10it/s]
Sorting by average recall score: 100%|██████████| 3293/3293 [00:14<00:00, 233.60it/s]


{'d87b63c024f5a45e632032451003c660e3db5eaa4ea2b121dfe2a28603d7adab':     also_buy  also_buy_score  purchased   
 9        0.0            0.00          0  \
 29       1.0            1.00          0   
 74       1.0            0.99          0   
 16       1.0            0.98          0   
 12       0.0            0.00          0   
 ..       ...             ...        ...   
 0        1.0            0.04          0   
 53       1.0            0.03          0   
 37       0.0            0.00          0   
 6        1.0            0.02          0   
 26       1.0            0.01          0   
 
                                           customer_id  article_id  product   
 9   d87b63c024f5a45e632032451003c660e3db5eaa4ea2b1...   695545001      1.0  \
 29  d87b63c024f5a45e632032451003c660e3db5eaa4ea2b1...   701924001      0.0   
 74  d87b63c024f5a45e632032451003c660e3db5eaa4ea2b1...   689389002      0.0   
 16  d87b63c024f5a45e632032451003c660e3db5eaa4ea2b1...   673309001      0.0   
 12  d8

In [36]:
import pandas as pd
from typing import List

def process_dataframe(trn, target_cols: List[int] = ['also_buy', 'img']):
    # Create a dictionary to store the new dataframes
    new_dataframes = {cid: [] for cid in trn['customer_id'].unique()}

    for col in target_cols:
        # Step 2: Create a new dataframe for each selected column where col == 1
        new_df = trn[trn[col] == 1][[col, f"{col}_score", "purchased", "customer_id", "article_id"]]
        
        # Store the new dataframe in the dictionary
        for cid, _df in new_df.groupby("customer_id"):
            new_dataframes[cid].append(_df.reset_index(drop=True))
    
    score_cols = [f"{col}_score" for col in target_cols]
    for cid in new_dataframes:
        new_df = pd.concat(new_dataframes[cid]).sort_values(f"{col}_score", ascending=False).fillna(0.0)
        new_df['agg_score'] = new_df.apply(lambda x: sum(x[sc] for sc in score_cols), axis=1)
        new_dataframes[cid] = new_df.sort_values("agg_score", ascending=False)

    return new_dataframes

# Example usage
# Assuming 'trn' is your dataframe
new_dfs = process_dataframe(trn)

In [39]:
pd.concat(new_dfs.values())

Unnamed: 0,also_buy,also_buy_score,purchased,customer_id,article_id,img,img_score,agg_score
34,0.0,0.00,0,0c2f21b7b1668cc36805ab798c631e8d5e0b7ae81bec0a...,706016002,1.0,1.00,1.00
22,0.0,0.00,0,0c2f21b7b1668cc36805ab798c631e8d5e0b7ae81bec0a...,699075002,1.0,0.99,0.99
37,1.0,0.99,0,0c2f21b7b1668cc36805ab798c631e8d5e0b7ae81bec0a...,727682001,0.0,0.00,0.99
42,1.0,0.98,0,0c2f21b7b1668cc36805ab798c631e8d5e0b7ae81bec0a...,726177002,0.0,0.00,0.98
18,0.0,0.00,0,0c2f21b7b1668cc36805ab798c631e8d5e0b7ae81bec0a...,699077002,1.0,0.98,0.98
...,...,...,...,...,...,...,...,...
31,1.0,0.03,0,efb7f721c7b92f4aa24ff214a236055761eaf71a8cd0b7...,706016002,0.0,0.00,0.03
75,0.0,0.00,0,efb7f721c7b92f4aa24ff214a236055761eaf71a8cd0b7...,573716002,1.0,0.02,0.02
75,1.0,0.02,0,efb7f721c7b92f4aa24ff214a236055761eaf71a8cd0b7...,768610002,0.0,0.00,0.02
93,0.0,0.00,0,efb7f721c7b92f4aa24ff214a236055761eaf71a8cd0b7...,561797001,1.0,0.01,0.01


In [31]:
new_dfs['00798bd464457d23d6af401715fe32d5c676ad9ee4010d6a3c02580994d31124'].keys()

dict_keys(['also_buy', 'img', 'product', 'pop', 'user_cf', 'item2vec', 'postal'])

In [10]:
"""
TODO:
1. find all columns that does not contain `id` and `score`. Each of these columns contains binary value that
represents whether the item is recalled for the user with this recall pipeline

2. make a new dataframe for each selected column, say `col`, where col=1. The new dataframes have columns:
[`col`, `col`_score, purchased, customer_id, article_id]. The original dataframe contains all these columns
"""

'\nTODO:\n1. find all columns that does not contain `id` and `score`. Each of these columns contains binary value that represents whether the item is recalled for the user with this recall pipeline\n2. \n'

In [None]:
"""
TODO:
1. text embedding class: cluster + popularity, cluster + content-based
2. image embedding class: cluster + popularity, cluster + content-based
3. age group
4. model feature importance intepretations and dump
5. re-run on full users
6. re-run on selected pipelines, according to importance intepretations
"""

In [2]:
import pandas as pd
import numpy as np

store_path = "./feature/dino_image_emb.npy"
emb = np.load(store_path, allow_pickle=True).item()

In [65]:
def age_group_recall(train: pd.DataFrame, customers: pd.DataFrame, purchase_count: pd.DataFrame, *args, **kwargs):
    # Step 1: Add an age group tag for customers according to customer's `age`, with age interval size 5
    customers['age_group'] = (customers['age'] // 5) * 5
    
    # Step 2: Merge train data with customers' age group
    trn_age_group = pd.merge(train, customers[['customer_id', 'age_group']], on=['customer_id'])

    # Step 3: Merge the result with purchase count
    pop_trn_age_group = pd.merge(trn_age_group, purchase_count, on=['article_id'])

    # Step 4: Group by age group and sort by count
    trn_age_group_group = {}
    for age_group, group in pop_trn_age_group.groupby("age_group"):
        _group = group.sort_values("count", ascending=False)[['article_id', 'count']].drop_duplicates("article_id")
        trn_age_group_group[age_group] = _group

    # Create a map of customers to their age group
    customers_age_group_map = pd.Series(data=list(customers['age_group']), index=customers['customer_id']).to_dict()

    return trn_age_group_group, customers_age_group_map

def postal_code_recall(train: pd.DataFrame, customers: pd.DataFrame, purchase_count: pd.DataFrame, *args, **kwargs):
    trn_postal = pd.merge(train, customers[['customer_id', 'postal_code']], on=['customer_id'])

    pop_trn_postal = pd.merge(trn_postal, purchase_count, on=['article_id'])
    trn_postal_group = {}

    for postal_code, group in pop_trn_postal.groupby("postal_code"):
        _group = group.sort_values("count", ascending=False)[['article_id', 'count']].drop_duplicates("article_id")
        trn_postal_group[postal_code] = _group

    customers_postal_code_map = pd.Series(data=list(customers['postal_code']), index=customers['customer_id']).to_dict()

    return trn_postal_group, customers_postal_code_map

def popularity_recall(train: pd.DataFrame, *args, **kwargs):
    """
    Recall the most popular items in the training period
    """
    counts = train['article_id'].value_counts()
    purchase_count = pd.DataFrame(counts).reset_index()

    return purchase_count

In [66]:
train = pd.read_csv("./dataset/split/fold_0/train.csv")
customers = pd.read_csv("./dataset/customers.csv")
articles = pd.read_csv("./dataset/articles.csv")

In [72]:
test = pd.read_csv("./dataset/split/fold_0/test.csv")

In [67]:
purchase_count = popularity_recall(train)

In [74]:
purchase_dict = test.groupby('customer_id')['article_id'].agg(list)

In [76]:
precision = []
recall = []
for cid, purchased in purchase_dict.values():
    precision.append()

AttributeError: 'Series' object has no attribute 'iterrows'

In [70]:
purchase_count[:520]['article_id']

0      689109001
1      706016001
2      692930001
3      706016002
4      689109003
         ...    
515    686265006
516    539723038
517    707488004
518    746332002
519    688873002
Name: article_id, Length: 520, dtype: int64

In [31]:
trn_age_group_group, customers_age_group_map = age_group_recall(train, customers, purchase_count)

In [32]:
# trn_postal_code_group, customers_postal_code_map = postal_code_recall(train, customers, purchase_count)

In [34]:
trn_age_group_group[45.0]

Unnamed: 0,article_id,count
44049,689109001,9297
302887,706016001,7728
60946,692930001,7558
474078,706016002,6422
285395,689109003,6187
...,...,...
3682203,619797001,1
3682166,516614001,1
3697747,493666011,1
3652585,636355003,1


In [35]:
dataframes = []

# Iterate through the dictionary
for group, df in trn_age_group_group.items():
    # Add the 'group' column to the DataFrame
    df['group'] = group
    # Drop duplicates based on 'article_id'
    df = df.drop_duplicates(subset=['article_id'])
    # Append the DataFrame to the list
    dataframes.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
result_df = pd.concat(dataframes, ignore_index=True)

In [39]:
result_df

Unnamed: 0,article_id,count,group
0,689109001,9297,15.0
1,706016001,7728,15.0
2,692930001,7558,15.0
3,706016002,6422,15.0
4,689109003,6187,15.0
...,...,...,...
276860,720810001,42,95.0
276861,584631020,41,95.0
276862,717676001,39,95.0
276863,564314018,29,95.0


In [80]:
"""
TODO:
1. content-based redo, precompute similarities between all items
2. data filtering pipeline, only select a small portion of users
3. ablation study develop
"""