In [68]:
import gc
import re
import json
import requests
import numpy as np
import pandas as pd
import scipy.io as sio
from collections import Counter
import os
import random
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
from logging import getLogger
import sys
from data_utilis import Preprocessor_fairagent, GetInfo, split_data, split_test
import daisy
import importlib
importlib.reload(daisy)
from daisy.utils.loader import RawDataReader, Preprocessor
from daisy.utils.splitter import TestSplitter, ValidationSplitter
from daisy.utils.config import init_seed, init_config, init_logger
from daisy.utils.metrics import MAP, NDCG, Recall, Precision, HR, MRR
from daisy.utils.sampler import BasicNegtiveSampler, SkipGramNegativeSampler, UniqueNegativeSampler
from daisy.utils.dataset import AEDataset, BasicDataset, CandidatesDataset, get_dataloader
from daisy.utils.utils import get_history_matrix, get_ur, build_candidates_set, ensure_dir, get_inter_matrix

In [None]:
# Build the experiment configuration via daisy's init_config().
config = init_config()

# Initialise the seed for reproducibility.
init_seed(config['seed'], config['reproducibility'])

# Initialise logging, record the active configuration, and expose the logger
# to downstream components through the config object.
init_logger(config)
logger = getLogger()
logger.info(config)
config['logger'] = logger

In [72]:
# Output directory tree for this run. All paths are built by plain string
# concatenation, so config['save_path'] and config['version'] must already carry
# any separators needed between them and before the dataset name.
# ensure_dir is a daisy helper — presumably it creates the directory when missing.
save_path = config['save_path'] + config['version']
ensure_dir(save_path)

# <save_path><dataset>/ : everything belonging to the chosen dataset
file_path = save_path + f'{config["dataset"]}/'
ensure_dir(file_path)

# <file_path>data/ : preprocessed splits, samples and candidate sets
saved_data_path = file_path + 'data/'
ensure_dir(saved_data_path)

# <file_path><algo_name>/ : outputs of one algorithm
saved_result_path = file_path + f'{config["algo_name"]}/'
ensure_dir(saved_result_path)

saved_model_path = saved_result_path + 'model/'
ensure_dir(saved_model_path)

saved_rec_path = saved_result_path + 'rec_list/'
ensure_dir(saved_rec_path)

saved_metric_path = saved_result_path + 'metric/'
ensure_dir(saved_metric_path)

In [74]:
# Load the raw interaction log.
data_path = './origin_data/'
origin_data = pd.read_csv(data_path + 'small_matrix.csv')
# Drop every row that contains a missing value. The defaults (axis=0, how='any') are
# exactly what is wanted; passing `how` and `thresh` together is rejected by newer
# pandas releases, so the redundant keyword arguments are omitted.
origin_data = origin_data.dropna()
# Rename the raw columns to the user / item / interaction names configured for daisy.
origin_data = origin_data.rename(columns={'user_id': config['UID_NAME'], 'video_id': config['IID_NAME'], 'watch_ratio': config['INTER_NAME']})

In [6]:
# Preview the cleaned, renamed interaction table.
origin_data

Unnamed: 0,user,item,play_duration,video_duration,time,date,timestamp,label
0,14,148,4381,6067,2020-07-05 05:27:48.378,20200705.0,1.593898e+09,0.722103
1,14,183,11635,6100,2020-07-05 05:28:00.057,20200705.0,1.593898e+09,1.907377
2,14,3650,22422,10867,2020-07-05 05:29:09.479,20200705.0,1.593898e+09,2.063311
3,14,5263,4479,7908,2020-07-05 05:30:43.285,20200705.0,1.593898e+09,0.566388
4,14,8235,4602,11000,2020-07-05 05:35:43.459,20200705.0,1.593899e+09,0.418364
...,...,...,...,...,...,...,...,...
4676370,7162,9178,5315,37205,2020-09-01 20:06:35.984,20200901.0,1.598962e+09,0.142857
4676371,7162,4988,10085,8167,2020-09-02 14:44:51.342,20200902.0,1.599029e+09,1.234848
4676372,7162,7989,50523,49319,2020-09-03 08:45:01.474,20200903.0,1.599094e+09,1.024412
4676373,7162,6534,2190,8000,2020-09-04 22:56:32.021,20200904.0,1.599231e+09,0.273750


In [None]:
# process data
# Work on a copy so that origin_data stays available for re-running this cell.
df = origin_data.copy()
# The class imported from data_utilis is Preprocessor_fairagent; the previous
# spelling (Preprocessor_fairaget) raised a NameError.
processor = Preprocessor_fairagent(config)
df = processor.process(df)
# user_num / item_num are stored in config by a later cell; time_dict is later passed
# to convert_id as the {original_id: new_id} mapping for the content embeddings.
user_num, item_num, time_dict = processor.user_num, processor.item_num, processor.time_dict

In [60]:
## save data
# Persist the fully preprocessed interaction table (the DataFrame index is written as well).
df.to_csv(saved_data_path + 'all_data_set.csv')

In [77]:
# Store the number of users and items in config so later cells can read them
# (config['item_num'] is used when building the per-user interaction vectors).
config['user_num'] = user_num
config['item_num'] = item_num

# Report the size of the preprocessed dataset.
print(f"user number: {user_num}  item number: {item_num}  interaction number: {len(df)}")

user number: 1411  item number: 3317  interaction number: 1471862


In [12]:
# Split the interactions into train_base, train_rl and test; the test part is then cut
# into config['test_num'] test periods. In the run shown this gives 30% train_base,
# 20% train_rl and five test periods of 10% each (ratios come from config).
train_base_ids, train_rl_ids, test_ids = split_data(df, config['train_base_ratio'], config['train_rl_ratio'])
# The returned ids are positional row indices (they are consumed through .iloc).
train_base_set, train_rl_set, test_set = df.iloc[train_base_ids, :].copy(), df.iloc[train_rl_ids, :].copy(), df.iloc[test_ids, :].copy()
test_df_list = split_test(test_set, config['test_num'])

In [14]:
# Report user/item statistics for every split, including how many users and items
# are new relative to the earlier splits.
# NOTE(review): the original comment said "If there are new users, filter"; whether
# GetInfo actually filters anything is not visible here — confirm in data_utilis.
set_info = GetInfo(train_base_set,train_rl_set,test_df_list,config)
set_info.get_info()

---Info:train_base---
user number: 1411  item number: 1319
len: 441559

---Info:train_rl---
len: 294372
number of user: 1411
number of new-user: 0
number of item: 1916
number of new-item: 676

---Test period 0---
len: 147186
number of user: 1409
number of new-user: 0
number of item: 2070
number of new-item: 351

---Test period 1---
len: 147186
number of user: 1411
number of new-user: 0
number of item: 2135
number of new-item: 254

---Test period 2---
len: 147186
number of user: 1409
number of new-user: 0
number of item: 2118
number of new-item: 296

---Test period 3---
len: 147186
number of user: 1410
number of new-user: 0
number of item: 1827
number of new-item: 233

---Test period 4---
len: 147187
number of user: 1410
number of new-user: 0
number of item: 1578
number of new-item: 188


In [15]:
# get initial warm and cold item list for training FairAgent and baselines.
# Warm items: every item that occurs in train_base_set, in order of first appearance.
warm_item_list = train_base_set[config['IID_NAME']].unique()

# Cold items: items of the full dataset that never occur in train_base_set.
# get_weight() assigns cold weights by list position, so the order must not depend on
# set iteration order; sorting by item ID makes it explicit and reproducible.
# NOTE: `df` must still be the preprocessed interaction table when this cell runs
# (the name is rebound to the embedding table near the end of the notebook).
cold_item_list = sorted(set(df[config['IID_NAME']].unique()) - set(warm_item_list))

In [16]:
# Inspect the warm item IDs.
warm_item_list

array([   0,    1,    2, ..., 1316, 1317, 1318], dtype=int16)

In [17]:
# Full training set = train_base rows followed by train_rl rows (original indices are kept).
train_total_set = pd.concat([train_base_set, train_rl_set])
train_total_set

Unnamed: 0,user,item,play_duration,video_duration,time,date,timestamp,label,rating
0,1221,0,29789,15034,2020-07-04 02:23:26.06,20200705.0,1.593801e+09,1.981442,1.0
1,1221,1,33766,10600,2020-07-04 06:50:30.434,20200705.0,1.593817e+09,3.185472,1.0
2,602,2,13598,9167,2020-07-05 00:00:49.448,20200705.0,1.593878e+09,1.483364,1.0
3,6,3,9883,7200,2020-07-05 00:01:03.816,20200705.0,1.593878e+09,1.372639,1.0
4,164,4,11541,11000,2020-07-05 00:01:40.379,20200705.0,1.593879e+09,1.049182,1.0
...,...,...,...,...,...,...,...,...,...
735926,640,1949,15111,8843,2020-07-31 05:51:36.332,20200731.0,1.596146e+09,1.708809,1.0
735927,995,91,11040,9100,2020-07-31 05:51:37.397,20200731.0,1.596146e+09,1.213187,1.0
735928,1058,1951,8453,7267,2020-07-31 05:51:39.554,20200731.0,1.596146e+09,1.163204,1.0
735929,700,1881,6940,6667,2020-07-31 05:51:39.708,20200731.0,1.596146e+09,1.040948,1.0


In [18]:
# Get historical interaction as ground-truth
# get_ur is a daisy helper; the results are used below like per-user mappings
# (save_ur iterates them with .items()) — presumably user -> interacted items.
train_base_ur = get_ur(train_base_set)
train_rl_ur = get_ur(train_rl_set)
train_total_ur = get_ur(train_total_set)

In [19]:
# Negative sampling for training the backbone models.
# The sampler reads the users' known interactions from config['train_ur'] and is run
# on the *total* training set (train_base + train_rl).
config['train_ur'] = train_total_ur
sampler = UniqueNegativeSampler(train_total_set, config)
train_samples = sampler.sampling()
train_dataset = BasicDataset(train_samples)

In [22]:
# Peek at the first 20 sampled rows; each row has three integer columns
# (presumably user, positive item, sampled negative item — confirm in the sampler).
list_test = train_samples[:20]
list_test

array([[1221,    0, 1035],
       [1221,    0,  851],
       [1221,    0,  984],
       [1221,    0, 1023],
       [1221,    1,  899],
       [1221,    1,  909],
       [1221,    1,  767],
       [1221,    1,  307],
       [ 602,    2,  300],
       [ 602,    2,  660],
       [ 602,    2,  478],
       [ 602,    2,  193],
       [   6,    3,  480],
       [   6,    3,  736],
       [   6,    3,  771],
       [   6,    3,  242],
       [ 164,    4,  555],
       [ 164,    4,  387],
       [ 164,    4,  330],
       [ 164,    4,  918]], dtype=int32)

In [23]:
# Get candidate sets for testing
# One entry per test period, all three lists index-aligned with test_df_list.
test_u_all = []
test_ucands_all = []
test_ur_all = []
for i in range(config['test_num']):
    # Ground-truth interactions of this test period
    test_ur = get_ur(test_df_list[i])
    # Candidates are built against the total training history (train_base + train_rl)
    test_u, test_ucands = build_candidates_set(test_ur, train_total_ur, config)
    test_u_all.append(test_u)
    test_ucands_all.append(test_ucands)
    test_ur_all.append(test_ur)

In [24]:
# Persist the warm/cold item partition, the training samples and the per-period test artifacts.
np.save(saved_data_path + 'warm_item_list.npy',warm_item_list)
np.save(saved_data_path + 'cold_item_list.npy',cold_item_list)
# NOTE(review): train_samples was drawn from train_total_set (see the sampler cell) and the
# same array is saved again later as 'train_total_samples.npy' — confirm that
# 'train_base_samples.npy' is really meant to hold the total-set samples.
np.save(saved_data_path + 'train_base_samples.npy',train_samples)
train_base_set.to_csv(saved_data_path + 'train_base_set.csv')
# The three lists below hold one entry per test period.
# NOTE(review): if the per-period entries differ in length or are dicts, np.save has to
# store them as object arrays (loading then needs allow_pickle=True) — verify the loaders.
np.save(saved_data_path + 'test_u_all.npy',test_u_all)
np.save(saved_data_path + 'test_ucands_all.npy',test_ucands_all)
np.save(saved_data_path + 'test_ur_all.npy',test_ur_all)

In [None]:
def save_ur(ur, saved_data_path, saved_name):
    """
    Write a user -> items mapping to a JSON file.

    Args:
        ur (dict): Mapping from user ID to an iterable of item IDs.
        saved_data_path (str): Directory prefix; it is concatenated with saved_name
            as-is, so it must end with a path separator.
        saved_name (str): File name of the JSON file to write.

    Notes:
        numpy scalars (e.g. np.int64 keys or items produced by pandas/numpy) are
        converted to native Python values first, because json.dump rejects them.
        JSON object keys are always strings, so integer user IDs come back as
        strings when the file is loaded.
    """
    def _to_native(value):
        # numpy scalars expose .item(); ordinary Python values pass through unchanged
        return value.item() if isinstance(value, np.generic) else value

    ur_dict = {
        _to_native(key): [_to_native(item) for item in value]
        for key, value in ur.items()
    }
    with open(saved_data_path + saved_name, 'w') as file:
        json.dump(ur_dict, file)

In [25]:
# Get candidate set of train_rl to train FairAgent
# Candidates for the train_rl interactions are built against the train_base history only.
train_rl_u, train_rl_ucands = build_candidates_set(train_rl_ur, train_base_ur, config)
# Recomputed here (same call as in the earlier get_ur cell) so this cell does not
# depend on that cell's train_total_ur still being in the kernel.
train_total_ur = get_ur(train_total_set)

# save_ur(train_base_ur, saved_data_path, 'train_base_ur.json')
# save_ur(train_rl_ur, saved_data_path, 'train_rl_ur.json')
save_ur(train_total_ur, saved_data_path, 'train_total_ur.json')

In [29]:
# Persist the train_rl users, their candidate sets and their ground-truth interactions.
np.save(saved_data_path + 'train_rl_u.npy',train_rl_u)
np.save(saved_data_path + 'train_rl_ucands.npy',train_rl_ucands)
np.save(saved_data_path + 'train_rl_ur.npy',train_rl_ur)

# Persist the train_rl / total training tables and the samples drawn from the total set.
train_rl_set.to_csv(saved_data_path + 'train_rl_set.csv')
np.save(saved_data_path + 'train_total_samples.npy',train_samples)
train_total_set.to_csv(saved_data_path + 'train_total_set.csv')
# 'ui_cate.npy' stores the two counts [user_num, item_num].
np.save(saved_data_path + 'ui_cate.npy',[user_num, item_num])

In [52]:
# Write each test period to its own CSV file: test_set_0.csv, test_set_1.csv, ...
for i in range(len(test_df_list)):
    test_df_list[i].to_csv(saved_data_path + f'test_set_{i}.csv')

4

In [50]:
# Per-user interaction history within train_rl: for every user, the list of item IDs
# ordered by ascending timestamp, stored in the 'user_history' column.
user_rl_df = train_rl_set.groupby('user').apply(
    lambda x: x.sort_values('timestamp')['item'].tolist()
).reset_index(name='user_history')
user_rl_df

Unnamed: 0,user,user_history
0,0,"[1304, 1283, 1099, 1321, 1378, 819, 1162, 1274..."
1,1,"[1270, 1109, 93, 399, 1264, 1185, 1062, 1310, ..."
2,2,"[1290, 1283, 1286, 1109, 551, 1340, 1314, 93, ..."
3,3,"[1291, 961, 1270, 710, 1008, 215, 1314, 1289, ..."
4,4,"[1283, 1301, 1285, 1287, 1316, 1076, 789, 1314..."
...,...,...
1406,1406,"[1308, 1257, 1360, 1404, 1372, 1228, 1407, 143..."
1407,1407,"[438, 1308, 1270, 1266, 1109, 153, 1267, 1257,..."
1408,1408,"[1284, 452, 1257, 1206, 1289, 654, 1109, 1308,..."
1409,1409,"[1315, 1220, 1362, 1360, 1221, 1369, 716, 993,..."


In [51]:
# Same per-user, timestamp-ordered item history as above, but over the total
# training set (train_base + train_rl).
user_total_df = train_total_set.groupby('user').apply(
    lambda x: x.sort_values('timestamp')['item'].tolist()
).reset_index(name='user_history')
user_total_df

Unnamed: 0,user,user_history
0,0,"[57, 10, 79, 117, 49, 34, 6, 133, 52, 87, 13, ..."
1,1,"[7, 79, 47, 3, 117, 133, 120, 101, 107, 113, 1..."
2,2,"[54, 39, 83, 16, 20, 109, 14, 185, 55, 18, 9, ..."
3,3,"[7, 57, 41, 44, 12, 10, 114, 78, 107, 52, 25, ..."
4,4,"[17, 203, 205, 170, 251, 284, 289, 291, 336, 3..."
...,...,...
1406,1406,"[24, 13, 53, 73, 79, 12, 208, 196, 278, 303, 3..."
1407,1407,"[427, 73, 229, 71, 222, 552, 145, 20, 9, 291, ..."
1408,1408,"[28, 34, 114, 203, 43, 179, 46, 192, 205, 208,..."
1409,1409,"[32, 57, 196, 192, 62, 216, 54, 271, 263, 218,..."


## Get user historical preference for training FairAgent

In [53]:
import numpy as np

def set_values_by_id(interaction_history, item_num):
    """
    Build a 0/1 indicator list over all item IDs for one user.

    Args:
        interaction_history (list): Item IDs the user interacted with.
        item_num (int): Total number of items; IDs are assumed to lie in range(item_num).

    Returns:
        list: A list of length item_num whose i-th entry is 1 if item i occurs in
        interaction_history and 0 otherwise.
    """
    # Hash-based lookup keeps every membership test O(1)
    seen_items = frozenset(interaction_history)
    return [int(item_id in seen_items) for item_id in range(item_num)]

def get_weight(warm_item_list, cold_item_list):
    """
    Build the position-based weights for the warm and the cold item groups.

    Warm weights count down from len(warm_item_list) to 1. Cold weights rise
    linearly from 1 to len(warm_item_list) across the cold positions; with fewer
    than two cold items a single weight of 1 is returned.

    Args:
        warm_item_list (list): List of warm item IDs.
        cold_item_list (list): List of cold item IDs.

    Returns:
        tuple: (warm_weight, cold_weight) as numpy arrays.
    """
    n_warm = len(warm_item_list)
    n_cold = len(cold_item_list)

    # n_warm, n_warm - 1, ..., 1
    warm_weight = np.arange(n_warm, 0, -1)

    if n_cold <= 1:
        return warm_weight, np.array([1])

    # Evenly spaced ramp from 1 (first cold item) up to n_warm (last cold item)
    scaled_positions = np.arange(n_cold) * (n_warm - 1)
    cold_weight = 1 + scaled_positions / (n_cold - 1)
    return warm_weight, cold_weight

def get_user_hist_tgf(exp_list, warm_item_list, cold_item_list, warm_weight, cold_weight):
    """
    Compute one user's warm-vs-cold weighted exposure difference (TGF).

    The exposure values are normalised to sum to 1; the weighted mean exposure of
    the cold items is then subtracted from that of the warm items.

    Args:
        exp_list (list): Exposure value per item, indexed by item ID.
        warm_item_list (list): Warm item IDs (positions into exp_list).
        cold_item_list (list): Cold item IDs (positions into exp_list).
        warm_weight (numpy array): One weight per warm item.
        cold_weight (numpy array): One weight per cold item.

    Returns:
        float: The user's TGF value, or 0 when the user has no exposure at all.
    """
    total_exposure = np.sum(exp_list)
    if total_exposure == 0:
        return 0

    # Share of the user's exposure received by each item
    shares = exp_list / total_exposure

    def _group_score(item_ids, weights):
        # Weighted exposure of one item group, averaged over the group size
        return np.sum(shares[item_ids] * weights) / len(item_ids)

    return _group_score(warm_item_list, warm_weight) - _group_score(cold_item_list, cold_weight)

def cal_tgf(exp_list, warm_item_list, cold_item_list):
    """
    Calculate the warm-vs-cold weighted exposure difference (TGF).

    Exposure values are normalised to sum to 1 (when their sum is non-zero). Warm
    items are weighted len(warm), ..., 1 by position; cold items are weighted
    linearly from 1 up to len(warm) by position. The result is the mean weighted
    warm exposure minus the mean weighted cold exposure.

    Args:
        exp_list (list): Exposure value per item, indexed by item ID.
        warm_item_list (list): Warm item IDs (positions into exp_list).
        cold_item_list (list): Cold item IDs (positions into exp_list).

    Returns:
        float: The calculated TGF value; 0 if exp_list is empty. An empty warm or
        cold group contributes 0 to the difference.
    """
    # Return 0 if exp_list is empty
    if len(exp_list) == 0:
        return 0

    # Arrays throughout: the previous list-based arithmetic raised a TypeError
    # (list * list) whenever there was at most one cold item.
    exposure = np.asarray(exp_list, dtype=float)
    total_exp = exposure.sum()
    if total_exp != 0:
        exposure = exposure / total_exp

    # Extract exposure values for warm and cold items
    warm_exp = exposure[np.asarray(list(warm_item_list), dtype=int)]
    cold_exp = exposure[np.asarray(list(cold_item_list), dtype=int)]
    warm_num = len(warm_exp)
    cold_num = len(cold_exp)

    # Weights from warm_num down to 1
    warm_weight = np.arange(warm_num, 0, -1)

    # Weights rising linearly from 1 to warm_num; a single cold item gets weight 1
    if cold_num > 1:
        cold_weight = 1 + np.arange(cold_num) * (warm_num - 1) / (cold_num - 1)
    else:
        cold_weight = np.ones(cold_num)

    # Guard the averages so an empty group yields 0 instead of a division by zero
    warm_part = np.sum(warm_exp * warm_weight) / warm_num if warm_num else 0.0
    cold_part = np.sum(cold_exp * cold_weight) / cold_num if cold_num else 0.0

    return warm_part - cold_part

def calculate_nc(input_set, target_set):
    """
    Calculate the proportion of entries in input_set that appear in target_set.

    Every entry of input_set is counted, so repeated interactions with the same
    item each contribute to the proportion. (Previously the numerator counted
    unique matches while the denominator counted all entries, which under-reported
    the proportion whenever input_set contained duplicates.)

    Args:
        input_set (list): Input values, e.g. one user's interaction history.
        target_set (list): Target values, e.g. the cold item IDs.

    Returns:
        float: Share of input_set entries that are present in target_set;
        0.0 for an empty (or None) input.
    """
    # len() instead of truthiness so numpy arrays are accepted as input as well
    if input_set is None or len(input_set) == 0:
        return 0.0

    # Reuse an existing set; otherwise build one for O(1) membership checks
    targets = target_set if isinstance(target_set, (set, frozenset)) else set(target_set)
    hits = sum(1 for value in input_set if value in targets)
    return hits / len(input_set)

In [54]:
# Load warm and cold item lists from saved files
warm_item_list = np.load(saved_data_path + 'warm_item_list.npy')
cold_item_list = np.load(saved_data_path + 'cold_item_list.npy')

# Convert warm_item_list and cold_item_list to NumPy arrays for efficient operations
warm_item_list = np.array(warm_item_list)
cold_item_list = np.array(cold_item_list)

# Set of cold item IDs for the membership tests in calculate_nc. The name was used
# below without ever being defined, which raised a NameError on a fresh kernel.
cold_item_set = set(cold_item_list.tolist())

# Precompute weights for warm and cold items using the get_weight function
warm_weight, cold_weight = get_weight(warm_item_list, cold_item_list)

# Apply the set_values_by_id function to create an 'exp_list' column in the DataFrame
# This column represents the user's interaction history as a binary list (1 for interacted, 0 for not interacted)
user_rl_df['exp_list'] = user_rl_df['user_history'].apply(lambda x: set_values_by_id(x, config['item_num']))

# Calculate the user's category weight difference (TGF) and store it in the 'hist_tgf' column
user_rl_df['hist_tgf'] = user_rl_df['exp_list'].apply(
    lambda x: get_user_hist_tgf(x, warm_item_list, cold_item_list, warm_weight, cold_weight)
)

# Calculate the proportion of user interactions that are with cold items (NC) and store it in 'rl_nc'
rl_nc = user_rl_df['user_history'].apply(
    lambda x: calculate_nc(x, cold_item_set)
)

# Add the calculated NC values to the DataFrame as a new column 'hist_nc'
user_rl_df['hist_nc'] = rl_nc

In [66]:
# Keep only the per-user history and the two preference statistics (the large
# 'exp_list' indicator vectors are dropped) and write them to CSV.
rl_user_his_pref = user_rl_df[['user', 'user_history', 'hist_tgf', 'hist_nc']]
rl_user_his_pref.to_csv(saved_data_path + 'rl_user_his_pref.csv')

## Get item pop for training baseline Pearson PD

In [10]:
# Item popularity = number of interactions per item in the total training set.
item_pop_train = train_total_set.item.value_counts().reset_index()
# Name the columns explicitly; the names produced by reset_index() differ between pandas versions.
item_pop_train.columns = ['item', 'train_counts']

In [11]:
# Column-wise maxima: the largest item ID seen in training and the highest interaction count.
item_pop_train.max()

item            1994
train_counts    1218
dtype: int64

In [14]:
# Items that have no popularity row yet (i.e. they never occur in the training set)
# get a count of 1, so every ID in range(item_num) has an entry.
# The missing IDs are derived from item_pop_train itself, which makes the cell
# idempotent: re-running it no longer appends the same padding rows again.
add_items = sorted(set(range(config['item_num'])) - set(item_pop_train['item']))
add_list = [[item_id, 1] for item_id in add_items]
add_pd = pd.DataFrame(add_list, columns = ['item', 'train_counts'])
# Skip the concat when nothing is missing so an empty frame cannot alter column dtypes
if not add_pd.empty:
    item_pop_train = pd.concat([item_pop_train, add_pd])
item_pop_train = item_pop_train.reset_index(drop = True)
item_pop_train

Unnamed: 0,item,train_counts
0,376,1218
1,444,1207
2,16,1206
3,303,1196
4,452,1190
...,...,...
3312,3312,1
3313,3313,1
3314,3314,1
3315,3315,1


In [15]:
# Persist the padded item popularity table.
item_pop_train.to_csv(saved_data_path + 'item_pop_train.csv')

In [16]:
# Sanity check: after padding, the largest item ID should be item_num - 1.
item_pop_train.item.max()

3316

## Convert ID for pre-trained content embeddings for training ALDI

In [79]:
# Load the pre-trained content embeddings and their item IDs.
# Both files are read from the current working directory (not from data_path).
# Presumably the i-th ID belongs to the i-th embedding row — the next cell pairs them by position.
kuairec_id = np.load('kuairec_id_list.npy')
kuairec_all_embed = np.load('kuairec_item_content_embedding.npy')

In [83]:
# One plain Python list per embedding so each can be stored in a single DataFrame cell.
item_embeddings_list = [embedding.tolist() for embedding in kuairec_all_embed]

# Create the DataFrame
# NOTE: this rebinds `df`, which until here held the preprocessed interaction table;
# earlier cells that read `df` must not be re-run after this point without re-creating it.
df = pd.DataFrame({
    'item': kuairec_id,
    'item_embedding': item_embeddings_list
})

In [104]:
def convert_id(df, time_dict, id_column='id'):
    """
    Map the IDs in one column of a DataFrame through a dictionary and drop unmapped rows.

    The ID column is parsed as nullable integers (unparseable values become missing),
    rows whose ID is not a key of time_dict are removed, and the remaining IDs are
    replaced by their mapped values.

    Args:
        df (pd.DataFrame): The input DataFrame containing the ID column. It is not modified.
        time_dict (dict): A mapping dictionary in the format {original_id: new_id}.
        id_column (str): The name of the ID column to be converted. Defaults to 'id'.

    Returns:
        pd.DataFrame: A new DataFrame containing only rows with IDs present in time_dict,
                      with the ID column replaced by the mapped IDs.
    """
    # Parse the IDs into a separate Series so the caller's frame is left untouched
    ids = pd.to_numeric(df[id_column], errors='coerce').astype('Int64')

    # Rows whose ID has a mapping; missing IDs never match
    keep = ids.isin(list(time_dict)).astype(bool)

    # Copy the selection so the assignment below writes to an independent frame
    # instead of a slice of df (avoids chained-assignment warnings and ambiguity)
    df_filtered = df.loc[keep].copy()
    df_filtered[id_column] = ids[keep].map(time_dict)

    return df_filtered

In [None]:
# Map the raw item IDs of the embedding table to the preprocessed item IDs.
# The embedding frame names its ID column 'item', so it must be passed explicitly;
# the default id_column='id' raised a KeyError.
df_new = convert_id(df, time_dict, id_column='item')
# Order rows by the new item ID so that row position matches the item ID
df_new = df_new.sort_values('item')

In [112]:
# Collect the embeddings in item-ID order and check the resulting shape
# (the row count should equal the number of items).
kuairec_item_embedding = df_new.item_embedding.to_list()
np.shape(kuairec_item_embedding)

(3317, 384)

In [113]:
# Rebuild the ordered embedding list (repeats the previous cell) and save it.
kuairec_item_embedding = df_new.item_embedding.to_list()
# NOTE: the result of this np.shape call is discarded — it is not the last expression of the cell.
np.shape(kuairec_item_embedding)
np.save(saved_data_path + 'kuairec_item_embedding.npy', kuairec_item_embedding )