In [15]:
# Apply fairness-aware data re-label (inspired by Kamiran et al.) on lastfm-nl dataset for RecBole
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

# 1. Load the user profile table, the labelled training interactions and the raw play counts.
#    Columns used below: user_id:token + gender:token (user file),
#    user_id:token + artist_id:token + label:float (train split),
#    user_id:token + artist_id:token + plays:float (full interaction file).
user_df = pd.read_csv('../Post-processing/datasets/lastfm-nl/lastfm-nl.user', sep='\t')
inter_df_label = pd.read_csv('../datasets/split_datasets/lastfm-nl/lastfm-nl.train.inter', sep='\t')
inter_df_plays = pd.read_csv('../Post-processing/datasets/lastfm-nl/lastfm-nl.inter', sep='\t')

# 2. Attach the play count to every training interaction.
#    validate='many_to_one' makes pandas raise a MergeError if the plays file holds
#    more than one row for a (user, artist) pair; without it the left join would
#    silently duplicate training rows, and those rows are written back to disk later.
pair_keys = ['user_id:token', 'artist_id:token']
inter_df = inter_df_label.merge(
    inter_df_plays[pair_keys + ['plays:float']],
    how='left',
    on=pair_keys,
    validate='many_to_one',
)

# 3. Attach each user's gender (one profile row per user is required, same reasoning
#    as above) and expose it under the shorter column name `gender`.
merged_df = inter_df.merge(
    user_df[['user_id:token', 'gender:token']],
    on='user_id:token',
    how='left',
    validate='many_to_one',
).rename(columns={'gender:token': 'gender'})

display(merged_df.head())

Unnamed: 0,user_id:token,artist_id:token,label:float,plays:float,gender
0,7db20cb306f5c6ee0a7da4b2eabbb12f80a4577d,c7e90641-f441-4801-8e4a-d09e10f452b8,1.0,166,M
1,7db20cb306f5c6ee0a7da4b2eabbb12f80a4577d,41489644-58f8-47e7-a581-e24d5659baeb,1.0,458,M
2,7db20cb306f5c6ee0a7da4b2eabbb12f80a4577d,609e7afd-3552-4102-9501-7611858ea320,1.0,176,M
3,7db20cb306f5c6ee0a7da4b2eabbb12f80a4577d,5251b5a0-3e3b-4d07-a152-585009575310,1.0,193,M
4,7db20cb306f5c6ee0a7da4b2eabbb12f80a4577d,5f6ab597-f57a-40da-be9e-adad48708203,1.0,402,M


In [16]:
# Number of distinct users in each gender group.
# Rows are first reduced to one per (user, gender) pair so that users with many
# interactions are not counted more than once.
user_gender_map = merged_df.drop_duplicates(subset=['user_id:token', 'gender'])[
    ['user_id:token', 'gender']
]
users_per_gender = user_gender_map['gender'].value_counts()
gender_total = users_per_gender.to_dict()
display(gender_total)

{'M': 7059, 'F': 1733}

In [17]:
# Interaction volume: overall, then split by gender.
# Note: `label_counts` holds the number of ALL interactions per gender
# (both label values), not only the label == 1 ones.
print(f"Total interactions: {len(merged_df)}")
interactions_per_gender = merged_df['gender'].value_counts()
label_counts = interactions_per_gender.to_dict()
display(label_counts)

# Share of positive labels over the whole training set; used as the common
# target ratio for every gender group.
label_column = merged_df['label:float']
target_ratio = label_column.mean()
print(f"\nGlobal target positive ratio (label==1): {target_ratio:.4f}")

Total interactions: 356414


{'M': 287033, 'F': 69381}


Global target positive ratio (label==1): 0.9950


In [18]:
# Current number of positive (label == 1) interactions per gender.
label_1_counts = merged_df[merged_df['label:float'] == 1]['gender'].value_counts().to_dict()
current_male_1 = label_1_counts.get('M', 0)
current_female_1 = label_1_counts.get('F', 0)

# Target number of positives per gender = global positive ratio * group size.
# A gender that is absent from the data has size 0 (the previous default of 1
# could produce a target of 1 for a group with no rows).
target_male_1 = int(target_ratio * label_counts.get('M', 0))
target_female_1 = int(target_ratio * label_counts.get('F', 0))

# The relabeling step only ever demotes male positives (1 -> 0) and promotes
# female negatives (0 -> 1). The counts are therefore one-sided differences
# clamped at 0: taking abs() would hide the direction of the gap and, if the
# imbalance were reversed, flip labels the wrong way and widen it.
flip_male_1_to_0 = max(current_male_1 - target_male_1, 0)
flip_female_0_to_1 = max(target_female_1 - current_female_1, 0)
print(f"  Target male label=1 count: {target_male_1}, current: {current_male_1}, {flip_male_1_to_0} to flip")
print(f"  Target female label=1 count: {target_female_1}, current: {current_female_1}, {flip_female_0_to_1} to flip")

# Make the unsupported direction visible instead of silently doing nothing.
if current_male_1 < target_male_1 or current_female_1 > target_female_1:
    print("  Note: imbalance direction is not male-over / female-under target; no labels are flipped for that group")

  Target male label=1 count: 285604, current: 285760, 156 to flip
  Target female label=1 count: 69035, current: 68880, 155 to flip


In [19]:
# Apply relabeling
import numpy as np

# Fixed seed so the randomly promoted female interactions (and therefore the
# saved training file) are identical on every run. 42 matches the seed value
# printed in the RecBole configuration log of this notebook.
RELABEL_SEED = 42
relabel_rng = np.random.default_rng(RELABEL_SEED)

# Work on a copy so `merged_df` keeps the original labels for the
# before/after comparison.
balanced_df = merged_df.copy()

# --- Flip male label=1 -> 0: demote the positives with the lowest play counts ---
male_label1 = balanced_df[(balanced_df['gender'] == 'M') & (balanced_df['label:float'] == 1)]
lowest_play_males = male_label1.nsmallest(flip_male_1_to_0, 'plays:float').index
balanced_df.loc[lowest_play_males, 'label:float'] = 0

# --- Flip female label=0 -> 1: promote a seeded random sample of negatives ---
# replace=False guarantees distinct rows; it raises if more flips are requested
# than there are female label=0 rows.
female_label0 = balanced_df[(balanced_df['gender'] == 'F') & (balanced_df['label:float'] == 0)]
flip_indices_female = relabel_rng.choice(
    female_label0.index.to_numpy(),
    size=flip_female_0_to_1,
    replace=False,
)
balanced_df.loc[flip_indices_female, 'label:float'] = 1

In [21]:
def _label_share_by_gender(frame):
    """Return a gender x label table holding each label's share within its gender group."""
    labels_by_gender = frame.groupby('gender')['label:float']
    return labels_by_gender.value_counts(normalize=True).unstack()


# Label distribution per gender before relabeling ...
before_ratio = _label_share_by_gender(merged_df)
print(before_ratio)

# ... and after relabeling.
result = _label_share_by_gender(balanced_df)
print("\nFinal label==1 ratio by gender:")
print(result)

label:float       0.0       1.0
gender                         
F            0.007221  0.992779
M            0.004435  0.995565

Final label==1 ratio by gender:
label:float       0.0       1.0
gender                         
F            0.004987  0.995013
M            0.004979  0.995021


In [22]:
# Write the relabeled training interactions to disk as a tab-separated file.
import os

save_path = 'datasets/lastfm-nl/lastfm-nl.train.inter'
target_dir = os.path.dirname(save_path)
os.makedirs(target_dir, exist_ok=True)

# Save only the three interaction columns; the helper columns added for the
# relabeling (plays, gender) are left out.
export_columns = ['user_id:token', 'artist_id:token', 'label:float']
export_df = balanced_df[export_columns]
export_df.to_csv(save_path, sep='\t', index=False)
print(f"✅ Fair resampled dataset saved to {save_path}")

✅ Fair resampled dataset saved to datasets/lastfm-nl/lastfm-nl.train.inter


In [23]:
import shutil

# The validation and test splits are left untouched: copy them next to the
# relabeled training file.
dst_dir = 'datasets/lastfm-nl'
os.makedirs(dst_dir, exist_ok=True)

# (source file, file name inside dst_dir), processed in this order
split_files = [
    ('../datasets/split_datasets/lastfm-nl/lastfm-nl.valid.inter', 'lastfm-nl.valid.inter'),
    ('../datasets/split_datasets/lastfm-nl/lastfm-nl.test.inter', 'lastfm-nl.test.inter'),
]

for src_path, file_name in split_files:
    dst_path = os.path.join(dst_dir, file_name)

    # Copy the file
    shutil.copyfile(src_path, dst_path)

    print(f"✅ File copied to {dst_path}")

✅ File copied to datasets/lastfm-nl/lastfm-nl.valid.inter
✅ File copied to datasets/lastfm-nl/lastfm-nl.test.inter


In [24]:
from recbole.quick_start import run_recbole

# Launch RecBole with the settings from the relabel YAML file.
# The call is the last expression of the cell, so its return value is displayed.
relabel_config_files = ['lastfm-nl-relabel.yaml']
run_recbole(config_file_list=relabel_config_files)

10 Jun 12:27    INFO  ['/Users/huangjiaqing/Desktop/Recommender Systems/RScode/.venv/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/Users/huangjiaqing/Library/Jupyter/runtime/kernel-930ab081-4f58-43e6-9f24-0c52d7adf2d5.json']
10 Jun 12:27    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 42
state = INFO
reproducibility = True
data_path = datasets/lastfm-nl
checkpoint_dir = checkpoint_saved/lastfm-nl/
show_progress = False
save_dataset = True
dataset_save_path = None
save_dataloaders = True
dataloaders_save_path = None
log_wandb = True

Training Hyper Parameters:
epochs = 100
train_batch_size = 1024
learner = adam
learning_rate = 0.0005
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [0.8, 0.1, 0.1]}, 'order': 'RO', 'group_by': '

10 Jun 12:27    INFO  epoch 0 training [time: 3.13s, train loss: 241.3529]
10 Jun 12:28    INFO  epoch 0 evaluating [time: 11.26s, valid_score: 0.030500]
10 Jun 12:28    INFO  valid result: 
precision@10 : 0.0164    recall@10 : 0.0373    hit@10 : 0.1512    ndcg@10 : 0.0305    itemcoverage@10 : 0.0545    averagepopularity@10 : 1185.0535    shannonentropy@10 : 0.0016    giniindex@10 : 0.9983    tailpercentage@10 : 0.0029
10 Jun 12:28    INFO  Saving current: checkpoint_saved/lastfm-nl/BPR-Jun-10-2025_12-27-51.pth
10 Jun 12:28    INFO  epoch 1 training [time: 2.75s, train loss: 200.0739]
10 Jun 12:28    INFO  epoch 1 evaluating [time: 12.54s, valid_score: 0.029400]
10 Jun 12:28    INFO  valid result: 
precision@10 : 0.0156    recall@10 : 0.0353    hit@10 : 0.1424    ndcg@10 : 0.0294    itemcoverage@10 : 0.0094    averagepopularity@10 : 1242.647    shannonentropy@10 : 0.0072    giniindex@10 : 0.9997    tailpercentage@10 : 0.0007
10 Jun 12:28    INFO  epoch 2 training [time: 2.85s, train lo

{'best_valid_score': 0.0623,
 'valid_score_bigger': True,
 'best_valid_result': OrderedDict([('precision@10', 0.0348),
              ('recall@10', 0.0792),
              ('hit@10', 0.3007),
              ('ndcg@10', 0.0623),
              ('itemcoverage@10', 0.144),
              ('averagepopularity@10', 599.6047),
              ('shannonentropy@10', 0.0011),
              ('giniindex@10', 0.9846),
              ('tailpercentage@10', 0.0081)]),
 'test_result': OrderedDict([('precision@10', 0.0345),
              ('recall@10', 0.0785),
              ('hit@10', 0.2979),
              ('ndcg@10', 0.0613),
              ('itemcoverage@10', 0.144),
              ('averagepopularity@10', 599.6047),
              ('shannonentropy@10', 0.0011),
              ('giniindex@10', 0.9846),
              ('tailpercentage@10', 0.0081)])}

In [1]:
import os

from recbole.quick_start import load_data_and_model
from recbole.utils.case_study import full_sort_topk
from tqdm import tqdm
import pandas as pd
import numpy as np

# 1. Load the saved model checkpoint together with its dataset and data loaders
config, model, dataset, train_data, valid_data, test_data = load_data_and_model(
    model_file='checkpoint_saved/lastfm-nl/BPR-Jun-10-2025_12-27-51.pth'
)

# Step 1: Get all internal user IDs
all_uids = list(range(dataset.user_num))

# Step 2: Keep only the users for which the test loader holds a history entry
# (entries that are None are skipped)
valid_uids = [uid for uid in tqdm(all_uids) if test_data.uid2history_item[uid] is not None]

# Step 3: Convert to a NumPy array
uid_series = np.array(valid_uids)

# Step 4: Run full_sort_topk to get the 10 highest-scored items per user
topk_scores, topk_index = full_sort_topk(uid_series, model, test_data, k=10, device=config['device'])

# Step 5: Convert internal item/user IDs to external tokens
external_item_lists = [dataset.id2token(dataset.iid_field, row.cpu().tolist()) for row in topk_index]
external_user_list = [dataset.id2token(dataset.uid_field, [uid])[0] for uid in uid_series]
score_lists = [row.cpu().tolist() for row in topk_scores]

# Load gender information; user ids are cast to str so they can be looked up
# by the external user tokens
user_df = pd.read_csv('../Post-processing/datasets/lastfm-nl/lastfm-nl.user', sep='\t')
user_df['user_id:token'] = user_df['user_id:token'].astype(str)
user2gender = dict(zip(user_df['user_id:token'], user_df['gender:token']))

# Step 6: Build one row per user; items and scores are stored as
# comma-separated strings, users without a profile entry get gender 'unknown'
df = pd.DataFrame({
    'user_id': external_user_list,
    'gender': [user2gender.get(uid, 'unknown') for uid in external_user_list],
    'topk_items': [','.join(items) for items in external_item_lists],
    'topk_scores': [','.join([f'{s:.4f}' for s in scores]) for scores in score_lists]
})
display(df.head())
print(df.nunique())

# Create the output directory first (as the save/copy cells of this notebook do),
# so the export does not fail when the folder is missing.
output_path = '../rank_results/lastfm-nl/lastfm_top10_relabel.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print("save lastfm_top10_relabel successfully")

10 Jun 17:13    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 42
state = INFO
reproducibility = True
data_path = datasets/lastfm-nl
checkpoint_dir = checkpoint_saved/lastfm-nl/
show_progress = False
save_dataset = True
dataset_save_path = None
save_dataloaders = True
dataloaders_save_path = None
log_wandb = True

Training Hyper Parameters:
epochs = 100
train_batch_size = 1024
learner = adam
learning_rate = 0.0005
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [0.8, 0.1, 0.1]}, 'order': 'RO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}
repeatable = True
metrics = ['Precision', 'Recall', 'Hit', 'NDCG', 'ItemCoverage', 'AveragePopularity', 'ShannonEntropy', 'GiniIndex', 'TailPercentage']
topk = [10]
valid_metric = NDCG@10
va

Unnamed: 0,user_id,gender,topk_items,topk_scores
0,7db20cb306f5c6ee0a7da4b2eabbb12f80a4577d,M,"cfbc0924-0035-4d6c-8197-f024653af823,f82bcf78-...","9.9120,9.6598,9.6416,9.3764,9.3369,9.2641,8.89..."
1,715e75296006113fce5cb9a9a422228268d70a34,M,"73e5e69d-3554-40d8-8516-00cb38737a1c,13655113-...","9.8596,9.1807,9.0020,8.8318,8.7293,8.6927,8.63..."
2,54df0a3d2814018201eae050559617b7c15860ab,M,"d96be5ad-ad54-47f4-8676-f1771026d120,4ccbc3e7-...","8.7793,8.5365,8.4358,8.3247,8.1751,8.1424,8.06..."
3,5159f24173971f05fec1f19c9e1d08085de58c1e,F,"f82f3a3e-29c2-42ca-b589-bc5dc210fa9e,ada7a83c-...","8.2267,7.3153,7.1840,7.1734,7.1319,7.0165,6.99..."
4,0c28bcd3fa8cff593e6f8bea474e606233e23476,M,"cc197bad-dc9c-440d-a5b5-d52ba2e14234,b10bbbfc-...","8.3316,8.2216,8.0767,7.9532,7.8842,7.7576,7.52..."


user_id        8792
gender            2
topk_items     8791
topk_scores    8792
dtype: int64
save lastfm_top10_relabel successfully
