In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply FA*IR post-processing re-ranking to improve group fairness in top-K recommendations
# === Step 1: Load training interactions to calculate item frequency ===
train_inter_df = pd.read_csv('../datasets/split_datasets/ml-1m/ml-1m.train.inter', sep='\t')

# Count item frequency in training data
item_freq = train_inter_df['item_id:token'].value_counts()
item_freq.index = item_freq.index.astype(str)
print(f"Total unique items in training: {len(item_freq)}")

# Sort items by frequency (high to low)
item_freq_sorted = item_freq.sort_values(ascending=False)

# Define long-tail cutoff (e.g., bottom 20%)
tail_ratio = 0.2
tail_cutoff_index = int((1 - tail_ratio) * len(item_freq_sorted))
tail_item_ids = set(item_freq_sorted.index[tail_cutoff_index:])

print(f"Defined long-tail as the bottom {tail_ratio*100}%")
print(f"Number of long-tail items: {len(tail_item_ids)}")

Total unique items in training: 3683
Defined long-tail as the bottom 20.0%
Number of long-tail items: 737


In [15]:
# === Step 2: Define helper to detect whether item is long-tail ===
def is_protected_item(item_id):
    return item_id in tail_item_ids

In [16]:
# === Step 3: Load top-K recommendation results ===
topk_df = pd.read_csv('outputs/ml_all_user_top10.csv')  # Adjust path
user_df = pd.read_csv('../datasets/atomic_datasets/ml-1m/ml-1m.user', sep='\t')
user2gender = dict(zip(user_df['user_id:token'], user_df['gender:token']))

print(f"Loaded Top-K recommendations for {len(topk_df)} users")

Loaded Top-K recommendations for 6040 users


In [17]:
print("Top-k sample items:", topk_df['topk_items'].iloc[0])
print("Sample training item IDs:", train_inter_df['item_id:token'].astype(str).unique()[:10])

Top-k sample items: 260,1196,1210,2858,1270,1198,608,2571,593,1617
Sample training item IDs: ['3949' '1084' '1805' '3897' '3741' '39' '3911' '1061' '1197' '1285']


In [18]:
# === Step 4: Apply FA*IR-like re-ranking to increase long-tail item exposure ===
reranked_data = []
K = 10
desired_tail_ratio = 0.4  # e.g., at least 40% of items in Top-K are from the tail

total_long_tail_before = 0
total_long_tail_after = 0

for _, row in topk_df.iterrows():
    user_id = row['user_id']
    gender = user2gender.get(user_id, 'M')  # optional, not used for logic here
    item_ids = [int(i) for i in str(row['topk_items']).split(',')]

    # Count original number of long-tail items
    original_tail = sum(is_protected_item(i) for i in item_ids)
    total_long_tail_before += original_tail

    # Split into long-tail and non-long-tail
    tail_items = [i for i in item_ids if is_protected_item(i)]
    head_items = [i for i in item_ids if not is_protected_item(i)]

    # Re-rank: insert tail items first to meet desired ratio
    n_tail_needed = int(desired_tail_ratio * K)
    new_ranking = tail_items[:n_tail_needed] + head_items[:K - len(tail_items[:n_tail_needed])]
    new_ranking = new_ranking[:K]  # Ensure exactly K items

    total_long_tail_after += sum(is_protected_item(i) for i in new_ranking)

    reranked_data.append({
        'user_id': user_id,
        'gender': gender,
        'topk_items': ','.join(str(i) for i in new_ranking)
    })

In [22]:
# === Step 5: Save and report results ===
fair_topk_df = pd.DataFrame(reranked_data)
fair_topk_df.to_csv('outputs/ml_all_user_top10_rerank.csv', index=False)

print("\n=== Re-ranking Summary ===")
print(f"Total users processed: {len(topk_df)}")
print(f"Average long-tail items BEFORE re-ranking: {total_long_tail_before / len(topk_df):.4f}")
print(f"Average long-tail items AFTER  re-ranking: {total_long_tail_after / len(topk_df):.4f}")
print("Re-ranked results saved to: outputs/ml_all_user_top10_rerank.csv")


=== Re-ranking Summary ===
Total users processed: 6040
Average long-tail items BEFORE re-ranking: 0.0000
Average long-tail items AFTER  re-ranking: 0.0000
Re-ranked results saved to: outputs/ml_all_user_top10_rerank.csv


In [20]:
print(f"Tail item IDs (sample): {list(tail_item_ids)[:10]}")
print(f"Lowest item frequency in tail: {item_freq_sorted.iloc[tail_cutoff_index]}")

Tail item IDs (sample): ['375', '644', '2415', '2742', '392', '3866', '3373', '3001', '2221', '823']
Lowest item frequency in tail: 19


In [21]:
all_topk_items = set()
for items in topk_df['topk_items']:
    all_topk_items.update(str(i) for i in str(items).split(','))

overlap = all_topk_items.intersection(tail_item_ids)
print(f"Overlap between top-K and long-tail items: {len(overlap)}")

Overlap between top-K and long-tail items: 24
