In [1]:
import pandas as pd
import numpy as np
from Data_Splitter import build_data, create_user_track_matrix
from Fairness_Metrics import compute_recGap, compute_compounding_factor
from sklearn.metrics.pairwise import cosine_similarity

# Recommendation Functions (Popularity-Based)

In [2]:
def compute_track_popularity(user_track_matrix):
    """
    Rank tracks by their total interaction value.

    Every column of ``user_track_matrix`` is summed over its rows, and the
    per-column totals are returned as a Series ordered from the largest
    total down to the smallest.
    """
    column_totals = user_track_matrix.sum(axis=0)
    ranked_totals = column_totals.sort_values(ascending=False)
    return ranked_totals


def get_recommendations_for_user_pop(user_id, user_track_matrix, track_popularity, top_n=10):
    """
    Recommend the top-N most popular tracks (global popularity) that the user has not interacted with.

    Parameters
    ----------
    user_id : hashable
        Row label of the user in ``user_track_matrix``.
    user_track_matrix : pandas.DataFrame
        Users as rows, tracks as columns; a positive cell marks an interaction.
    track_popularity : pandas.Series
        Series whose index lists tracks in the order they should be offered
        (as produced by ``compute_track_popularity``).
    top_n : int, default 10
        Maximum number of tracks to return.

    Returns
    -------
    list
        Up to ``top_n`` track labels in popularity order. Empty when the user
        is not in the matrix or when ``top_n`` is not positive.
    """
    # A non-positive budget must yield nothing; without this guard the loop
    # below would append one track before it ever checked the length.
    if top_n <= 0 or user_id not in user_track_matrix.index:
        return []

    # Tracks that the user already interacted with. Any positive value counts,
    # so matrices holding play counts (> 1) are handled as well as binary ones.
    user_row = user_track_matrix.loc[user_id]
    user_history = set(user_row[user_row > 0].index)

    recommendations = []
    # Iterate over tracks in order of popularity.
    for track in track_popularity.index:
        if track in user_history:
            continue
        recommendations.append(track)
        if len(recommendations) >= top_n:
            break
    return recommendations

# Evaluation Functions

In [3]:
def ndcg_at_k(relevances, k):
    """
    Compute NDCG@k given a list of binary relevance scores.

    Parameters
    ----------
    relevances : sequence of numbers
        Relevance of each recommended item, in ranked order.
    k : int
        Cut-off rank; only the first ``k`` entries are scored.

    Returns
    -------
    float
        DCG of the first ``k`` entries divided by the DCG of those same
        entries sorted in descending order. 0.0 when ``k`` is not positive,
        when there is nothing to score, or when no entry is relevant.

    Notes
    -----
    The ideal ordering is built from the supplied relevances only, so relevant
    items that are absent from the list do not lower the score.
    """
    # A negative k would otherwise slice from the end of the list.
    if k <= 0:
        return 0.0
    # np.asfarray was removed in NumPy 2.0; asarray(..., dtype=float) is the
    # supported equivalent.
    relevances = np.asarray(relevances, dtype=float)[:k]
    if relevances.size == 0:
        return 0.0
    # Position i (1-based) is discounted by log2(i + 1).
    discounts = np.log2(np.arange(2, relevances.size + 2))
    dcg = np.sum(relevances / discounts)
    ideal_relevances = np.sort(relevances)[::-1]
    idcg = np.sum(ideal_relevances / discounts)
    return dcg / idcg if idcg > 0 else 0.0

def evaluate_ndcg_pop(df, holdout_df, user_track_matrix, track_popularity, top_n=10):
    """
    Score the popularity recommender with NDCG@top_n against a holdout set.

    For every user present in ``holdout_df`` the recommender is queried, each
    recommended track is marked 1 if it is among that user's held-out tracks
    and 0 otherwise, and the resulting list is scored with ``ndcg_at_k``.
    Scores are averaged over all users and separately per ``gender`` value
    looked up in ``df`` (users absent from ``df`` are grouped as 'unknown').

    Prints a short report and returns ``(overall_ndcg, avg_ndcg_by_gender)``.
    """
    # Ground truth: user_id -> set of held-out track_ids.
    held_out = holdout_df.groupby('user_id')['track_id'].apply(set).to_dict()
    gender_of = df.set_index('user_id')['gender'].to_dict()

    per_user = {}
    per_gender = {}

    for uid, truth in held_out.items():
        ranked = get_recommendations_for_user_pop(
            uid, user_track_matrix, track_popularity, top_n=top_n
        )
        hits = [int(item in truth) for item in ranked]
        score = ndcg_at_k(hits, top_n)

        per_user[uid] = score
        group = gender_of.get(uid, 'unknown')
        per_gender.setdefault(group, []).append(score)

    if per_user:
        overall_ndcg = np.mean(list(per_user.values()))
    else:
        overall_ndcg = 0.0

    avg_ndcg_by_gender = {}
    for group, group_scores in per_gender.items():
        avg_ndcg_by_gender[group] = np.mean(group_scores)

    print("\nSet Evaluation:")
    print(f"Overall NDCG@{top_n}: {overall_ndcg:.4f}")
    print("NDCG by gender:", avg_ndcg_by_gender)

    return overall_ndcg, avg_ndcg_by_gender

def evaluate_metrics_pop(df, holdout_df, user_track_matrix, track_popularity, sparse_item_matrix, track_list, top_n=10):
    """
    Evaluate recommendations on a holdout set using Recall, Coverage, and Diversity metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``user_id`` and ``gender`` columns; used to look up each
        user's gender (users not found are grouped as 'unknown').
    holdout_df : pandas.DataFrame
        Must provide ``user_id`` and ``track_id`` columns; the held-out ground truth.
    user_track_matrix, track_popularity
        Passed through to ``get_recommendations_for_user_pop``.
    sparse_item_matrix, track_list
        Passed through to ``compute_diversity_for_list``.
    top_n : int, default 10
        Length of each recommendation list.

    Returns
    -------
    tuple
        ``(overall_recall, overall_coverage, overall_diversity, gender_metrics)``
        where ``gender_metrics`` maps 'recall', 'coverage' and 'diversity' to
        per-gender dictionaries. With an empty holdout set every overall value
        is 0.0 and the per-gender dictionaries are empty.
    """
    user_holdout = holdout_df.groupby('user_id')['track_id'].apply(set).to_dict()
    user_gender = df.set_index('user_id')['gender'].to_dict()

    recall_scores = []
    diversity_scores = []
    recall_by_gender = {}
    diversity_by_gender = {}
    coverage_by_gender = {}  # gender -> set of every track recommended to that group

    for user, true_items in user_holdout.items():
        recs = get_recommendations_for_user_pop(user, user_track_matrix, track_popularity, top_n=top_n)

        # Recall@top_n: share of the user's held-out tracks that were recommended.
        recall = len(set(recs).intersection(true_items)) / len(true_items) if true_items else 0.0

        # Diversity uses the sparse item representations and cosine similarity.
        diversity = compute_diversity_for_list(recs, sparse_item_matrix, track_list)

        # Record the per-user values and group them by gender in the same pass.
        gender = user_gender.get(user, 'unknown')
        recall_scores.append(recall)
        diversity_scores.append(diversity)
        recall_by_gender.setdefault(gender, []).append(recall)
        diversity_by_gender.setdefault(gender, []).append(diversity)
        coverage_by_gender.setdefault(gender, set()).update(recs)

    # Guard the empty case (as evaluate_ndcg_pop does) so that an empty holdout
    # yields 0.0 rather than NaN plus a RuntimeWarning from np.mean([]).
    overall_recall = np.mean(recall_scores) if recall_scores else 0.0
    overall_diversity = np.mean(diversity_scores) if diversity_scores else 0.0

    # Coverage is measured against the number of tracks in track_popularity;
    # an empty catalogue gives 0.0 instead of a ZeroDivisionError.
    catalog_size = len(track_popularity)
    all_recommended = set().union(*coverage_by_gender.values())
    overall_coverage = len(all_recommended) / catalog_size if catalog_size else 0.0
    coverage_metrics_by_gender = {
        g: (len(rec_set) / catalog_size if catalog_size else 0.0)
        for g, rec_set in coverage_by_gender.items()
    }

    avg_recall_by_gender = {g: np.mean(scores) for g, scores in recall_by_gender.items()}
    avg_diversity_by_gender = {g: np.mean(scores) for g, scores in diversity_by_gender.items()}

    print("\nEvaluation Metrics @ {}:".format(top_n))
    print("\nOverall Recall: {:.4f}".format(overall_recall))
    print("Recall by gender:", avg_recall_by_gender)

    print("\nOverall Coverage: {:.4f}".format(overall_coverage))
    print("Coverage by gender:", coverage_metrics_by_gender)

    print("\nOverall Diversity: {:.4f}".format(overall_diversity))
    print("Diversity by gender:", avg_diversity_by_gender)

    gender_metrics = {
        'recall': avg_recall_by_gender,
        'coverage': coverage_metrics_by_gender,
        'diversity': avg_diversity_by_gender
    }

    return overall_recall, overall_coverage, overall_diversity, gender_metrics

def compute_diversity_for_list(recommended_tracks, sparse_item_matrix, track_list):
    """
    Compute intra-list diversity as the average pairwise dissimilarity (1 - cosine similarity)
    among recommended tracks.

    Parameters
    ----------
    recommended_tracks : sequence
        Track labels in the recommendation list.
    sparse_item_matrix : matrix-like
        Item vectors; row ``i`` is taken to describe ``track_list[i]``.
    track_list : list
        Track labels in the row order of ``sparse_item_matrix``.

    Returns
    -------
    float
        ``1 - mean(cosine similarity)`` over all unordered pairs of
        recommended tracks that are present in ``track_list``. 0.0 when fewer
        than two such tracks exist, because there is no pair to compare.
    """
    if len(recommended_tracks) < 2:
        return 0.0

    # Get indices for the recommended tracks; tracks unknown to track_list are skipped.
    indices = [track_list.index(t) for t in recommended_tracks if t in track_list]
    # With a single resolvable track there is no pair either. Without this
    # check the pair count would be 0 and the function would report 1.0.
    if len(indices) < 2:
        return 0.0

    # Extract item vectors using the sparse matrix.
    vectors = sparse_item_matrix[indices]

    # Compute pairwise cosine similarity.
    sim_matrix = cosine_similarity(vectors)

    # Average the strict upper triangle, i.e. each unordered pair exactly once.
    upper_pairs = np.triu_indices(len(indices), k=1)
    avg_similarity = sim_matrix[upper_pairs].mean()
    return 1 - avg_similarity

def recGap_CF_results(df, gender_metrics):
    """
    Report the fairness measures for every metric in ``gender_metrics``.

    ``gender_metrics`` maps a metric name to its per-gender values. For each
    entry a heading is printed, then ``compute_recGap`` is called with the
    per-gender values and ``compute_compounding_factor`` with ``df`` and the
    same values.
    """
    for metric_name in gender_metrics:
        per_gender_values = gender_metrics[metric_name]
        print(f"\nFor the {metric_name} metric")
        compute_recGap(per_gender_values)
        compute_compounding_factor(df, per_gender_values)

# Main Functions

In [4]:
def Evaluate_pop(user_track_matrix, sparse_item_matrix, track_list, track_popularity, df, df_val_holdout, df_test_holdout, top_n=10):
    """
    Run every evaluation of the popularity recommender and print the results.

    NDCG is reported on both the validation and the test holdout; recall,
    coverage and diversity are reported on the test holdout only. The test-set
    per-gender NDCG is added to the other per-gender metrics under the key
    'ndcg', and the combined dictionary is handed to ``recGap_CF_results``.

    Parameters
    ----------
    user_track_matrix, sparse_item_matrix, track_list, track_popularity
        Model artefacts forwarded to the evaluation functions.
    df : pandas.DataFrame
        Full dataset, used for gender lookup and by the fairness measures.
    df_val_holdout, df_test_holdout : pandas.DataFrame
        Held-out interactions for validation and test.
    top_n : int, default 10
        Recommendation list length used for every metric (previously fixed at 10).
    """
    # Validation NDCG is only printed by evaluate_ndcg_pop; its return value is not needed here.
    evaluate_ndcg_pop(df, df_val_holdout, user_track_matrix, track_popularity, top_n=top_n)
    _, ndcg_by_gender_test = evaluate_ndcg_pop(
        df, df_test_holdout, user_track_matrix, track_popularity, top_n=top_n)

    # Evaluate other metrics on the test set; only the per-gender breakdown is used below.
    _, _, _, gender_metrics = evaluate_metrics_pop(
        df, df_test_holdout, user_track_matrix, track_popularity, sparse_item_matrix, track_list, top_n=top_n)

    gender_metrics['ndcg'] = ndcg_by_gender_test
    print("\nAggregated Gender Metrics:", gender_metrics)
    recGap_CF_results(df, gender_metrics)

def build_and_evaluate_pop(df):
    """
    Run the popularity recommender end to end on one dataset.

    ``build_data`` splits ``df`` into a training part plus validation and test
    holdouts, ``create_user_track_matrix`` turns the training part into the
    interaction matrix, item matrix and track list, track popularity is
    derived from the interaction matrix, and ``Evaluate_pop`` prints all metrics.
    """
    train_part, val_part, test_part = build_data(df)

    # Only the training part feeds the model artefacts.
    interactions, item_vectors, tracks = create_user_track_matrix(train_part)

    Evaluate_pop(
        user_track_matrix=interactions,
        sparse_item_matrix=item_vectors,
        track_list=tracks,
        track_popularity=compute_track_popularity(interactions),
        df=df,
        df_val_holdout=val_part,
        df_test_holdout=test_part,
    )

# Running the algorithm

In [5]:
# Load the three dataset variants that are evaluated below. Paths are relative
# to the notebook's working directory; the first row of each file is the header.
# Base file, without a suffix.
df = pd.read_csv('data/LFM-1b-DemoBiasSub-10k.csv', header=0)
# "-SMOTE" variant — presumably SMOTE-oversampled; confirm against the data preparation step.
df_SMOTE = pd.read_csv('data/LFM-1b-DemoBiasSub-10k-SMOTE.csv', header=0)
# "-Resampled" variant — the recorded run below reports a 0.5 / 0.5 male/female distribution for it.
df_resampled = pd.read_csv('data/LFM-1b-DemoBiasSub-10k-Resampled.csv', header=0)

In [6]:
# Evaluate the popularity recommender on the base dataset `df`.
build_and_evaluate_pop(df)


Set Evaluation:
Overall NDCG@10: 0.0534
NDCG by gender: {'m': 0.04757433285532539, 'f': 0.06948440699081211}

Set Evaluation:
Overall NDCG@10: 0.0517
NDCG by gender: {'m': 0.04676159557182249, 'f': 0.06512431843008354}

Evaluation Metrics @ 10:

Overall Recall: 0.0506
Recall by gender: {'m': 0.04404866594119496, 'f': 0.06854091957992486}

Overall Coverage: 0.0017
Coverage by gender: {'m': 0.0016, 'f': 0.0017}

Overall Diversity: 0.9269
Diversity by gender: {'m': 0.9268554270548213, 'f': 0.9271297065656413}

Aggregated Gender Metrics: {'recall': {'m': 0.04404866594119496, 'f': 0.06854091957992486}, 'coverage': {'m': 0.0016, 'f': 0.0017}, 'diversity': {'m': 0.9268554270548213, 'f': 0.9271297065656413}, 'ndcg': {'m': 0.04676159557182249, 'f': 0.06512431843008354}}

For the recall metric
RecGap Score: 0.0244922536387299 

Original data distribution:
Males: 0.7575144921328422
Females: 0.24248550786715783

Metric score distribution:
Males: 0.6675144983401606
Females: 0.33248550165983937

Co

In [31]:
# Evaluate the popularity recommender on the resampled dataset.
# NOTE(review): this cell's recorded execution count (31) is out of sequence with
# its neighbours; re-run the notebook top to bottom so the stored outputs are comparable.
build_and_evaluate_pop(df_resampled)


Set Evaluation:
Overall NDCG@10: 0.0409
NDCG by gender: {'m': 0.05287491089839556, 'f': 0.011427286580498248}

Set Evaluation:
Overall NDCG@10: 0.0337
NDCG by gender: {'m': 0.04326116044305975, 'f': 0.009130876132999616}

Evaluation Metrics @ 10:

Overall Recall: 0.0342
Recall by gender: {'m': 0.04589248731300294, 'f': 0.003937465273216622}

Overall Coverage: 0.0018
Coverage by gender: {'m': 0.0018, 'f': 0.0016}

Overall Diversity: 0.9249
Diversity by gender: {'m': 0.9248248019885202, 'f': 0.9249955221462198}

Aggregated Gender Metrics: {'recall': {'m': 0.04589248731300294, 'f': 0.003937465273216622}, 'coverage': {'m': 0.0018, 'f': 0.0016}, 'diversity': {'m': 0.9248248019885202, 'f': 0.9249955221462198}, 'ndcg': {'m': 0.04326116044305975, 'f': 0.009130876132999616}}

For the recall metric
RecGap Score: 0.041955022039786315 

Original data distribution:
Males: 0.5
Females: 0.5

Metric score distribution:
Males: 0.9209819582628797
Females: 0.07901804173712028

Compounding Factor (KL Div

In [8]:
# Evaluate the popularity recommender on the SMOTE dataset.
# NOTE(review): the recorded output contains a `nan` gender group, i.e. some users
# in this file have no gender value — confirm whether that is intended.
build_and_evaluate_pop(df_SMOTE)


Set Evaluation:
Overall NDCG@10: 0.0517
NDCG by gender: {'m': 0.04838376903446011, 'f': 0.058016290029810666, nan: 0.04483920730746311}

Set Evaluation:
Overall NDCG@10: 0.0531
NDCG by gender: {'m': 0.047272149416857055, 'f': 0.06072342054679076, nan: 0.13250983570755606}

Evaluation Metrics @ 10:

Overall Recall: 0.0500
Recall by gender: {'m': 0.046319738373610427, 'f': 0.056149161193642334, nan: 0.06536078717201166}

Overall Coverage: 0.0016
Coverage by gender: {'m': 0.0016, 'f': 0.0016, nan: 0.0015}

Overall Diversity: 0.9267
Diversity by gender: {'m': 0.9266350563529345, 'f': 0.9268319943942741, nan: 0.9278840605144505}

Aggregated Gender Metrics: {'recall': {'m': 0.046319738373610427, 'f': 0.056149161193642334, nan: 0.06536078717201166}, 'coverage': {'m': 0.0016, 'f': 0.0016, nan: 0.0015}, 'diversity': {'m': 0.9266350563529345, 'f': 0.9268319943942741, nan: 0.9278840605144505}, 'ndcg': {'m': 0.047272149416857055, 'f': 0.06072342054679076, nan: 0.13250983570755606}}

For the recal