# Neccessary Imports

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed
from tqdm import tqdm
from Data_Splitter import build_data, create_user_track_matrix
from Fairness_Metrics import compute_recGap, compute_compounding_factor

# SLIM Model Training

In [4]:
def train_item(j, X, sample_weights, alpha, l1_ratio, max_iter, tol):
    """
    Helper function to train ElasticNet for one item (column).
    This function excludes item j from X and fits a model to predict column j.
    Returns the weight vector of length n_items with zero inserted at index j.
    """
    X_others = np.delete(X, j, axis=1)   # predictors (all items except j)
    y = X[:, j]                         # target vector for item j
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=False,
                        positive=True, max_iter=max_iter, tol=tol)
    model.fit(X_others, y, sample_weight=sample_weights)
    coefs = model.coef_
    n_items = X.shape[1]
    w_j = np.zeros(n_items)
    w_j[:j] = coefs[:j]
    w_j[j] = 0  # enforce zero diagonal
    w_j[j+1:] = coefs[j:]
    return w_j

def train_slim(user_track_matrix, user_gender_map, female_weight=1.0,
                alpha=1e-3, l1_ratio=0.01, max_iter=1000, tol=1e-4):
    """
    Trains a SLIM model using ElasticNet per item with gender-aware sample weighting.
    This version parallelizes training over items.

    Parameters:
        user_track_matrix : DataFrame (rows: users, columns: items)
        user_gender_map   : Dictionary mapping user_id to gender
        female_weight     : Multiplier for female users' sample weights
        alpha             : Regularization strength
        l1_ratio          : Balance between L1 and L2 penalty
        max_iter, tol     : Solver settings

    Returns:
        W : Learned SLIM weight matrix (n_items x n_items) with zeros on the diagonal.
    """
    # Convert the interaction matrix to a dense NumPy array.
    X = user_track_matrix.values.astype(np.float32)
    n_users, n_items = X.shape

    # Build sample weights: female users receive a higher weight.
    sample_weights = np.array([
        female_weight if user_gender_map.get(user, 'm') == 'f' else 1.0
        for user in user_track_matrix.index
    ])

    print("Unique sample weights:", np.unique(sample_weights))


    # Parallelize training over each item (each column in X).
    results = Parallel(n_jobs=-1)(
        delayed(train_item)(j, X, sample_weights, alpha, l1_ratio, max_iter, tol)
        for j in tqdm(range(n_items), desc="Training items")
    )

    # Assemble the individual weight vectors into the weight matrix (columns correspond to items).
    W = np.column_stack(results)
    return W

# Helper Functions

In [5]:
def get_recommendations_for_user_slim(W, user_id, user_track_matrix, track_list, top_n=10):
    """
    For a given user, generate recommendations using the SLIM weight matrix.
    The prediction scores for a user are computed as:
        scores = user_vector dot W
    Items the user already interacted with are excluded from the recommendation list.

    Parameters:
        W                : SLIM weight matrix (n_items x n_items)
        user_id          : user identifier
        user_track_matrix: pandas DataFrame (users x items)
        track_list       : list of track IDs corresponding to the columns of user_track_matrix/W
        top_n            : number of recommendations to return

    Returns:
        recommended_items: list of recommended track IDs
    """
    if user_id not in user_track_matrix.index:
        return []

    # Get the binary interaction vector for the user.
    user_vector = user_track_matrix.loc[user_id].values.astype(float)   # shape: (n_items,)
    scores = np.dot(user_vector, W)  # predicted scores for each item

    # Exclude items already seen by setting their score to -infinity.
    user_history = set(user_track_matrix.loc[user_id][user_track_matrix.loc[user_id] > 0].index)
    for idx, track in enumerate(track_list):
        if track in user_history:
            scores[idx] = -np.inf

    # Get the indices of the top_n scores.
    recommended_indices = np.argsort(scores)[::-1][:top_n]
    recommended_tracks = [track_list[i] for i in recommended_indices if scores[i] != -np.inf]

    return recommended_tracks

# Evaluation Functions

In [6]:
def ndcg_at_k(relevances, k):
    """
    Compute NDCG@k given a list of binary relevance scores.
    """
    relevances = np.asarray(relevances, dtype=np.float64)[:k]
    if relevances.size == 0:
        return 0.0
    discounts = np.log2(np.arange(2, relevances.size + 2))
    dcg = np.sum(relevances / discounts)
    ideal_relevances = np.sort(relevances)[::-1]
    idcg = np.sum(ideal_relevances / discounts)
    return dcg / idcg if idcg > 0 else 0.0

def evaluate_ndcg_slim(W, df, holdout_df, user_track_matrix, track_list, top_n=10):
    """
    Evaluate recommendations using NDCG@k for each user in a holdout set.

    Parameters:
      W                : SLIM weight matrix.
      df               : Original DataFrame (assumes a 'gender' column exists).
      holdout_df       : DataFrame with ground-truth interactions.
      user_track_matrix: Training user–item matrix.
      track_list       : List of track IDs.
      top_n            : Number of recommendations.

    Returns:
      overall_ndcg   : Average NDCG@top_n over all users.
      ndcg_by_gender : Dictionary of average NDCG scores split by gender.
    """
    # Map each user to his/her ground-truth holdout items.
    user_holdout = holdout_df.groupby('user_id')['track_id'].apply(set).to_dict()
    # Extract user attributes (like gender).
    user_gender = df.set_index('user_id')['gender'].to_dict()

    ndcg_scores = {}
    ndcg_by_gender = {}

    for user, true_items in user_holdout.items():
        recs = get_recommendations_for_user_slim(W, user, user_track_matrix, track_list, top_n=top_n)
        relevances = [1 if rec in true_items else 0 for rec in recs]
        ndcg = ndcg_at_k(relevances, top_n)
        ndcg_scores[user] = ndcg

        gender = user_gender.get(user, 'unknown')
        ndcg_by_gender.setdefault(gender, []).append(ndcg)

    overall_ndcg = np.mean(list(ndcg_scores.values())) if ndcg_scores else 0.0
    avg_ndcg_by_gender = {g: np.mean(scores) for g, scores in ndcg_by_gender.items()}

    print("\nSet Evaluation:")
    print(f"Overall NDCG@{top_n}: {overall_ndcg:.4f}")
    print("NDCG by gender:", avg_ndcg_by_gender)

    return overall_ndcg, avg_ndcg_by_gender

def grid_search_validation_slim(user_track_matrix, track_list, df, val_holdout_df,
                                        user_gender_map, female_weight, candidate_alphas,
                                        fixed_l1_ratio=0.01, top_n=10):
    """
    Performs grid search over candidate alpha values.

    For each alpha, the SLIM model is trained and evaluated on the validation holdout set using NDCG.
    Returns:
        best_alpha  : The alpha value that produced the highest overall NDCG.
        best_ndcg   : The best NDCG value.
        grid_results: A list of tuples (alpha, overall_ndcg).
    """
    best_ndcg = -1.0
    best_alpha = None
    grid_results = []

    for alpha in candidate_alphas:
        W_candidate = train_slim(user_track_matrix, user_gender_map, female_weight=female_weight,
                                    alpha=alpha, l1_ratio=fixed_l1_ratio)
        overall_ndcg_val, _ = evaluate_ndcg_slim(W_candidate, df, val_holdout_df, user_track_matrix, track_list, top_n=top_n)
        grid_results.append((alpha, overall_ndcg_val))
        print(f"alpha: {alpha} => NDCG: {overall_ndcg_val:.4f}")
        if overall_ndcg_val > best_ndcg:
            best_ndcg = overall_ndcg_val
            best_alpha = alpha
    print("\nBest alpha:", best_alpha)
    print("Best overall NDCG on validation set:", best_ndcg)
    return best_alpha, best_ndcg, grid_results


def compute_diversity_for_list(recommended_tracks, user_track_matrix, track_list, W):
    """
    Compute intra-list diversity: average dissimilarity among all pairs of recommended tracks.
    Dissimilarity is defined as 1 - cosine similarity. Here we retrieve the
    item vectors (columns in the training matrix) for the recommended tracks.

    Parameters:
      recommended_tracks: list of track IDs.
      user_track_matrix: training user-item matrix.
      track_list       : list of track IDs.
      W                : weight matrix is not used directly here but one can also compute
                         diversity on the basis of the original item vectors.

    Returns:
      diversity score (float)
    """
    if len(recommended_tracks) < 2:
        return 0.0

    # For diversity we use the binary interaction vectors (or you could precompute item feature vectors)
    indices = [track_list.index(t) for t in recommended_tracks if t in track_list]
    # Extract the corresponding columns from the training matrix as item profiles.
    # Transpose the matrix so each row is an item vector.
    item_matrix = user_track_matrix.values.T
    vectors = item_matrix[indices]

    sim_matrix = cosine_similarity(vectors)

    # Compute average pairwise similarity (ignoring the diagonal).
    sum_similarity = 0.0
    count = 0
    n = len(indices)
    for i in range(n):
        for j in range(i+1, n):
            sum_similarity += sim_matrix[i, j]
            count += 1

    avg_similarity = sum_similarity / count if count > 0 else 0.0
    return 1 - avg_similarity

def evaluate_metrics_slim(W, df, holdout_df, user_track_matrix, track_list, top_n=10):
    """
    Evaluate recommendation metrics (Recall, Coverage, Diversity) for the holdout set.
    Also compute per-gender metrics using the user attributes from df.

    Parameters:
      W                : SLIM weight matrix.
      holdout_df       : DataFrame with ground-truth holdout interactions.
      user_track_matrix: training user–item matrix.
      track_list       : list of track IDs.
      df               : original DataFrame (assumes a 'gender' column).
      top_n            : number of recommendations.

    Returns:
      overall_recall, overall_coverage, overall_diversity, and a dictionary of per-gender metrics.
    """
    # Ground truth mapping.
    user_holdout = holdout_df.groupby('user_id')['track_id'].apply(set).to_dict()
    user_gender = df.set_index('user_id')['gender'].to_dict()

    recall_scores = {}
    diversity_scores = {}
    coverage_by_gender = {}

    for user, true_items in user_holdout.items():
        recs = get_recommendations_for_user_slim(W, user, user_track_matrix, track_list, top_n=top_n)
        # Recall@top_n computation.
        recall = len(set(recs).intersection(true_items)) / len(true_items) if true_items else 0.0
        recall_scores[user] = recall

        # Diversity computation.
        diversity = compute_diversity_for_list(recs, user_track_matrix, track_list, W)
        diversity_scores[user] = diversity

        gender = user_gender.get(user, 'unknown')
        coverage_by_gender.setdefault(gender, set()).update(recs)

    overall_recall = np.mean(list(recall_scores.values()))
    overall_diversity = np.mean(list(diversity_scores.values()))
    overall_coverage = len(set().union(*(recs for recs in coverage_by_gender.values()))) / len(track_list)

    # Average per-gender metrics.
    recall_by_gender = {}
    diversity_by_gender = {}
    coverage_metrics_by_gender = {}

    for user, rec in recall_scores.items():
        gender = user_gender.get(user, 'unknown')
        recall_by_gender.setdefault(gender, []).append(rec)
    for user, div in diversity_scores.items():
        gender = user_gender.get(user, 'unknown')
        diversity_by_gender.setdefault(gender, []).append(div)
    for gender, rec_set in coverage_by_gender.items():
        coverage_metrics_by_gender[gender] = len(rec_set) / len(track_list)

    avg_recall_by_gender = {g: np.mean(scores) for g, scores in recall_by_gender.items()}
    avg_diversity_by_gender = {g: np.mean(scores) for g, scores in diversity_by_gender.items()}

    print("\nEvaluation Metrics @ {}:".format(top_n))
    print("Overall Recall: {:.4f}".format(overall_recall))
    print("Recall by gender:", avg_recall_by_gender)
    print("\nOverall Coverage: {:.4f}".format(overall_coverage))
    print("Coverage by gender:", coverage_metrics_by_gender)
    print("\nOverall Diversity: {:.4f}".format(overall_diversity))
    print("Diversity by gender:", avg_diversity_by_gender)

    gender_metrics = {
        'recall': avg_recall_by_gender,
        'coverage': coverage_metrics_by_gender,
        'diversity': avg_diversity_by_gender
    }

    return overall_recall, overall_coverage, overall_diversity, gender_metrics

# Assuming recGap_CF_results, compute_recGap, and compute_compounding_factor are defined elsewhere:
def recGap_CF_results(df, gender_metrics):
    for key, value in gender_metrics.items():
        print(f"\nFor the {key} metric")
        compute_recGap(value)
        compute_compounding_factor(df, value)

# Main Functions

In [7]:
def Evaluate_SLIM(user_track_matrix, sparse_item_matrix, track_list, df, df_val_holdout, df_test_holdout,
                    user_gender_map, female_weight=1.0, top_n=10):

    # # Define candidate alpha values.
    # candidate_alphas = [1e-3, 1e-2, 1e-1]
    best_alpha = 1e-3

    # best_alpha, best_ndcg, grid_results = grid_search_validation_slim(
    #     user_track_matrix, track_list, df, df_val_holdout,
    #     user_gender_map, female_weight, candidate_alphas, fixed_l1_ratio=0.01, top_n=top_n)

    # Retrain final SLIM model with best alpha.
    W = train_slim(user_track_matrix, user_gender_map, female_weight=female_weight,
                    alpha=best_alpha, l1_ratio=0.01)

    overall_ndcg_test, ndcg_by_gender_test = evaluate_ndcg_slim(W, df, df_test_holdout, user_track_matrix, track_list, top_n=top_n)
    overall_recall, overall_coverage, overall_diversity, gender_metrics = evaluate_metrics_slim(W, df, df_test_holdout, user_track_matrix, track_list, top_n=top_n)
    gender_metrics['ndcg'] = ndcg_by_gender_test
    print("\nOverall gender metrics:", gender_metrics)
    recGap_CF_results(df, gender_metrics)

    return


def build_and_evaluate_slim(df, female_weight=1.0):
    # Prepare the data.
    df_model_train, df_val_holdout, df_test_holdout = build_data(df)
    user_track_matrix, sparse_item_matrix, track_list = create_user_track_matrix(df_model_train)

    # Create a mapping from user_id to gender (using the original dataframe).
    user_gender_map = df.set_index('user_id')['gender'].to_dict()

    # Train the SLIM model.
    W = train_slim(user_track_matrix, user_gender_map, female_weight=female_weight, alpha=1e-3, l1_ratio=0.01)

    # Evaluate the SLIM model.
    Evaluate_SLIM(user_track_matrix, sparse_item_matrix, track_list, df, df_val_holdout, df_test_holdout,
                user_gender_map, female_weight=female_weight, top_n=10)

    return

# Running the algorithm

In [8]:
df = pd.read_csv('/kaggle/input/lfm-data/LFM-1b-DemoBiasSub-10k.csv', header=0)
df_SMOTE = pd.read_csv('/kaggle/input/lfm-data/LFM-1b-DemoBiasSub-10k-SMOTE.csv', header=0)
df_resampled = pd.read_csv('/kaggle/input/lfm-data/LFM-1b-DemoBiasSub-10k-Resampled.csv', header=0)

In [9]:
build_and_evaluate_slim(df.sample(100000, random_state=42))

Training items: 100%|██████████| 9318/9318 [22:41<00:00,  6.84it/s]
Training items: 100%|██████████| 9318/9318 [22:36<00:00,  6.87it/s]



Set Evaluation:
Overall NDCG@10: 0.0679
NDCG by gender: {'f': np.float64(0.059183413945496685), 'm': np.float64(0.07097193570320222)}

Evaluation Metrics @ 10:
Overall Recall: 0.0846
Recall by gender: {'f': np.float64(0.0823868677905945), 'm': np.float64(0.08536693791440626)}

Overall Coverage: 0.5608
Coverage by gender: {'f': 0.2951277098089719, 'm': 0.5003219575016098}

Overall Diversity: 0.9831
Diversity by gender: {'f': np.float64(0.9841982789025328), 'm': np.float64(0.982764979307084)}

Overall gender metrics: {'recall': {'f': np.float64(0.0823868677905945), 'm': np.float64(0.08536693791440626)}, 'coverage': {'f': 0.2951277098089719, 'm': 0.5003219575016098}, 'diversity': {'f': np.float64(0.9841982789025328), 'm': np.float64(0.982764979307084)}, 'ndcg': {'f': np.float64(0.059183413945496685), 'm': np.float64(0.07097193570320222)}}

For the recall metric
RecGap Score: 0.0029800701238117677 

Original data distribution:
Males: 0.7573
Females: 0.2427

Metric score distribution:
Male

In [9]:
g_counts = df['gender'].value_counts()
total_users = len(df)
p = np.array([g_count / total_users for g_count in g_counts])
female_weight = p[0] / p[1]
print(f"weight adjustment = {female_weight:.2f}")

build_and_evaluate_slim(df.sample(100000, random_state=42), female_weight=female_weight)

weight adjustment = 3.12
Unique sample weights: [1.         3.12395779]


Training items: 100%|██████████| 9318/9318 [22:24<00:00,  6.93it/s]


Unique sample weights: [1.         3.12395779]


Training items: 100%|██████████| 9318/9318 [22:21<00:00,  6.95it/s]



Set Evaluation:
Overall NDCG@10: 0.0612
NDCG by gender: {'f': np.float64(0.057739183685021206), 'm': np.float64(0.062435948906443176)}

Evaluation Metrics @ 10:
Overall Recall: 0.0759
Recall by gender: {'f': np.float64(0.07932564330079858), 'm': np.float64(0.0747129294755877)}

Overall Coverage: 0.5539
Coverage by gender: {'f': 0.3102597123846319, 'm': 0.5038634900193174}

Overall Diversity: 0.9806
Diversity by gender: {'f': np.float64(0.9827795577087683), 'm': np.float64(0.9798619514881188)}

Overall gender metrics: {'recall': {'f': np.float64(0.07932564330079858), 'm': np.float64(0.0747129294755877)}, 'coverage': {'f': 0.3102597123846319, 'm': 0.5038634900193174}, 'diversity': {'f': np.float64(0.9827795577087683), 'm': np.float64(0.9798619514881188)}, 'ndcg': {'f': np.float64(0.057739183685021206), 'm': np.float64(0.062435948906443176)}}

For the recall metric
RecGap Score: 0.004612713825210876 

Original data distribution:
Males: 0.7573
Females: 0.2427

Metric score distribution:
M

In [10]:
build_and_evaluate_slim(df_SMOTE.sample(100000, random_state=42))

Training items: 100%|██████████| 8584/8584 [16:00<00:00,  8.94it/s]
Training items: 100%|██████████| 8584/8584 [16:00<00:00,  8.94it/s]



Set Evaluation:
Overall NDCG@10: 0.0546
NDCG by gender: {'m': np.float64(0.05548964193153566), 'f': np.float64(0.05242362548002217), nan: np.float64(0.0)}

Evaluation Metrics @ 10:
Overall Recall: 0.0719
Recall by gender: {'m': np.float64(0.07296924862199014), 'f': np.float64(0.06932740838270791), nan: np.float64(0.0)}

Overall Coverage: 0.5813
Coverage by gender: {'m': 0.5115330848089469, 'f': 0.3158201304753029, nan: 0.0033783783783783786}

Overall Diversity: 0.9824
Diversity by gender: {'m': np.float64(0.9824602744557103), 'f': np.float64(0.9821172744016281), nan: np.float64(0.9851229246599119)}

Overall gender metrics: {'recall': {'m': np.float64(0.07296924862199014), 'f': np.float64(0.06932740838270791), nan: np.float64(0.0)}, 'coverage': {'m': 0.5115330848089469, 'f': 0.3158201304753029, nan: 0.0033783783783783786}, 'diversity': {'m': np.float64(0.9824602744557103), 'f': np.float64(0.9821172744016281), nan: np.float64(0.9851229246599119)}, 'ndcg': {'m': np.float64(0.055489641931

In [9]:
build_and_evaluate_slim(df_resampled.sample(100000, random_state=42))

Training items: 100%|██████████| 9025/9025 [19:38<00:00,  7.66it/s]
Training items: 100%|██████████| 9025/9025 [19:32<00:00,  7.70it/s]



Set Evaluation:
Overall NDCG@10: 0.0580
NDCG by gender: {'m': np.float64(0.047327936047911065), 'f': np.float64(0.07469755618599669)}

Evaluation Metrics @ 10:
Overall Recall: 0.0684
Recall by gender: {'m': np.float64(0.06573248407643313), 'f': np.float64(0.07254304151894513)}

Overall Coverage: 0.5378
Coverage by gender: {'m': 0.4645983379501385, 'f': 0.30670360110803324}

Overall Diversity: 0.9817
Diversity by gender: {'m': np.float64(0.9812659496820578), 'f': np.float64(0.9822957512296286)}

Overall gender metrics: {'recall': {'m': np.float64(0.06573248407643313), 'f': np.float64(0.07254304151894513)}, 'coverage': {'m': 0.4645983379501385, 'f': 0.30670360110803324}, 'diversity': {'m': np.float64(0.9812659496820578), 'f': np.float64(0.9822957512296286)}, 'ndcg': {'m': np.float64(0.047327936047911065), 'f': np.float64(0.07469755618599669)}}

For the recall metric
RecGap Score: 0.006810557442511994 

Original data distribution:
Males: 0.50198
Females: 0.49802

Metric score distributio