# Necessary Imports

In [3]:
import pandas as pd
import numpy as np
import gc

In [4]:
# Relative path of the folder the events file is read from.
# The helper defined below is used to handle the enormous events data without memory issues.
data_folder_path = 'Data/LFM-1B/'

def reduce_memory(df):
    """Shrink the memory footprint of ``df`` by narrowing its column dtypes.

    The frame is modified in place and also returned, so the call can be
    chained directly around ``pd.read_csv``.

    * ``int64`` columns are downcast to the smallest integer type that fits.
    * ``float64`` columns are downcast to the smallest float type that fits.
    * ``object`` columns are converted to ``category`` when fewer than half
      of their values are distinct.

    Args:
        df: DataFrame to compact.

    Returns:
        The same DataFrame object, with narrowed dtypes.
    """
    # Every column has the same length, so compute it once.
    num_total = len(df)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == 'int64':
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif col_type == 'float64':
            df[col] = pd.to_numeric(df[col], downcast='float')
        elif col_type == 'object':
            # An empty frame has nothing to compress; checking the row count
            # first also avoids a ZeroDivisionError in the ratio below.
            if num_total and df[col].nunique() / num_total < 0.5:
                df[col] = df[col].astype('category')
    return df

# Loading the original data

In [None]:
# Load the listening events data.
# Only the three id columns are used downstream, so let the parser skip every
# other column instead of materialising the full table and discarding most of
# it afterwards; this keeps peak memory down on the very large events file.
event_columns = ["user_id", "artist_id", "track_id"]
events = reduce_memory(
    pd.read_csv(data_folder_path + "events.tsv", sep="\t", header=0, usecols=event_columns)
)
# usecols keeps the file's column order, so select explicitly to fix the order
events = events[event_columns]
# Aggregate play counts for each user-artist-track combination
events = events.groupby(event_columns).agg(play_count=("track_id", "count")).reset_index()

In [None]:
# Load demographic data about users.
# Forward slashes keep the path portable: a backslash-separated literal only
# resolves on Windows and relies on unrecognised string escape sequences,
# which newer Python versions warn about.
demo = pd.read_csv("LFM-BeyMS/creation/LFM-1b_users.txt", sep="\t", header=0)
# Filter needed columns only
demo = demo[["user_id", "country", "age", "gender", "registered_unixtime"]]

# Applying Filtering Approaches

In [5]:
# First filtering step described in the paper (LFM-1b-DemoBias):
# keep only the users whose gender is recorded as 'f' or 'm'.
gender_mask = demo['gender'].isin(['f', 'm'])
demo_filtered = demo.loc[gender_mask]

# Inner-join the events with the remaining users on 'user_id'; the result is
# the LFM_1b_DemoBias data.
LFM_1b_DemoBias = events.merge(demo_filtered, how='inner', on='user_id')

In [7]:
# Keep only interactions with a play count of at least 5
has_enough_plays = LFM_1b_DemoBias['play_count'].ge(5)
df_filtered = LFM_1b_DemoBias.loc[has_enough_plays]

In [8]:
# Second filtering step: remove users and tracks with low interaction counts.
#
# Applying one filter can affect the counts used by the other: after removing
# a user, some tracks might fall below the 5-user threshold. Both filters are
# therefore re-applied until a full pass no longer removes any row.
while True:
    rows_before = len(df_filtered)

    # Tracks: keep only those listened to by at least 5 unique users
    users_per_track = df_filtered.groupby('track_id')['user_id'].nunique()
    tracks_to_keep = users_per_track.loc[users_per_track.ge(5)].index
    df_filtered = df_filtered.loc[df_filtered['track_id'].isin(tracks_to_keep)]

    # Users: keep only those who listened to at least 5 unique tracks
    tracks_per_user = df_filtered.groupby('user_id')['track_id'].nunique()
    users_to_keep = tracks_per_user.loc[tracks_per_user.ge(5)].index
    df_filtered = df_filtered.loc[df_filtered['user_id'].isin(users_to_keep)]

    # Rows are only ever removed, so an unchanged row count means both
    # conditions now hold at the same time.
    if len(df_filtered) == rows_before:
        break


# Verify that each track has at least 5 unique users
min_users_per_track = df_filtered.groupby('track_id')['user_id'].nunique().min()
print(min_users_per_track)
# Verify that each user has listened to at least 5 unique tracks
min_tracks_per_user = df_filtered.groupby('user_id')['track_id'].nunique().min()
print(min_tracks_per_user)

5
5


In [None]:
# Third step: sample tracks to obtain a dataset that is manageable for training.
# 10k tracks are drawn instead of 100k because of limited computational resources.
n_tracks = 10000
unique_track_ids = df_filtered['track_id'].drop_duplicates()
sampled_track_ids = unique_track_ids.sample(n=n_tracks, random_state=42)

# Keep every interaction that involves one of the sampled tracks
in_sample = df_filtered['track_id'].isin(sampled_track_ids)
df_sampled = df_filtered.loc[in_sample]

# Number of distinct users, then number of rows per gender, in the sample
print(df_sampled['user_id'].nunique())
print(df_sampled['gender'].value_counts())

39156
gender
m    265274
f     84916
Name: count, dtype: int64


In [18]:
# Write the sampled data to a CSV file, leaving out the DataFrame index
output_csv_path = "Data/LFM-1b-DemoBiasSub-10k.csv"
df_sampled.to_csv(output_csv_path, index=False)