In [None]:
import librosa
import pandas as pd

# Directory that holds the dataset CSV files. An empty string means the files
# are looked up relative to the current working directory.
import os

dataset_path = ''

# Build file paths with os.path.join so that dataset_path works with or without
# a trailing separator (plain '+' would turn 'data' into 'datamusic_list.csv').

# Track metadata; later cells read its 'track_id', 'name' and 'artist' columns.
music_list = pd.read_csv(os.path.join(dataset_path, 'music_list.csv'))

# User/track interactions; later cells read 'user_id', 'track_id' and 'playcount'.
user_behavior_list = pd.read_csv(os.path.join(dataset_path, 'user_behavior_list.csv'))

In [4]:
# Build each user's listening history as a dict:
#   user_id -> [(track_id, playcount), ...]

def _tracks_with_playcounts(user_rows):
    """Return one user's rows as a list of (track_id, playcount) tuples."""
    return list(zip(user_rows['track_id'], user_rows['playcount']))


# Only the two columns needed by the helper are selected after grouping.
_rows_by_user = user_behavior_list.groupby('user_id', observed=True)[['track_id', 'playcount']]
user_song_list = _rows_by_user.apply(_tracks_with_playcounts).to_dict()

In [5]:
# Keep only users whose history holds at least 50 (track_id, playcount) entries.
user_song_list = dict(
    filter(lambda entry: len(entry[1]) >= 50, user_song_list.items())
)

In [6]:
# Keep only the interaction rows whose user_id is still a key of user_song_list.
user_behavior_list = user_behavior_list[user_behavior_list['user_id'].isin(user_song_list.keys())]

# A bare `.shape` expression is rendered only when it is the last line of a
# cell, so print it explicitly to actually show the filtered size.
print(user_behavior_list.shape)

print(user_behavior_list)

                   track_id                                   user_id  \
121      TRLATHU128F92FC275  5a905f000fc1ff3df7ca807d57edb608863db05d   
122      TRMKFPN128F42858C3  5a905f000fc1ff3df7ca807d57edb608863db05d   
123      TRTSSUT128F1472A51  5a905f000fc1ff3df7ca807d57edb608863db05d   
124      TRNJLKP128F427CE28  5a905f000fc1ff3df7ca807d57edb608863db05d   
125      TRGAOLV128E0789D40  5a905f000fc1ff3df7ca807d57edb608863db05d   
...                     ...                                       ...   
9711269  TRGCHLH12903CB7352  8305c896f42308824da7d4386f4b9ee584281412   
9711270  TRVSJOM12903CD2DC1  8305c896f42308824da7d4386f4b9ee584281412   
9711271  TRAALAH128E078234A  8305c896f42308824da7d4386f4b9ee584281412   
9711272  TRTKLFX12903CD2DC2  8305c896f42308824da7d4386f4b9ee584281412   
9711273  TRZYAGJ128F9332CEF  8305c896f42308824da7d4386f4b9ee584281412   

         playcount  
121             11  
122              2  
123              1  
124              1  
125              2

In [7]:
import numpy as np

# Candidate users for sampling: every key of 'user_song_list'
# (assumed to be the users with a listening history of >= 50 songs).
active_users = list(user_song_list.keys())

# Sample up to 5000 active users without replacement. The size is clamped to
# the number of available users because rng.choice(..., replace=False) raises
# ValueError when asked for more items than the population contains.
n_users_to_sample = min(5000, len(active_users))
rng = np.random.default_rng(seed=42)  # fixed seed keeps the sample reproducible
sampled_users = rng.choice(active_users, size=n_users_to_sample, replace=False)

# Filter the user_behavior_list DataFrame to include only sampled users
user_behavior_list = user_behavior_list[user_behavior_list['user_id'].isin(sampled_users)]

# Last expression of the cell: displays (rows, columns) of the sampled frame.
user_behavior_list.shape

(377381, 3)

In [8]:
from sklearn.utils import shuffle

# Randomly permute the interaction rows; the fixed seed makes the resulting
# row order reproducible across runs.
_shuffle_seed = 42
user_behavior_list = shuffle(user_behavior_list, random_state=_shuffle_seed)

In [9]:
from sklearn.model_selection import train_test_split

# Perform train-test split on a per-user basis: 80% of every user's rows go to
# train_df, the remaining rows go to test_df.
#
# The columns (including 'user_id') are selected explicitly after groupby().
# Letting apply() operate on the grouping column implicitly is deprecated in
# pandas and emits a warning; the explicit selection keeps 'user_id' in the
# sampled rows and produces the same result.
_split_columns = list(user_behavior_list.columns)
train_df = (
    user_behavior_list
    .groupby('user_id', group_keys=False)[_split_columns]
    .apply(lambda user_rows: user_rows.sample(frac=0.8, random_state=42))
)

# Everything not sampled into train_df; relies on the row index being unique.
test_df = user_behavior_list.drop(train_df.index)

# Sanity check: the two parts must partition the original rows.
assert len(train_df) + len(test_df) == len(user_behavior_list), \
    "train/test split does not partition user_behavior_list (duplicate index labels?)"

print("Shape of train_df:", train_df.shape)
print("Shape of test_df:", test_df.shape)

Shape of train_df: (301961, 3)
Shape of test_df: (75420, 3)


  train_df = user_behavior_list.groupby('user_id', group_keys=False).apply(lambda x: x.sample(frac=0.8, random_state=42))


In [10]:
# Rather than drawing a random sample of music_list, restrict it to the tracks
# that actually occur in the current user_behavior_list.

# Distinct track IDs referenced by the interaction rows
unique_tracks_in_behavior = user_behavior_list['track_id'].unique()

# Boolean mask over music_list: True where the track appears in the interactions
_is_listened_track = music_list['track_id'].isin(unique_tracks_in_behavior)

# .copy() gives an independent frame, so later edits do not touch music_list
sampled_music_list = music_list.loc[_is_listened_track].copy()

print("Shape of the new sampled_music_list (unique tracks from user behavior):", sampled_music_list.shape)

Shape of the new sampled_music_list (unique tracks from user behavior): (21087, 21)


In [11]:
# Total playcount per track, summed over all rows of user_behavior_list
track_popularity = (
    user_behavior_list
    .groupby('track_id')['playcount']
    .sum()
    .reset_index()
    .rename(columns={'playcount': 'total_playcount'})
)

# Attach the totals to the track metadata. The left merge keeps every row of
# sampled_music_list; a track without a match gets a total of 0.
sampled_music_list_with_popularity = sampled_music_list.merge(
    track_popularity, on='track_id', how='left'
)
_totals_filled = sampled_music_list_with_popularity['total_playcount'].fillna(0)
sampled_music_list_with_popularity['total_playcount'] = _totals_filled.astype(int)

# Most-played tracks first
popularity_sorted_music = sampled_music_list_with_popularity.sort_values(
    by='total_playcount', ascending=False
)

# Show the ten most-played tracks
print("Top 10 Most Popular Songs:")
_top_columns = ['name', 'artist', 'total_playcount']
display(popularity_sorted_music[_top_columns].head(10))

Top 10 Most Popular Songs:


Unnamed: 0,name,artist,total_playcount
96,Revelry,Kings of Leon,2742
14537,MIA,Chevelle,2163
20,Float On,Modest Mouse,1781
317,Help I'm Alive,Metric,1714
23,Such Great Heights,The Postal Service,1713
123,Rabbit Heart (Raise It Up),Florence + the Machine,1336
905,Lights & Music,Cut Copy,1257
719,DVNO,Justice,1252
2037,Skinny Love,Bon Iver,1250
1001,Fancy Footwork,Chromeo,1149
