In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

# IPython Magic Functions
# will make plot outputs appear and be stored within the notebook.
%matplotlib inline 

# Defaults fot better plots
plt.rcParams['figure.figsize']  = (18, 10)
plt.rcParams['axes.labelsize']  = 20
plt.rcParams['axes.titlesize']  = 20
plt.rcParams['legend.fontsize'] = 20
plt.rcParams['xtick.labelsize'] = 20
plt.rcParams['ytick.labelsize'] = 20
plt.rcParams['lines.linewidth'] = 4

# Turn on interactive mode
plt.ion() 
plt.style.use('seaborn-colorblind')
plt.rcParams['figure.figsize']  = (12, 8)

# Loading datasets into Dataframes

In [2]:
dataset_name = "music4all"
dataset_relative_location = "./dataset"

dataset_path = Path(dataset_relative_location) / dataset_name

In [3]:
### Get datset CSVs
csvsPattern = "*.csv"
csvs = list(dataset_path.glob(csvsPattern))
csvs

[PosixPath('dataset/music4all/id_tags.csv'),
 PosixPath('dataset/music4all/id_genres.csv'),
 PosixPath('dataset/music4all/id_lang.csv'),
 PosixPath('dataset/music4all/listening_history.csv'),
 PosixPath('dataset/music4all/id_metadata.csv'),
 PosixPath('dataset/music4all/id_information.csv')]

In [4]:
datasets = {}

for csv in csvs:
    datasets[csv.stem] = pd.read_csv(csv, delimiter="\t")

In [5]:
# Taking a look at the datasets we have
datasets.keys()

dict_keys(['id_tags', 'id_genres', 'id_lang', 'listening_history', 'id_metadata', 'id_information'])

In [6]:
listening_history_copy = datasets['listening_history'].copy(deep=True)

In [7]:
# Renaming the column to ID so we can merge with the song info dataset
listening_history_copy.rename(columns={"song":"id"}, inplace=True)
listening_history_copy

Unnamed: 0,user,id,timestamp
0,user_007XIjOr,DaTQ53TUmfP93FSr,2019-02-20 12:28
1,user_007XIjOr,dGeyvi5WCOjDU7da,2019-02-20 12:35
2,user_007XIjOr,qUm54NYOjeFhmKYx,2019-02-20 12:48
3,user_007XIjOr,FtnuMT1DlevSR2n5,2019-02-20 12:52
4,user_007XIjOr,LHETTZcSZLeaVOGh,2019-02-20 13:09
...,...,...,...
5109587,user_zzWscYTy,BBiswLufo26YQCT7,2019-01-10 15:57
5109588,user_zzWscYTy,5ZHgff3sjETIiedr,2019-01-10 16:21
5109589,user_zzWscYTy,m4O1iLh6fC43xjRy,2019-01-10 16:48
5109590,user_zzWscYTy,mvUaP8k67qOFfA65,2019-01-10 21:13


In [8]:
song_info = pd.merge(datasets['id_information'], datasets['id_genres'], on='id')
song_info = pd.merge(song_info, datasets['id_metadata'], on='id') # has release date info
selected_song_info = song_info.drop(columns=["spotify_id", "danceability", "energy", "key", "mode", "valence", "tempo", "duration_ms"])
listening_history_complete = pd.merge(selected_song_info, listening_history_copy, on='id')
listening_history_complete

Unnamed: 0,id,artist,song,album_name,genres,popularity,release,user,timestamp
0,0009fFIM1eYThaPg,Cheryl,Rain on Me,3 Words,pop,12.0,2009,user_39RWe73b,2019-01-06 16:56
1,0009fFIM1eYThaPg,Cheryl,Rain on Me,3 Words,pop,12.0,2009,user_6golQVBC,2019-02-13 11:47
2,0009fFIM1eYThaPg,Cheryl,Rain on Me,3 Words,pop,12.0,2009,user_95s7ZWFG,2019-02-11 11:36
3,0009fFIM1eYThaPg,Cheryl,Rain on Me,3 Words,pop,12.0,2009,user_Avd2E7q2,2019-01-07 21:34
4,0009fFIM1eYThaPg,Cheryl,Rain on Me,3 Words,pop,12.0,2009,user_CmTctYpP,2019-02-27 17:41
...,...,...,...,...,...,...,...,...,...
5109587,zzzwh2ktIWjsR7xp,Snow Patrol,In the End,Fallen Empires,"indie rock,alternative rock,rock",27.0,2011,user_MzHAWiW5,2019-01-21 08:59
5109588,zzzwh2ktIWjsR7xp,Snow Patrol,In the End,Fallen Empires,"indie rock,alternative rock,rock",27.0,2011,user_MzHAWiW5,2019-01-30 19:30
5109589,zzzwh2ktIWjsR7xp,Snow Patrol,In the End,Fallen Empires,"indie rock,alternative rock,rock",27.0,2011,user_b5Xfilqj,2019-02-17 17:11
5109590,zzzwh2ktIWjsR7xp,Snow Patrol,In the End,Fallen Empires,"indie rock,alternative rock,rock",27.0,2011,user_dowNLkkH,2019-01-06 11:43


In [9]:
pd.to_datetime(listening_history_complete["timestamp"], format="%Y-%m-%d %H:%M")
#selected_song_info.query("user == 'user_39RWe73b'").sort_values(by=["timestamp"])["song"].to_list()


0         2019-01-06 16:56:00
1         2019-02-13 11:47:00
2         2019-02-11 11:36:00
3         2019-01-07 21:34:00
4         2019-02-27 17:41:00
                  ...        
5109587   2019-01-21 08:59:00
5109588   2019-01-30 19:30:00
5109589   2019-02-17 17:11:00
5109590   2019-01-06 11:43:00
5109591   2019-03-07 19:54:00
Name: timestamp, Length: 5109592, dtype: datetime64[ns]

In [10]:
# Sorting dataset so any query will result in the correct order for users
listening_history_complete = listening_history_complete.sort_values(by=["timestamp"])

In [13]:
def get_user_song_list(user, max_songs=50):
    return listening_history_complete.query(f"user == '{user}'")["song"].to_list()[:max_songs]

def get_user_song_id_list(max_songs=50):
    return listening_history_complete.query("user == 'user_39RWe73b'")["id"].to_list()[:max_songs]

In [14]:
from tqdm import tqdm

unique_users = listening_history_complete["user"].unique()

listening_history_ngram = []

count = 0

for user in tqdm(unique_users[:]):
    listening_history_ngram.append(get_user_song_list(user))

100%|██████████| 14127/14127 [39:07<00:00,  6.02it/s]


In [15]:
len(listening_history_ngram)

14127

In [16]:
with open(f"listening_history_{len(listening_history_ngram)}.ngram", "w") as file:
    for row in listening_history_ngram:
        file.write(','.join([str(a) for a in row]) + '\n')