# This notebook has been created to analyze the Spotify dataset

In [11]:
import os, sys
sys.path.insert(0, os.getcwd()+ '/..') 

import pandas as pd
import json
from collections import namedtuple

from gather_datasets.utils import get_files
from sys import getsizeof

from tqdm import tqdm

In [12]:
ratings_df = pd.read_csv(f'../datasets/spotify/ratings.csv.gz')
ratings_df

Unnamed: 0,playlist_id,item_id
0,549000,0
1,549000,1
2,549000,2
3,549000,3
4,549000,4
...,...,...
66346423,302999,399233
66346424,302999,11291
66346425,302999,93795
66346426,302999,133087


In [13]:
tracks_df = pd.read_csv(f'../datasets/spotify/tracks.csv.gz')

In [14]:
pd.set_option('display.width', 300)
tracks_df

Unnamed: 0,item_id,item_uri,item_name,artist_uri,artist_name,album_uri
0,0,spotify:track:6QHYEZlm9wyfXfEM1vSu1P,Boots of Spanish Leather,spotify:artist:74ASZWbe4lXaubB36ztrGX,Bob Dylan,spotify:album:7DZeLXvr9eTVpyI1OlqtcS
1,1,spotify:track:3RkQ3UwOyPqpIiIvGVewuU,Mr. Tambourine Man,spotify:artist:74ASZWbe4lXaubB36ztrGX,Bob Dylan,spotify:album:1lPoRKSgZHQAYXxzBsOQ7v
2,2,spotify:track:0ju1jP0cSPJ8tmojYBEI89,Danny's Song,spotify:artist:7emRV8AluG3d4e5T0DZiK9,Loggins & Messina,spotify:album:5BWgJaesMjpJWCTU9sgUPf
3,3,spotify:track:7ny2ATvjtKszCpLpfsGnVQ,A Hard Rain's A-Gonna Fall,spotify:artist:74ASZWbe4lXaubB36ztrGX,Bob Dylan,spotify:album:0o1uFxZ1VTviqvNaYkTJek
4,4,spotify:track:18GiV1BaXzPVYpp9rmOg0E,Blowin' In the Wind,spotify:artist:74ASZWbe4lXaubB36ztrGX,Bob Dylan,spotify:album:0o1uFxZ1VTviqvNaYkTJek
...,...,...,...,...,...,...
2262287,2262287,spotify:track:2wBZrBqWQ4eIhShITfUJ4c,My Favourite Muse,spotify:artist:6g8Jqb5JMfv92eB2r0awTN,Arab Strap,spotify:album:7maGHx2VJGTBPczcWT9JVm
2262288,2262288,spotify:track:2t7fotSpsiWHpSTySbSNZg,The Stars and Stripes Forever,spotify:artist:0LyfQWJT6nXafLPZqxe9Of,Various Artists,spotify:album:5HuTwuPKjQKNpmLmmi4Njx
2262289,2262289,spotify:track:0EW4RltERtn276lOi0DXQj,Killer,spotify:artist:6x3HJm6n40OUW2ZcmttBxQ,Faragó,spotify:album:5aauU2AK4iG01BxIchws8Q
2262290,2262290,spotify:track:5RiCId3jJs8D9g6Fv4A1Su,Robin Hood,spotify:artist:4UrrAgW6WW7UfZ760eyHe7,Crazy Fool,spotify:album:2MUG8V88S5mazH8n45AKuZ


In [15]:
TrackInfo = namedtuple('TrackInfo', ['track_id', 'track_uri', 'track_name', 'artist_uri', 'artist_name', 'album_uri'])

tracks = {}
playlists = []
additional_playlist_info = []

next_id = 0

for file in tqdm(get_files('../datasets/downloads/spotify/data')):
    with open(file, 'r') as f:
        data = json.load(f)
        playlists_data = data['playlists']
        for playlist in playlists_data:
            
            pid = playlist['pid']
            timestamp = playlist['modified_at']
            collaborative = playlist['collaborative'] == 'true'
            additional_playlist_info.append((pid, timestamp, collaborative))

            tracks_data = playlist['tracks']
            for track in tracks_data:
                track_uri = track['track_uri']
                
                id = None
                if track_uri in tracks:
                    id = tracks[track_uri].track_id
                else:
                    id = next_id
                    next_id += 1
                    tracks[track_uri] = TrackInfo(id, track_uri, track['track_name'], track['artist_uri'], track['artist_name'], track['album_uri'])
                playlists.append((pid, id))

100%|██████████| 1000/1000 [02:48<00:00,  5.94it/s]


In [16]:
playlists_df = pd.DataFrame.from_records(additional_playlist_info, columns=['playlist_id', 'timestamp', 'collaborative'])

ratings_df = pd.DataFrame.from_records(playlists, columns=['playlist_id', 'track_id'])

tracks_df = pd.DataFrame.from_records(list(tracks.values()), columns=TrackInfo._fields)
tracks_df.reset_index(inplace=True, drop=True)
tracks_df.set_index('track_id', inplace=True)


In [17]:
playlists_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype
---  ------         --------------    -----
 0   playlist_id    1000000 non-null  int64
 1   timestamp      1000000 non-null  int64
 2   collaborative  1000000 non-null  bool 
dtypes: bool(1), int64(2)
memory usage: 16.2 MB


In [18]:
value_counts = playlists_df['collaborative'].value_counts()
print(f'Collaborative playlists ratio percent: {value_counts[True] / value_counts[False] * 100}%')

Collaborative playlists ratio percent: 2.3090120939483194%


In [19]:
print(f'Number of ratings: {len(ratings_df)}')

Number of ratings: 66346428


In [20]:
print(f'Number of items: {len(tracks)}')

Number of items: 2262292
