In [10]:
import os
from dotenv import load_dotenv
import pandas as pd
import spotipy
from tqdm import tqdm
from spotipy.oauth2 import SpotifyClientCredentials

from utils.etl import get_playlist_info, get_track_info, get_tracks_from_playlists

In [2]:
genres_seeds = [
    "acoustic", "afrobeat", "alt-rock", "alternative", "ambient", "anime", "black-metal", 
    "bluegrass", "blues", "bossanova", "brazil", "breakbeat", "british", "cantopop", 
    "chicago-house", "children", "chill", "classical", "club", "comedy", "country", "dance", 
    "dancehall", "death-metal", "deep-house", "detroit-techno", "disco", "disney", 
    "drum-and-bass", "dub", "dubstep", "edm", "electro", "electronic", "emo", "folk", "forro", 
    "french", "funk", "garage", "german", "gospel", "goth", "grindcore", "groove", "grunge", 
    "guitar", "happy", "hard-rock", "hardcore", "hardstyle", "heavy-metal", "hip-hop", "holidays", 
    "honky-tonk", "house", "idm", "indian", "indie", "indie-pop", "industrial", "iranian", 
    "j-dance", "j-idol", "j-pop", "j-rock", "jazz", "k-pop", "kids", "latin", "latino", "malay", 
    "mandopop", "metal", "metal-misc", "metalcore", "minimal-techno", "movies", "mpb", "new-age", 
    "new-release", "opera", "pagode", "party", "philippines-opm", "piano", "pop", "pop-film", 
    "post-dubstep", "power-pop", "progressive-house", "psych-rock", "punk", "punk-rock", "r-n-b", 
    "rainy-day", "reggae", "reggaeton", "road-trip", "rock", "rock-n-roll", "rockabilly", "romance", 
    "sad", "salsa", "samba", "sertanejo", "show-tunes", "singer-songwriter", "ska", "sleep", 
    "songwriter", "soul", "soundtracks", "spanish", "study", "summer", "swedish", "synth-pop", 
    "tango", "techno", "trance", "trip-hop", "turkish", "work-out", "world-music"
]

In [3]:
# Spotify API credentials
load_dotenv()
client_id = os.getenv('CLIENT_ID')
client_secret = os.getenv('CLIENT_SECRET')

In [4]:
# Spotify API authentication
client_credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret
)
sp = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager
)

In [5]:
playlists = []

for genre in genres_seeds:
    # Search playlists by genre
    playlists_data = sp.search(q=genre, type='playlist', limit=50, offset=0)
    playlists += [
        get_playlist_info(playlist, genre)
        for playlist in playlists_data['playlists']['items']
    ]

df_playlists = pd.DataFrame(playlists)
print(f'Shape playlists', df_playlists.shape)

# Validation of playlists with null values
print(f'Null in id col of playlists', df_playlists["id"].isna().value_counts())

# Delete rows with null values
df_playlists.dropna(inplace=True)
print(f'Null in id col of playlists', df_playlists["id"].isna().value_counts())

df_playlists.head()

Shape playlists (6279, 5)
Null in id col of playlists id
False    6279
Name: count, dtype: int64
Null in id col of playlists id
False    6279
Name: count, dtype: int64


Unnamed: 0,id,name,tracks_href,main_image,genre
0,37i9dQZF1DWUH2AzNQzWua,Acoustic Hits: Oldies but Goodies,https://api.spotify.com/v1/playlists/37i9dQZF1...,https://i.scdn.co/image/ab67706f0000000360689d...,acoustic
1,37i9dQZF1DWXmlLSKkfdAk,Acoustic Covers,https://api.spotify.com/v1/playlists/37i9dQZF1...,https://i.scdn.co/image/ab67706f00000003bd1b35...,acoustic
2,37i9dQZF1EIdu0PHOCYQ71,Acoustic Mix,https://api.spotify.com/v1/playlists/37i9dQZF1...,https://seed-mix-image.spotifycdn.com/v6/img/d...,acoustic
3,37i9dQZF1DWWxrt1tiKYiX,Acoustic Pop,https://api.spotify.com/v1/playlists/37i9dQZF1...,https://i.scdn.co/image/ab67706f0000000328ec47...,acoustic
4,4Xv7w5RBLUz71sSzIs4C6b,Acoustic Covers of Popular Songs,https://api.spotify.com/v1/playlists/4Xv7w5RBL...,https://i.scdn.co/image/ab67706c0000bebb5a4a94...,acoustic


In [6]:
# Get tracks from playlists
tracks_playlists = get_tracks_from_playlists(playlists=df_playlists, sp=sp)
df_tracks = pd.DataFrame(tracks_playlists)
print('Shape tracks:', df_tracks.shape)

# Validation of tracks with null values
print('Null in id col of tracks:', df_tracks["id"].isna().value_counts())

# Delete rows with null values
df_tracks.dropna(inplace=True)
print('Null in id col of tracks:', df_tracks["id"].isna().value_counts())

# Delete duplicates by id
df_tracks.drop_duplicates(subset=["id"], inplace=True)
print('Shape tracks (without duplicates):', df_tracks.shape)

100%|██████████| 6279/6279 [48:26<00:00,  2.16it/s]  


Shape tracks: (453593, 5)
Null in id col of tracks: id
False    453421
True        172
Name: count, dtype: int64
Null in id col of tracks: id
False    453421
Name: count, dtype: int64
Shape tracks (without duplicates): (276579, 5)


In [8]:
# Save df_tracks to csv
df_tracks.to_csv('./datasets/tracks.csv', index=False)

In [11]:
# Get audio features from tracks
audio_features = []

for n in tqdm(range(len(df_tracks) // 100 + 1)):
    df_track_pack = df_tracks.iloc[n*100:(n+1)*100]
    audio_features_pack = sp.audio_features(tracks=list(df_track_pack['id']))
    audio_features += audio_features_pack

df_audio_features = pd.DataFrame(audio_features)
print('Shape audio features:', df_audio_features.shape)

100%|██████████| 2766/2766 [19:46<00:00,  2.33it/s]


AttributeError: 'NoneType' object has no attribute 'keys'

In [24]:
idx_af_bads = []
for idx, af in enumerate(audio_features):
    if af is None:
        idx_af_bads.append(idx)

print('Number of None values in audio features:', len(idx_af_bads))

Number of None values in audio features: 156


In [26]:
# Delete None values in audio_features before convert to DataFrame
audio_features = [af for af in audio_features if af is not None]
df_audio_features = pd.DataFrame(audio_features)
print('Shape audio features (without None values):', df_audio_features.shape)
df_audio_features.head()

Shape audio features (without None values): (276423, 18)


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.429,0.661,11,-7.227,1,0.0281,0.00239,0.000121,0.234,0.285,173.372,audio_features,3AJwUDP919kvQ9QcozQPxg,spotify:track:3AJwUDP919kvQ9QcozQPxg,https://api.spotify.com/v1/tracks/3AJwUDP919kv...,https://api.spotify.com/v1/audio-analysis/3AJw...,266773,4
1,0.315,0.715,11,-8.072,0,0.0362,0.00125,8e-06,0.0942,0.497,155.925,audio_features,6Qyc6fS4DsZjB2mRW9DsQs,spotify:track:6Qyc6fS4DsZjB2mRW9DsQs,https://api.spotify.com/v1/tracks/6Qyc6fS4DsZj...,https://api.spotify.com/v1/audio-analysis/6Qyc...,289533,3
2,0.559,0.345,4,-13.496,1,0.0459,0.0576,0.000105,0.141,0.458,84.581,audio_features,3d9DChrdc6BOeFsbrZ3Is0,spotify:track:3d9DChrdc6BOeFsbrZ3Is0,https://api.spotify.com/v1/tracks/3d9DChrdc6BO...,https://api.spotify.com/v1/audio-analysis/3d9D...,264307,4
3,0.472,0.671,4,-4.649,1,0.029,0.0129,0.0,0.159,0.0681,82.952,audio_features,77loZpT5Y5PRP1S451P9Yz,spotify:track:77loZpT5Y5PRP1S451P9Yz,https://api.spotify.com/v1/tracks/77loZpT5Y5PR...,https://api.spotify.com/v1/audio-analysis/77lo...,232800,4
4,0.56,0.595,9,-4.53,1,0.0274,0.199,6.5e-05,0.132,0.144,104.011,audio_features,5hnyJvgoWiQUYZttV4wXy6,spotify:track:5hnyJvgoWiQUYZttV4wXy6,https://api.spotify.com/v1/tracks/5hnyJvgoWiQU...,https://api.spotify.com/v1/audio-analysis/5hny...,267960,4


In [27]:
# Save df_audio_features to csv
df_audio_features.to_csv('./datasets/audio_features.csv', index=False)

In [28]:
# Merge dataframes df_tracks and df_audio_features
df_tracks_audio_features = pd.merge(
    df_tracks,
    df_audio_features,
    on="id",
    how="inner"
)
print(f'Shape audio features (after merge with tracks):', df_tracks_audio_features.shape)
df_tracks_audio_features.head()

Shape audio features (after merge with tracks): (276423, 22)


Unnamed: 0,id,name,track_href_x,album_name,album_id,danceability,energy,key,loudness,mode,...,instrumentalness,liveness,valence,tempo,type,uri,track_href_y,analysis_url,duration_ms,time_signature
0,3AJwUDP919kvQ9QcozQPxg,Yellow,https://api.spotify.com/v1/tracks/3AJwUDP919kv...,Parachutes,6ZG5lRT77aJ3btmArcykra,0.429,0.661,11,-7.227,1,...,0.000121,0.234,0.285,173.372,audio_features,spotify:track:3AJwUDP919kvQ9QcozQPxg,https://api.spotify.com/v1/tracks/3AJwUDP919kv...,https://api.spotify.com/v1/audio-analysis/3AJw...,266773,4
1,6Qyc6fS4DsZjB2mRW9DsQs,Iris,https://api.spotify.com/v1/tracks/6Qyc6fS4DsZj...,Dizzy up the Girl,4UMjBXcRqIgMZ1XumU2x5T,0.315,0.715,11,-8.072,0,...,8e-06,0.0942,0.497,155.925,audio_features,spotify:track:6Qyc6fS4DsZjB2mRW9DsQs,https://api.spotify.com/v1/tracks/6Qyc6fS4DsZj...,https://api.spotify.com/v1/audio-analysis/6Qyc...,289533,3
2,3d9DChrdc6BOeFsbrZ3Is0,Under the Bridge,https://api.spotify.com/v1/tracks/3d9DChrdc6BO...,Blood Sugar Sex Magik (Deluxe Edition),30Perjew8HyGkdSmqguYyg,0.559,0.345,4,-13.496,1,...,0.000105,0.141,0.458,84.581,audio_features,spotify:track:3d9DChrdc6BOeFsbrZ3Is0,https://api.spotify.com/v1/tracks/3d9DChrdc6BO...,https://api.spotify.com/v1/audio-analysis/3d9D...,264307,4
3,77loZpT5Y5PRP1S451P9Yz,The Reason,https://api.spotify.com/v1/tracks/77loZpT5Y5PR...,The Reason (15th Anniversary Deluxe),2zE1YKY7Okj10Tjl09jjth,0.472,0.671,4,-4.649,1,...,0.0,0.159,0.0681,82.952,audio_features,spotify:track:77loZpT5Y5PRP1S451P9Yz,https://api.spotify.com/v1/tracks/77loZpT5Y5PR...,https://api.spotify.com/v1/audio-analysis/77lo...,232800,4
4,5hnyJvgoWiQUYZttV4wXy6,Chasing Cars,https://api.spotify.com/v1/tracks/5hnyJvgoWiQU...,Eyes Open,3k7bXPw2u0C0SBKPMsgMS3,0.56,0.595,9,-4.53,1,...,6.5e-05,0.132,0.144,104.011,audio_features,spotify:track:5hnyJvgoWiQUYZttV4wXy6,https://api.spotify.com/v1/tracks/5hnyJvgoWiQU...,https://api.spotify.com/v1/audio-analysis/5hny...,267960,4


In [30]:
# Drop track_href_x column and rename track_href_y to track_href
df_tracks_audio_features.drop(columns=["track_href_x"], inplace=True)
df_tracks_audio_features.rename(columns={"track_href_y": "track_href"}, inplace=True)

In [31]:
# Save df_tracks_audio_features to csv
df_tracks_audio_features.to_csv('./datasets/tracks_audio_features.csv', index=False)