In [16]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from Secrets import CLIENT_ID, CLIENT_SECRET, REDIRECT_URL
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [17]:
# Authenticated Spotify client; the OAuth manager is built from the values
# imported from the local Secrets module.
oauth_manager = SpotifyOAuth(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    redirect_uri=REDIRECT_URL,
    scope="user-library-read",
)
sp = spotipy.Spotify(auth_manager=oauth_manager)

# Load the track dataset and discard its first column.
df = pd.read_csv('./dataset.csv')
df = df.drop(columns=df.columns[0])

# Raise pandas' display limits for rows, columns and line width.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 504),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)


In [18]:
#6meRpNHvKC1VcBl5MIbVxo - hehe
#1Ob3QykC4dzf3rQwWxE9LV - study

# Playlist whose track IDs are collected below.
playlist_id = '6meRpNHvKC1VcBl5MIbVxo'

# Only the item count is needed here, so request just that field instead of
# downloading a full page of playlist items.
playlist_length = sp.playlist_items(playlist_id, fields='total')['total']

page_size = 100  # items requested per call

# NOTE(review): 'CAN' is a three-letter code; Spotify's documentation describes
# `market` as an ISO 3166-1 alpha-2 code (e.g. 'CA') - confirm the intended value.
track_ids = []
# Stepping the offset up to `playlist_length` requests exactly the pages that
# hold items; there is no trailing empty request when the length is a
# multiple of the page size.
for offset in range(0, playlist_length, page_size):
    playlist = sp.playlist_tracks(playlist_id, market='CAN', fields='items', limit=page_size, offset=offset)
    for item in playlist['items']:
        track = item['track']
        # Entries without a track object or without an ID cannot be looked up
        # later, so they are left out.
        if track is not None and track['id'] is not None:
            track_ids.append(track['id'])


In [19]:
def get_artist_genre(artist_id):
    """Return the first genre listed for an artist, or None if none are listed.

    Looks the artist up through the module-level Spotify client ``sp`` and
    reads the ``genres`` field of the returned record.
    """
    genres = sp.artist(artist_id)['genres']
    return genres[0] if genres else None


In [20]:
playlist_audio_data = []
# Artist name -> first listed genre (or None); filled by the genre lookup cell.
artist_genres = {}

# Compare as sets so the IDs absent from the dataset are a single set difference.
track_ids = set(track_ids)
df_track_ids = set(df['track_id'].values)

#Track ids that we need to add to the dataset
# Sorted so the list order (and therefore the order rows are later appended in)
# is the same on every run; iteration order of a set of strings can differ
# between interpreter sessions.
missing_track_ids = sorted(track_ids - df_track_ids)

In [21]:
#Fetching missing track info from the Spotify API
count = 0

# Audio features, requested in slices of 100 track IDs and flattened into one list.
missing_track_audio_features = [
    features
    for start in range(0, len(missing_track_ids), 100)
    for features in sp.audio_features(tracks=missing_track_ids[start : start + 100])
]

# Track metadata, requested in slices of 50 track IDs; each response carries
# its records under the 'tracks' key.
missing_track_info = [
    info
    for start in range(0, len(missing_track_ids), 50)
    for info in sp.tracks(missing_track_ids[start : start + 50])['tracks']
]

In [22]:
#Processing audio features and info for each track in the playlist
# Shallow copies: new list objects holding the same records as the raw results.
track_audio_features = list(missing_track_audio_features)
track_info = list(missing_track_info)

# ID of the first-listed artist of every fetched track, in track order.
track_artist_ids = []
for info in missing_track_info:
    track_artist_ids.append(info['artists'][0]['id'])

In [23]:
#Fetching artist Genres
# Look each artist up once: tracks can share a first-listed artist, and a
# repeated ID only costs extra API calls. dict.fromkeys keeps first-seen order,
# so the resulting mapping is the same as when every ID is requested.
unique_artist_ids = list(dict.fromkeys(track_artist_ids))

for start in range(0, len(unique_artist_ids), 50):
    results = sp.artists(unique_artist_ids[start : start + 50])
    for artist in results['artists']:
        # Skip any slot that did not come back as an artist record.
        if artist is None:
            continue
        first_genre = artist['genres'][0] if artist['genres'] else None
        # setdefault keeps the first genre recorded for a given name.
        # NOTE(review): the mapping is keyed by artist name, so distinct
        # artists that share a name share one entry.
        artist_genres.setdefault(artist['name'], first_genre)

In [24]:
#Processing and adding the audio features to the dataset
# Fields copied verbatim from each track's audio-features record.
audio_features_keys = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']

# Rows are collected first and appended in a single concat; growing the frame
# one row at a time copies it on every iteration.
new_rows = []
for index, track_id in enumerate(missing_track_ids):
    info = track_info[index]
    features = track_audio_features[index]
    # A lookup that found nothing may come back as None in the result list
    # (see the Spotify Web API reference); such tracks are skipped here
    # instead of failing on a None subscript.
    if info is None or features is None:
        continue

    track_artists = [artist['name'] for artist in info['artists']]
    # The genre mapping is keyed by artist name; a name with no entry yields None.
    main_artist_genre = artist_genres.get(track_artists[0])

    new_rows.append({
        'track_id': track_id,
        'artists': ';'.join(track_artists),
        'album_name': info['album']['name'],
        'track_name': info['name'],
        'popularity': info['popularity'],
        'duration_ms': info['duration_ms'],
        'explicit': info['explicit'],
        **{key: features[key] for key in audio_features_keys},
        'track_genre': main_artist_genre
    })

# Nothing to append leaves the dataset untouched. `columns=df.columns` aligns
# the new rows to the existing schema, and ignore_index continues the
# positional 0..n-1 row labels.
if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows, columns=df.columns)], ignore_index=True)

## Feature Engineering

In [25]:
# Columns rescaled with MinMaxScaler; every other column is left as it is.
features_to_scale = [
    'popularity',
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'time_signature',
]

# Fit on the selected columns, then write the transformed values back in place
# of the originals.
scaler = MinMaxScaler()
scaled_values = scaler.fit(df[features_to_scale]).transform(df[features_to_scale])
df[features_to_scale] = scaled_values

In [26]:
#Binary feature is_major derived from mode (1 -> Major, 0 -> Minor)
#Tempo buckets (0 - 100, 100 - 125, 125 - 150, 150+) ✔
#Key profiles one-hot encoded (OHE) ✔

In [27]:
#OHE for Keys
# One indicator column per key value, named key_<value>; drop_first omits the
# first category's column.
OHE = pd.get_dummies(df['key'], prefix="key", drop_first=True, dtype=int)

# Swap the original 'key' column for its indicator columns.
df_without_key = df.drop(columns='key')
df = pd.concat([df_without_key, OHE], axis=1)
print(df.head())

                 track_id                 artists                                         album_name                  track_name  popularity  duration_ms  explicit  danceability  energy  loudness  mode  speechiness  acousticness  instrumentalness  liveness   valence    tempo  time_signature track_genre  key_1  key_2  key_3  key_4  key_5  key_6  key_7  key_8  key_9  key_10  key_11
0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino                                             Comedy                      Comedy        0.73       230666     False      0.686294  0.4610  0.791392     0     0.148187      0.032329          0.000001    0.3580  0.718593   87.917             0.8    acoustic      1      0      0      0      0      0      0      0      0       0       0
1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward                                   Ghost (Acoustic)            Ghost - Acoustic        0.55       149610     False      0.426396  0.1660  0.597377     1     0.079067      0.927711      

In [28]:
#Creating buckets for tempo
# Left-closed intervals (right=False):
#   [0, 100) Slow, [100, 125) Moderate, [125, 150) Fast, [150, inf) Very Fast.
# The last edge is open-ended so every tempo of 150 or more lands in
# 'Very Fast' instead of becoming NaN above a fixed cap.
bins = [0, 100, 125, 150, float('inf')]
tempo_labels = ['Slow', 'Moderate', 'Fast', 'Very Fast']
df['tempo'] = pd.cut(df['tempo'], bins, right=False, labels=tempo_labels)
print(df.head())

                 track_id                 artists                                         album_name                  track_name  popularity  duration_ms  explicit  danceability  energy  loudness  mode  speechiness  acousticness  instrumentalness  liveness   valence      tempo  time_signature track_genre  key_1  key_2  key_3  key_4  key_5  key_6  key_7  key_8  key_9  key_10  key_11
0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino                                             Comedy                      Comedy        0.73       230666     False      0.686294  0.4610  0.791392     0     0.148187      0.032329          0.000001    0.3580  0.718593       Slow             0.8    acoustic      1      0      0      0      0      0      0      0      0       0       0
1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward                                   Ghost (Acoustic)            Ghost - Acoustic        0.55       149610     False      0.426396  0.1660  0.597377     1     0.079067      0.927711  

In [30]:
# Replace the numeric mode flag with a text label; values missing from the
# mapping become NaN.
mode_labels = {1: 'major', 0: 'minor'}
df['mode'] = df['mode'].map(mode_labels)
print(df.head())

                 track_id                 artists                                         album_name                  track_name  popularity  duration_ms  explicit  danceability  energy  loudness   mode  speechiness  acousticness  instrumentalness  liveness   valence      tempo  time_signature track_genre  key_1  key_2  key_3  key_4  key_5  key_6  key_7  key_8  key_9  key_10  key_11
0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino                                             Comedy                      Comedy        0.73       230666     False      0.686294  0.4610  0.791392  minor     0.148187      0.032329          0.000001    0.3580  0.718593       Slow             0.8    acoustic      1      0      0      0      0      0      0      0      0       0       0
1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward                                   Ghost (Acoustic)            Ghost - Acoustic        0.55       149610     False      0.426396  0.1660  0.597377  major     0.079067      0.92771