**Data Preprocessing & Cleaning**

Primary:
- id (Id of track generated by Spotify)

Numerical:
- acousticness (Ranges from 0 to 1)
    - whether the song is acoustic or not, 0(not acoustic)->1(very acoustic)
- danceability (Ranges from 0 to 1)
    - how suitable the track is for dancing, 0(not danceable)->1(very danceable)
- energy (Ranges from 0 to 1)
    - how energetic the track is, 0(less energetic)->1(very energetic)
- duration_ms (Integer typically ranging from 200k to 300k)
    - Time in MS
- instrumentalness (Ranges from 0 to 1)
    - the ratio of instrumental sounds overall, 0(lot of vocal sounds)->1(instrument sounds)
- valence (Ranges from 0 to 1)
    - how positive the music is, 0(sad)->1(cheerful)
- popularity (Ranges from 0 to 100)
    - popularity of track
- tempo (Float typically ranging from 50 to 150)
    - tempo of track in BPM
- liveness (Ranges from 0 to 1)
    - presence of audience, 0(studio record)->1(concert)
- loudness (Float typically ranging from -60 to 0)
    - how loud the song is in dB -60(very quiet)->0(very loud)
- speechiness (Ranges from 0 to 1)
    - the ratio of spoken words to the overall, 0(instrumental)->1(talk show)

Dummy:
- mode 
    - (0 = Minor, 1 = Major)
- explicit 
    - (0 = No explicit content, 1 = Explicit content)

Categorical:
- key (All keys on octave encoded as values ranging from 0 to 11, starting on C as 0, C# as 1 and so on…)
    - the major key of the track, 0:C, 1:C#, 2:D, ..., 11:B
        0. Key of C
        1. Key of C#/Db (enharmonic keys)
        2. Key of D
        3. Key of D#/Eb
        4. Key of E
        5. Key of F
        6. Key of F#/Gb (enharmonic keys)
        7. Key of G
        8. Key of G#/Ab
        9. Key of A
        10. Key of A#/Bb
        11. Key of B
- timesignature 
    - (The predicted timesignature, most typically 4)

In [5]:
import numpy as np
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv
load_dotenv()
# import os
# import re

True

In [6]:
scope = "user-library-read"
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

# sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=os.environ.get('SPOTIPY_CLIENT_ID'),
#                                                            client_secret=os.environ.get('SPOTIPY_CLIENT_SECRET')))


In [7]:
column_names = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
                'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri', 
                'track_href', 'analysis_url', 'duration_ms', 'time_signature']

In [8]:
def user_playlist_tracks_full(spotify_connection=sp, user=None, 
                              playlist_id=None, genre=None):

    # first run through also retrieves total no of songs in library
    response = spotify_connection.user_playlist_tracks(user, playlist_id, limit=100)
    results = response["items"]

    # subsequently runs until it hits the user-defined limit or has read all songs in the library
    while len(results) < response["total"]:
        response = spotify_connection.user_playlist_tracks(
            user, playlist_id, limit=100, offset=len(results)
        )
        results.extend(response["items"])
    
    playlist_song_ids = []
    for each in range(len(results)):
        playlist_song_ids.append((results[each]['track']['id']))
    #Remove possible nones
    playlist_song_ids = [i for i in playlist_song_ids if i]
    
    theDataFrame = pd.DataFrame(columns = column_names)
    while(len(playlist_song_ids)>0):
        theDataFrame = theDataFrame.append(sp.audio_features(playlist_song_ids[:100]),ignore_index=True)
        playlist_song_ids = playlist_song_ids[100:]
    
    theDataFrame['genre'] = genre

    return theDataFrame

In [9]:
alt_metal_songs = user_playlist_tracks_full(playlist_id = '40DeXsA9tEIwNwBmrZ4rkt', genre = 'alternative-metal')
len(alt_metal_songs)

1285

In [10]:
hiphop_songs = user_playlist_tracks_full(playlist_id = '13u9Bn677jEHePtS7XKmih', genre = 'hip-hop')
len(hiphop_songs)

2189

In [11]:
rock_songs = user_playlist_tracks_full(playlist_id = '1SY54UtMrIadoVThZsJShG', genre = 'rock')
len(rock_songs)

1084

In [12]:
pop_songs = user_playlist_tracks_full(playlist_id = '1szFiylNjSI99tpQgVZ3ki', genre = 'pop')
len(pop_songs)

710

In [13]:
all_songs = hiphop_songs.append([pop_songs, rock_songs, alt_metal_songs], ignore_index=True)

In [14]:
all_songs.drop(['type', 'uri', 'track_href', 'analysis_url'], axis = 1, inplace = True)

In [15]:
all_songs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5268 entries, 0 to 5267
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      5268 non-null   float64
 1   energy            5268 non-null   float64
 2   key               5268 non-null   object 
 3   loudness          5268 non-null   float64
 4   mode              5268 non-null   object 
 5   speechiness       5268 non-null   float64
 6   acousticness      5268 non-null   float64
 7   instrumentalness  5268 non-null   float64
 8   liveness          5268 non-null   float64
 9   valence           5268 non-null   float64
 10  tempo             5268 non-null   float64
 11  id                5268 non-null   object 
 12  duration_ms       5268 non-null   object 
 13  time_signature    5268 non-null   object 
 14  genre             5268 non-null   object 
dtypes: float64(9), object(6)
memory usage: 617.5+ KB


In [16]:
all_songs.describe()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
count,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0
mean,0.570284,0.705044,-7.176929,0.145271,0.161369,0.093543,0.2443,0.463618,118.392675
std,0.180104,0.198816,3.371045,0.146549,0.232404,0.226143,0.214532,0.239102,30.073932
min,0.0,2e-05,-35.553,0.0,1e-06,0.0,0.0,0.0,0.0
25%,0.44075,0.583,-8.70725,0.0416,0.003108,0.0,0.102,0.277,93.98025
50%,0.571,0.733,-6.51,0.0787,0.04845,4.9e-05,0.154,0.449,115.0675
75%,0.711,0.863,-4.935,0.22,0.228,0.01635,0.321,0.647,139.914
max,0.966,0.999,-0.839,0.966,0.996,0.998,0.996,0.989,214.838


In [17]:
all_songs.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms,time_signature,genre
0,0.507,0.985,0,-2.392,1,0.153,0.228,3.3e-05,0.19,0.394,150.059,7hQ0ojbeqicGhw0wdUVeaN,139560,4,hip-hop
1,0.625,0.726,10,-7.731,0,0.155,0.00728,0.0,0.318,0.483,103.98,6C7RJEIUDqKkJRZVWdkfkH,311867,4,hip-hop
2,0.552,0.846,9,-4.912,1,0.342,0.0114,0.0,0.309,0.554,98.56,2dxjKgT0li4qBI3QwuN9Ih,236600,1,hip-hop
3,0.685,0.631,7,-10.338,1,0.0926,0.0249,7.7e-05,0.211,0.72,104.126,4oNM6CscdoUZDHvzl18nbt,293493,4,hip-hop
4,0.64,0.864,4,-4.909,0,0.0869,0.0409,2e-06,0.136,0.497,105.107,6XGddj522FQRHXEhBtjiJu,272667,4,hip-hop


In [18]:
all_songs['duration_ms'] = all_songs['duration_ms'].astype(float)

In [19]:
all_songs.describe()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
count,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0
mean,0.570284,0.705044,-7.176929,0.145271,0.161369,0.093543,0.2443,0.463618,118.392675,241000.9
std,0.180104,0.198816,3.371045,0.146549,0.232404,0.226143,0.214532,0.239102,30.073932,85565.94
min,0.0,2e-05,-35.553,0.0,1e-06,0.0,0.0,0.0,0.0,7173.0
25%,0.44075,0.583,-8.70725,0.0416,0.003108,0.0,0.102,0.277,93.98025,198865.8
50%,0.571,0.733,-6.51,0.0787,0.04845,4.9e-05,0.154,0.449,115.0675,235666.0
75%,0.711,0.863,-4.935,0.22,0.228,0.01635,0.321,0.647,139.914,276911.0
max,0.966,0.999,-0.839,0.966,0.996,0.998,0.996,0.989,214.838,2238734.0


In [28]:
all_songs = all_songs.sample(frac=1).reset_index(drop = True)
all_songs.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms,time_signature,genre
0,0.683,0.488,6,-9.473,0,0.135,0.408,0.0,0.132,0.469,75.039,2cx10hB95ygrUp2RsZW7Oh,296413.0,4,hip-hop
1,0.695,0.461,0,-10.731,1,0.0329,0.108,4e-05,0.0701,0.612,92.051,0DZkk6jRkexRCRYrkvC1D8,278093.0,5,rock
2,0.407,0.93,10,-3.288,0,0.0765,8.4e-05,0.00687,0.704,0.319,98.557,06yZCmbAstbzfuj9KF3bez,250253.0,4,alternative-metal
3,0.342,0.787,6,-9.086,1,0.272,0.0724,0.315,0.352,0.483,174.053,2Y10RJnQf7tGOlHowgRv6v,287902.0,5,rock
4,0.843,0.391,2,-7.899,1,0.0845,0.181,0.0,0.137,0.496,129.972,07rmSXN6vNoquX1AsWd9pP,160615.0,4,hip-hop
