In [1]:
# Import required libraries
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the sales data
file_path = Path("Resources/spotify_songs.csv")
df_spotify = pd.read_csv(file_path)

# Display sample data
df_spotify.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,6/14/2019,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),12/13/2019,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),7/5/2019,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,7/19/2019,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),3/5/2019,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


In [3]:
# Identify all column names
df_spotify.columns

Index(['track_id', 'track_name', 'track_artist', 'track_popularity',
       'track_album_id', 'track_album_name', 'track_album_release_date',
       'playlist_name', 'playlist_id', 'playlist_genre', 'playlist_subgenre',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms'],
      dtype='object')

In [4]:
# Redefine the dataframe with just columns needed for modeling
df_spotify_clean= df_spotify[['track_name', 'track_artist', 'track_album_release_date', 'playlist_genre',
       'playlist_subgenre', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms', 'track_popularity']]

df_spotify_clean.head()

Unnamed: 0,track_name,track_artist,track_album_release_date,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,track_popularity
0,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,6/14/2019,pop,dance pop,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754,66
1,Memories - Dillon Francis Remix,Maroon 5,12/13/2019,pop,dance pop,0.726,0.815,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600,67
2,All the Time - Don Diablo Remix,Zara Larsson,7/5/2019,pop,dance pop,0.675,0.931,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616,70
3,Call You Mine - Keanu Silva Remix,The Chainsmokers,7/19/2019,pop,dance pop,0.718,0.93,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093,60
4,Someone You Loved - Future Humans Remix,Lewis Capaldi,3/5/2019,pop,dance pop,0.65,0.833,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052,69


In [5]:
# Define an empty list
banger_class = []

# Loop through the dataframe and categorize each song as a banger (1) or not (0)
# based on if it has a 'track_popularity' score greater than 65
for index, row in df_spotify_clean.iterrows():
    if row['track_popularity'] > 65:
        banger_class.append(1)
    else:
        banger_class.append(0)

banger_class[:15]

[1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0]

In [6]:
# Add the banger_class as a column in the dataframe
df_spotify_clean['bangers'] = banger_class
df_spotify_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_spotify_clean['bangers'] = banger_class


Unnamed: 0,track_name,track_artist,track_album_release_date,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,track_popularity,bangers
0,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,6/14/2019,pop,dance pop,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754,66,1
1,Memories - Dillon Francis Remix,Maroon 5,12/13/2019,pop,dance pop,0.726,0.815,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600,67,1
2,All the Time - Don Diablo Remix,Zara Larsson,7/5/2019,pop,dance pop,0.675,0.931,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616,70,1
3,Call You Mine - Keanu Silva Remix,The Chainsmokers,7/19/2019,pop,dance pop,0.718,0.93,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093,60,0
4,Someone You Loved - Future Humans Remix,Lewis Capaldi,3/5/2019,pop,dance pop,0.65,0.833,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052,69,1


In [7]:
# Set the index to track_name and track_artist
df_spotify_clean = df_spotify_clean.set_index(['track_name', 'track_artist'])
df_spotify_clean.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,track_album_release_date,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,track_popularity,bangers
track_name,track_artist,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
I Don't Care (with Justin Bieber) - Loud Luxury Remix,Ed Sheeran,6/14/2019,pop,dance pop,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754,66,1
Memories - Dillon Francis Remix,Maroon 5,12/13/2019,pop,dance pop,0.726,0.815,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600,67,1
All the Time - Don Diablo Remix,Zara Larsson,7/5/2019,pop,dance pop,0.675,0.931,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616,70,1
Call You Mine - Keanu Silva Remix,The Chainsmokers,7/19/2019,pop,dance pop,0.718,0.93,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093,60,0
Someone You Loved - Future Humans Remix,Lewis Capaldi,3/5/2019,pop,dance pop,0.65,0.833,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052,69,1


In [20]:
df_spotify_clean.columns

Index(['track_album_release_date', 'playlist_genre', 'playlist_subgenre',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms', 'track_popularity', 'bangers'],
      dtype='object')

In [25]:
# The StandardScaler is not working because it doesn't know how to process the datetime.
# Trying to find a way to take this field out and put it back in, or find a way that allows the scaler to process it.


# Use StandardScaler() to standardize the dataset
# Creating StandardScaler instance
scaler = StandardScaler()

scale_columns = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms', 'track_popularity', 'bangers']

# Fitting the scaler to the data and transform
scaled_data = scaler.fit_transform(df_spotify_clean)


df_scaled = pd.DataFram(scaled_data)

scaled_df

TypeError: float() argument must be a string or a real number, not 'Timestamp'

In [12]:
# Check the data types of each column
df_spotify_clean.dtypes

track_album_release_date     object
playlist_genre               object
playlist_subgenre            object
danceability                float64
energy                      float64
key                           int64
loudness                    float64
mode                          int64
speechiness                 float64
acousticness                float64
instrumentalness            float64
liveness                    float64
valence                     float64
tempo                       float64
duration_ms                   int64
track_popularity              int64
bangers                       int64
dtype: object

In [13]:
# Change the 'track_album_release_date' column to a datetime
df_spotify_clean['track_album_release_date'] = pd.to_datetime(df_spotify_clean['track_album_release_date'])
df_spotify_clean.dtypes

track_album_release_date    datetime64[ns]
playlist_genre                      object
playlist_subgenre                   object
danceability                       float64
energy                             float64
key                                  int64
loudness                           float64
mode                                 int64
speechiness                        float64
acousticness                       float64
instrumentalness                   float64
liveness                           float64
valence                            float64
tempo                              float64
duration_ms                          int64
track_popularity                     int64
bangers                              int64
dtype: object

In [14]:
# Get dummies for each of the genre and sub-genres
df_dummies = pd.get_dummies(df_spotify_clean)
df_dummies.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,track_album_release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,...,playlist_subgenre_new jack swing,playlist_subgenre_permanent wave,playlist_subgenre_pop edm,playlist_subgenre_post-teen pop,playlist_subgenre_progressive electro house,playlist_subgenre_reggaeton,playlist_subgenre_southern hip hop,playlist_subgenre_trap,playlist_subgenre_tropical,playlist_subgenre_urban contemporary
track_name,track_artist,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
I Don't Care (with Justin Bieber) - Loud Luxury Remix,Ed Sheeran,2019-06-14,0.748,0.916,6,-2.634,1,0.0583,0.102,0.0,0.0653,...,0,0,0,0,0,0,0,0,0,0
Memories - Dillon Francis Remix,Maroon 5,2019-12-13,0.726,0.815,11,-4.969,1,0.0373,0.0724,0.00421,0.357,...,0,0,0,0,0,0,0,0,0,0
All the Time - Don Diablo Remix,Zara Larsson,2019-07-05,0.675,0.931,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,...,0,0,0,0,0,0,0,0,0,0
Call You Mine - Keanu Silva Remix,The Chainsmokers,2019-07-19,0.718,0.93,7,-3.778,1,0.102,0.0287,9e-06,0.204,...,0,0,0,0,0,0,0,0,0,0
Someone You Loved - Future Humans Remix,Lewis Capaldi,2019-03-05,0.65,0.833,1,-4.672,1,0.0359,0.0803,0.0,0.0833,...,0,0,0,0,0,0,0,0,0,0


In [16]:
df_dummies.columns

Index(['track_album_release_date', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'track_popularity', 'bangers',
       'playlist_genre_edm', 'playlist_genre_latin', 'playlist_genre_pop',
       'playlist_genre_r&b', 'playlist_genre_rap', 'playlist_genre_rock',
       'playlist_subgenre_album rock', 'playlist_subgenre_big room',
       'playlist_subgenre_classic rock', 'playlist_subgenre_dance pop',
       'playlist_subgenre_electro house', 'playlist_subgenre_electropop',
       'playlist_subgenre_gangster rap', 'playlist_subgenre_hard rock',
       'playlist_subgenre_hip hop', 'playlist_subgenre_hip pop',
       'playlist_subgenre_indie poptimism', 'playlist_subgenre_latin hip hop',
       'playlist_subgenre_latin pop', 'playlist_subgenre_neo soul',
       'playlist_subgenre_new jack swing', 'playlist_subgenre_permanent wave',
       'playlist_subgenre_pop edm', 'playl

In [19]:
# Use StandardScaler() to standardize the dataset
# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting the scaler to the data and transform
scaled_df = scaler.fit_transform(df_spotify_clean)

scaled_df.head()

KeyError: "['playlist_genre_edm', 'playlist_genre_latin', 'playlist_genre_pop', 'playlist_genre_r&b', 'playlist_genre_rap', 'playlist_genre_rock', 'playlist_subgenre_album rock', 'playlist_subgenre_big room', 'playlist_subgenre_classic rock', 'playlist_subgenre_dance pop', 'playlist_subgenre_electro house', 'playlist_subgenre_electropop', 'playlist_subgenre_gangster rap', 'playlist_subgenre_hard rock', 'playlist_subgenre_hip hop', 'playlist_subgenre_hip pop', 'playlist_subgenre_indie poptimism', 'playlist_subgenre_latin hip hop', 'playlist_subgenre_latin pop', 'playlist_subgenre_neo soul', 'playlist_subgenre_new jack swing', 'playlist_subgenre_permanent wave', 'playlist_subgenre_pop edm', 'playlist_subgenre_post-teen pop', 'playlist_subgenre_progressive electro house', 'playlist_subgenre_reggaeton', 'playlist_subgenre_southern hip hop', 'playlist_subgenre_trap', 'playlist_subgenre_tropical', 'playlist_subgenre_urban contemporary'] not in index"