In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from xgboost import plot_importance
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [6]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# 1. Read in all source data

In [7]:
# 1. All Billboard Top 100 songs from 1958-2024 w/o the necessary song features *source of truth*
data_hits = pd.read_csv('../data/raw/charts_billboard_1958_2024.csv')

# 2. Billboard hits of unspecified data range w/ all features, missing year
df_hits = pd.read_csv('../data/raw/hot_100_with_audio_features.csv')

# 3. Dataset of ~1.2 million songs from 2000-2023 w/ all features
data_1m = pd.read_csv('../data/raw/spotify_data.csv')

# 4. Dataset of 1.2 mil tracks w/ all features
df1 = pd.read_csv('../data/raw/spotify_1million.csv')

# 2. Data Preparation

## Preliminary clean-up of datasets (column formatting, filtering, etc)

### 1) Cleaning up Billboard Top 100 dataset (source of truth) to format year and limit to relevant columns

In [8]:
data_hits.head(1)

Unnamed: 0,Date,Song,Artist,Rank,Last Week,Peak Position,Weeks in Charts,Image URL
0,8/6/1958,Poor Little Fool,Ricky Nelson,1,1,1,2,#


In [9]:
# Add a column for the year instead of full release date to align with other datasets
data_hits['year'] = pd.to_datetime(data_hits['Date'], errors='coerce', format='%m/%d/%Y').dt.year
data_hits.head(1)

Unnamed: 0,Date,Song,Artist,Rank,Last Week,Peak Position,Weeks in Charts,Image URL,year
0,8/6/1958,Poor Little Fool,Ricky Nelson,1,1,1,2,#,1958


In [10]:
# Extract the columns we need to identify a song
data_hits = data_hits[['Song', 'Artist', 'year']]

### 2) Cleaning up billboard hits dataset that contains audio features and selecting columns

In [11]:
df_hits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29503 entries, 0 to 29502
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   SongID                     29503 non-null  object 
 1   Performer                  29503 non-null  object 
 2   Song                       29503 non-null  object 
 3   spotify_genre              27903 non-null  object 
 4   spotify_track_id           24397 non-null  object 
 5   spotify_track_preview_url  14491 non-null  object 
 6   duration_ms                24397 non-null  float64
 7   explicit                   24397 non-null  object 
 8   album                      24391 non-null  object 
 9   danceability               24334 non-null  float64
 10  energy                     24334 non-null  float64
 11  key                        24334 non-null  float64
 12  loudness                   24334 non-null  float64
 13  mode                       24334 non-null  flo

In [12]:
df_hits.columns

Index(['SongID', 'Performer', 'Song', 'spotify_genre', 'spotify_track_id',
       'spotify_track_preview_url', 'duration_ms', 'explicit', 'album',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature', 'spotify_track_popularity'],
      dtype='object')

In [13]:
# Extract the features we need
df_hits = df_hits[['Performer', 'Song', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']]

In [14]:
df_hits = df_hits.dropna()

In [15]:
df_hits.head(1)

Unnamed: 0,Performer,Song,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
2,Andy Williams,......And Roses And Roses,166106.0,0.154,0.185,5.0,-14.063,1.0,0.0315,0.911,0.000267,0.112,0.15,83.969,4.0


In [16]:
len(df_hits)

24330

### 3) Checking the first 1.2 million songs dataset, no upfront manipulation is necessary (Year 2000-2023)

In [17]:
data_1m.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1159764 entries, 0 to 1159763
Data columns (total 20 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Unnamed: 0        1159764 non-null  int64  
 1   artist_name       1159749 non-null  object 
 2   track_name        1159763 non-null  object 
 3   track_id          1159764 non-null  object 
 4   popularity        1159764 non-null  int64  
 5   year              1159764 non-null  int64  
 6   genre             1159764 non-null  object 
 7   danceability      1159764 non-null  float64
 8   energy            1159764 non-null  float64
 9   key               1159764 non-null  int64  
 10  loudness          1159764 non-null  float64
 11  mode              1159764 non-null  int64  
 12  speechiness       1159764 non-null  float64
 13  acousticness      1159764 non-null  float64
 14  instrumentalness  1159764 non-null  float64
 15  liveness          1159764 non-null  float64
 16  

In [18]:
data_1m.head(1)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3


In [19]:
data_1m.describe().loc[['min', 'max']]

Unnamed: 0.1,Unnamed: 0,popularity,year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
min,0.0,0.0,2000.0,0.0,0.0,0.0,-58.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2073.0,0.0
max,1473395.0,100.0,2023.0,0.993,1.0,11.0,6.172,1.0,0.971,0.996,1.0,1.0,1.0,249.993,6000495.0,5.0


### 4) Cleaning up artists column in another 1.2 million track dataset to remove random characters (Year 1900-2020)

In [20]:
df1.describe()

Unnamed: 0,track_number,disc_number,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year
count,1204025.0,1204025.0,1204025.0,1204025.0,1204025.0,1204025.0,1204025.0,1204025.0,1204025.0,1204025.0,1204025.0,1204025.0,1204025.0,1204025.0,1204025.0,1204025.0
mean,7.656352,1.055906,0.4930565,0.5095363,5.194151,-11.8087,0.6714595,0.08438219,0.4467511,0.2828605,0.2015994,0.4279866,117.6344,248839.9,3.832494,2007.328
std,5.994977,0.2953752,0.1896694,0.2946839,3.536731,6.982132,0.4696827,0.1159914,0.3852014,0.3762844,0.1804591,0.2704846,30.93705,162210.4,0.5611826,12.10117
min,1.0,1.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,0.0,0.0
25%,3.0,1.0,0.356,0.252,2.0,-15.254,0.0,0.0351,0.0376,7.6e-06,0.0968,0.191,94.054,174090.0,4.0,2002.0
50%,7.0,1.0,0.501,0.524,5.0,-9.791,1.0,0.0446,0.389,0.00808,0.125,0.403,116.726,224339.0,4.0,2009.0
75%,10.0,1.0,0.633,0.766,8.0,-6.717,1.0,0.0723,0.861,0.719,0.245,0.644,137.046,285840.0,4.0,2015.0
max,50.0,13.0,1.0,1.0,11.0,7.234,1.0,0.969,0.996,1.0,1.0,1.0,248.934,6061090.0,5.0,2020.0


In [21]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1204025 entries, 0 to 1204024
Data columns (total 24 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   id                1204025 non-null  object 
 1   name              1204022 non-null  object 
 2   album             1204014 non-null  object 
 3   album_id          1204025 non-null  object 
 4   artists           1204025 non-null  object 
 5   artist_ids        1204025 non-null  object 
 6   track_number      1204025 non-null  int64  
 7   disc_number       1204025 non-null  int64  
 8   explicit          1204025 non-null  bool   
 9   danceability      1204025 non-null  float64
 10  energy            1204025 non-null  float64
 11  key               1204025 non-null  int64  
 12  loudness          1204025 non-null  float64
 13  mode              1204025 non-null  int64  
 14  speechiness       1204025 non-null  float64
 15  acousticness      1204025 non-null  float64
 16  

##### Correcting values and removing uneccessary characters

In [22]:
# Finding tracks where year is returning zero
df1[df1['year'] == 0]

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date
815351,035h5flqzwF6I5CTfsdHPA,Jimmy Neutron,Optimism 2,211vSdhxt58A943r9QWRKo,['iCizzle'],['7arv4matK2uKJrdtPSxU4i'],1,1,False,0.795,...,0.0519,0.0156,0.439,0.086,0.389,109.985,183000,4.0,0,0
815352,49x05fLGDKCsCUA7CG0VpY,I Luv You,Optimism 2,211vSdhxt58A943r9QWRKo,['iCizzle'],['7arv4matK2uKJrdtPSxU4i'],2,1,False,0.762,...,0.095,0.887,0.909,0.106,0.728,92.962,145161,4.0,0,0
815353,4mNLlSoZOqoPauBAF3bIpx,My Heart,Optimism 2,211vSdhxt58A943r9QWRKo,['iCizzle'],['7arv4matK2uKJrdtPSxU4i'],3,1,False,0.671,...,0.0662,0.00956,0.902,0.0455,0.893,97.865,176561,4.0,0,0
815354,7w5iwI0wnIiopbCFNe1Txo,I Am (Invincible),Optimism 2,211vSdhxt58A943r9QWRKo,['iCizzle'],['7arv4matK2uKJrdtPSxU4i'],4,1,False,0.759,...,0.128,0.00544,0.895,0.0538,0.537,89.989,192000,4.0,0,0
815355,2Tfy2R2uiWVwxHQUT6oGNp,Flower Power,Optimism 2,211vSdhxt58A943r9QWRKo,['iCizzle'],['7arv4matK2uKJrdtPSxU4i'],5,1,False,0.657,...,0.281,0.018,0.245,0.241,0.964,179.904,138666,4.0,0,0
815356,05cTbSPQyha6z7opYwH67O,Heard It Low,Optimism 2,211vSdhxt58A943r9QWRKo,['iCizzle'],['7arv4matK2uKJrdtPSxU4i'],6,1,False,0.728,...,0.0673,0.00785,0.275,0.0865,0.662,90.01,138667,4.0,0,0
815357,1fYK5xB8csOXVEqApkzzm0,Hangin On,Optimism 2,211vSdhxt58A943r9QWRKo,['iCizzle'],['7arv4matK2uKJrdtPSxU4i'],7,1,False,0.822,...,0.0758,0.115,0.881,0.121,0.766,119.998,142620,4.0,0,0
815358,4G51c7cWzB6CLaRq9sYj2w,God Loves You,Optimism 2,211vSdhxt58A943r9QWRKo,['iCizzle'],['7arv4matK2uKJrdtPSxU4i'],8,1,False,0.845,...,0.0662,0.00274,0.548,0.0393,0.472,120.09,161000,4.0,0,0
815359,45fcUAjXlzDxTwSzoUaO6l,You In My Life,Optimism 2,211vSdhxt58A943r9QWRKo,['iCizzle'],['7arv4matK2uKJrdtPSxU4i'],9,1,False,0.957,...,0.0623,0.133,0.857,0.0968,0.258,112.987,214867,4.0,0,0
815360,35TcKSN5hsGcZLrFPkUvIv,I Wonder,Optimism 2,211vSdhxt58A943r9QWRKo,['iCizzle'],['7arv4matK2uKJrdtPSxU4i'],10,1,False,0.659,...,0.0581,0.00196,0.854,0.371,0.877,146.02,180822,4.0,0,0


In [23]:
# Only one album is returning year = 0, confirmed from Spotify that Optimism 2 by Icizzle was released 2018 so adding that back in
df1.loc[(df1['year'] == 0) & (df1['album'] == 'Optimism 2') & (df1['artists'].str.contains('iCizzle')), 'year'] = 2018

In [24]:
# The artists column has special characters. Removed those.
df1['artists'] = df1['artists'].str.replace(r"[\[\]()']", '', regex=True)

In [25]:
df1.describe().loc[['min', 'max']]

Unnamed: 0,track_number,disc_number,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year
min,1.0,1.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,0.0,1900.0
max,50.0,13.0,1.0,1.0,11.0,7.234,1.0,0.969,0.996,1.0,1.0,1.0,248.934,6061090.0,5.0,2020.0


# 3. Dataset Merging

## Normalizing song + artist columns to create a unique common "song_artist" for all four datasets

In [26]:
# Rename the columns to song and artist
data_hits = data_hits.copy()
data_hits.rename(columns={'Song': 'song', 'Artist': 'artist'}, inplace=True)

data_1m = data_1m.copy()
data_1m.rename(columns={'track_name': 'song', 'artist_name': 'artist'}, inplace=True)

df1 = df1.copy()
df1.rename(columns={'name': 'song', 'artists': 'artist'}, inplace=True)

df_hits = df_hits.copy()
df_hits.rename(columns={'Song':'song', 'Performer':'artist'}, inplace=True)

In [27]:
# Remove any spaces and normalize potential capitalization discrepancies
data_hits['song'] = data_hits['song'].str.lower().str.strip()
data_hits['artist'] = data_hits['artist'].str.lower().str.strip()

data_1m['song'] = data_1m['song'].str.lower().str.strip()
data_1m['artist'] = data_1m['artist'].str.lower().str.strip()

df1['song'] = df1['song'].str.lower().str.strip()
df1['artist'] = df1['artist'].str.lower().str.strip()

df_hits['song'] = df_hits['song'].str.lower().str.strip()
df_hits['artist'] = df_hits['artist'].str.lower().str.strip()

In [28]:
# Create a new column "song_artist" in all four datasets
data_hits['song_artist'] = data_hits['song'] + "_" + data_hits['artist']
data_1m['song_artist'] = data_1m['song'] + "_" + data_1m['artist']
df1['song_artist'] = df1['song'] + "_" + df1['artist']
df_hits['song_artist'] = df_hits['song'] + "_" + df_hits['artist']

## Removing duplicate song_artist pairings from all four datasets

In [29]:
#tracks can be chart toppers many weeks in a row, this removes duplicate songs from the dataset
data_hits_unique = data_hits.drop_duplicates(subset=['song','artist'])
print("total billboard top 100 1958-2024 unique songs:", len(data_hits_unique))

# find if there are any duplicate tracks in this dataset and remove them if they exist
data_1m_unique = data_1m.drop_duplicates(subset=['song','artist'])
print("total 1.2 million track dataset unique songs:", len(data_1m_unique))

# find if there are any duplicate tracks in this dataset and remove them if they exist
df1_unique = df1.drop_duplicates(subset=['song','artist'])
print("total 1 million track dataset unique songs:", len(df1_unique))

#tracks can be chart toppers many weeks in a row, this removes duplicate songs from the dataset
df_hits_unique = df_hits.drop_duplicates(subset=['song','artist'])
print("total hot 100 unique songs:", len(df_hits_unique))

total billboard top 100 1958-2024 unique songs: 31005
total 1.2 million track dataset unique songs: 1151896
total 1 million track dataset unique songs: 1139057
total hot 100 unique songs: 24219


## Combine the two datasets of million songs on common features and remove duplicate song_artist values

In [30]:
df1_unique.columns.sort_values()

Index(['acousticness', 'album', 'album_id', 'artist', 'artist_ids',
       'danceability', 'disc_number', 'duration_ms', 'energy', 'explicit',
       'id', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'release_date', 'song', 'song_artist', 'speechiness', 'tempo',
       'time_signature', 'track_number', 'valence', 'year'],
      dtype='object')

In [31]:
data_1m_unique.columns.sort_values()

Index(['Unnamed: 0', 'acousticness', 'artist', 'danceability', 'duration_ms',
       'energy', 'genre', 'instrumentalness', 'key', 'liveness', 'loudness',
       'mode', 'popularity', 'song', 'song_artist', 'speechiness', 'tempo',
       'time_signature', 'track_id', 'valence', 'year'],
      dtype='object')

In [32]:
conserved_columns = list(df1_unique.columns.intersection(data_1m_unique.columns))
conserved_columns

['song',
 'artist',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'duration_ms',
 'time_signature',
 'year',
 'song_artist']

In [33]:
# Consolidate the two datasets of million songs
df_2m = pd.merge(df1_unique[conserved_columns], data_1m_unique[conserved_columns], on=conserved_columns, how='outer')

In [34]:
df_2m.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2271907 entries, 0 to 2271906
Data columns (total 17 columns):
 #   Column            Dtype  
---  ------            -----  
 0   song              object 
 1   artist            object 
 2   danceability      float64
 3   energy            float64
 4   key               int64  
 5   loudness          float64
 6   mode              int64  
 7   speechiness       float64
 8   acousticness      float64
 9   instrumentalness  float64
 10  liveness          float64
 11  valence           float64
 12  tempo             float64
 13  duration_ms       int64  
 14  time_signature    float64
 15  year              int64  
 16  song_artist       object 
dtypes: float64(10), int64(4), object(3)
memory usage: 294.7+ MB


In [35]:
# Drop duplicate song_artist pairs in the consolidated dataset
df_2m_unique = df_2m.drop_duplicates(subset=['song_artist'], keep='first')
len(df_2m_unique)

2202814

## Consolidate the two lists of Billboard Top 100 hit songs

In [36]:
common_columns = ['song', 'artist', 'song_artist']
top_hits_all = pd.merge(data_hits_unique, df_hits_unique, on=common_columns, how='left')

In [37]:
top_hits_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31005 entries, 0 to 31004
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   song              31005 non-null  object 
 1   artist            31005 non-null  object 
 2   year              31005 non-null  int32  
 3   song_artist       31005 non-null  object 
 4   duration_ms       23256 non-null  float64
 5   danceability      23256 non-null  float64
 6   energy            23256 non-null  float64
 7   key               23256 non-null  float64
 8   loudness          23256 non-null  float64
 9   mode              23256 non-null  float64
 10  speechiness       23256 non-null  float64
 11  acousticness      23256 non-null  float64
 12  instrumentalness  23256 non-null  float64
 13  liveness          23256 non-null  float64
 14  valence           23256 non-null  float64
 15  tempo             23256 non-null  float64
 16  time_signature    23256 non-null  float6

## Find overlapping songs between the 2.2 million songs dataset and the consolidated list of Billboard hit songs (A dataset of Billboard hit songs w/ features)

In [38]:
# Find where song_artist pairs from Billboard Top 100 overlap with the 2.2 million songs dataset
overlap_tracks = df_2m_unique[df_2m_unique['song_artist'].isin(top_hits_all['song_artist'])]

In [39]:
overlap_tracks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11545 entries, 1 to 2269074
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   song              11545 non-null  object 
 1   artist            11545 non-null  object 
 2   danceability      11545 non-null  float64
 3   energy            11545 non-null  float64
 4   key               11545 non-null  int64  
 5   loudness          11545 non-null  float64
 6   mode              11545 non-null  int64  
 7   speechiness       11545 non-null  float64
 8   acousticness      11545 non-null  float64
 9   instrumentalness  11545 non-null  float64
 10  liveness          11545 non-null  float64
 11  valence           11545 non-null  float64
 12  tempo             11545 non-null  float64
 13  duration_ms       11545 non-null  int64  
 14  time_signature    11545 non-null  float64
 15  year              11545 non-null  int64  
 16  song_artist       11545 non-null  object 
d

In [40]:
overlap_tracks.sort_values(by='song_artist')

Unnamed: 0,song,artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,song_artist
1843264,#1,nelly,0.690,0.592,7,-5.973,1,0.3000,0.02360,0.000000,0.5600,0.466,179.980,198760,4.0,2002,#1_nelly
1256776,#selfie,the chainsmokers,0.789,0.916,0,-3.262,1,0.2490,0.01350,0.000008,0.0770,0.658,127.956,183750,4.0,2014,#selfie_the chainsmokers
654479,'65 love affair,paul davis,0.617,0.691,2,-3.643,0,0.0268,0.35100,0.000001,0.0831,0.915,156.109,219827,4.0,2008,'65 love affair_paul davis
584385,'til i can make it on my own,tammy wynette,0.462,0.300,7,-11.937,1,0.0300,0.77400,0.000004,0.1180,0.151,140.767,181800,4.0,1972,'til i can make it on my own_tammy wynette
2064559,'til my baby comes home,luther vandross,0.813,0.665,11,-7.887,0,0.1630,0.04000,0.000011,0.0276,0.852,139.556,332013,4.0,2007,'til my baby comes home_luther vandross
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1592114,yummy,justin bieber,0.676,0.506,9,-6.652,0,0.0958,0.34500,0.000000,0.1180,0.497,145.842,208520,4.0,2020,yummy_justin bieber
1333542,zero,chris brown,0.731,0.818,1,-4.564,0,0.0638,0.05170,0.000000,0.0743,0.812,120.993,214600,4.0,2015,zero_chris brown
222777,zip code,the five americans,0.400,0.446,9,-8.624,1,0.0276,0.02830,0.000269,0.0653,0.963,143.408,152827,4.0,2003,zip code_the five americans
683131,zombie,bad wolves,0.448,0.826,2,-3.244,0,0.0319,0.00756,0.000000,0.1170,0.190,77.093,254805,4.0,2018,zombie_bad wolves


In [41]:
# identify columns in common to merge dataframes on
common_columns = list(top_hits_all.columns.intersection(overlap_tracks.columns))
common_columns

['song',
 'artist',
 'year',
 'song_artist',
 'duration_ms',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature']

In [42]:
# merge top hits dataset + song features with overlapping top tracks from 2.3 mil dataset
# this will maximize the number of hit songs from our source of truth that will have datapoints for song features

top_hits_features = pd.merge(overlap_tracks, top_hits_all, on=common_columns, how='outer')

# drop duplicate columns, duplicate song/artists and songs without feature data available
top_hits_features = top_hits_features.drop_duplicates(subset = 'song_artist', keep='first')
top_hits_features = top_hits_features [[col for col in top_hits_features.columns if not col.endswith('_y')]]

top_hits_features = top_hits_features.dropna()

## Add binary 0/1 column to the overlapping Billboard hit songs to identify these as hit songs (1 = hit song, which all of these are)

In [43]:
top_hits_features.loc[:, 'hit_song'] = 1

In [44]:
top_hits_features.head(1)

Unnamed: 0,song,artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,song_artist,hit_song
0,guerrilla radio,rage against the machine,0.599,0.957,11.0,-5.764,1.0,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200.0,4.0,1999,guerrilla radio_rage against the machine,1


In [45]:
top_hits_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23925 entries, 0 to 39981
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   song              23925 non-null  object 
 1   artist            23925 non-null  object 
 2   danceability      23925 non-null  float64
 3   energy            23925 non-null  float64
 4   key               23925 non-null  float64
 5   loudness          23925 non-null  float64
 6   mode              23925 non-null  float64
 7   speechiness       23925 non-null  float64
 8   acousticness      23925 non-null  float64
 9   instrumentalness  23925 non-null  float64
 10  liveness          23925 non-null  float64
 11  valence           23925 non-null  float64
 12  tempo             23925 non-null  float64
 13  duration_ms       23925 non-null  float64
 14  time_signature    23925 non-null  float64
 15  year              23925 non-null  int64  
 16  song_artist       23925 non-null  object 
 17

## Identify a dataset of songs that have not been on Billboard Top 100 by removing the Billboard hit songs from the 2.2 million songs dataset

In [46]:
len(df_2m_unique)

2202814

In [47]:
non_hit_tracks = df_2m_unique[~df_2m_unique['song_artist'].isin(top_hits_features['song_artist'])]
len(non_hit_tracks)

2191269

In [48]:
#confirm that the difference between the original dataframe and the filtered one is only the # of overlapping tracks
len(df_2m_unique) - len(non_hit_tracks)

11545

## Add binary 0/1 column to the non-hit songs (0 = not a hit song)

In [49]:
non_hit_tracks.loc[:, 'hit_song'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_hit_tracks.loc[:, 'hit_song'] = 0


In [50]:
non_hit_tracks.head(1)

Unnamed: 0,song,artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,song_artist,hit_song
0,testify,rage against the machine,0.47,0.978,7,-5.399,1,0.0727,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,testify_rage against the machine,0


## Creation of intermediate datasets:
- top_hits_features: merged dataset of hit songs list with audio feature values
- non_hit_tracks: merged dataset of all non-hit songs (any hit songs present in original datasets removed)

In [51]:
top_hits_features.head()

Unnamed: 0,song,artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,song_artist,hit_song
0,guerrilla radio,rage against the machine,0.599,0.957,11.0,-5.764,1.0,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200.0,4.0,1999,guerrilla radio_rage against the machine,1
1,leave right now,will young,0.641,0.445,6.0,-8.674,1.0,0.0368,0.145,0.0,0.108,0.383,81.931,214733.0,4.0,2003,leave right now_will young,1
2,let the day begin,the call,0.428,0.867,2.0,-5.051,1.0,0.0818,0.0241,0.0253,0.715,0.383,124.154,265744.0,4.0,2000,let the day begin_the call,1
3,misery,soul asylum,0.38,0.643,0.0,-8.304,1.0,0.0287,4.9e-05,2.7e-05,0.244,0.297,113.674,264360.0,4.0,1995,misery_soul asylum,1
4,promises broken,soul asylum,0.472,0.475,4.0,-11.115,1.0,0.0311,0.138,3e-06,0.113,0.718,150.611,194587.0,4.0,1995,promises broken_soul asylum,1


In [53]:
top_hits_features.to_csv('../data/processed/top_hits_features.csv', index=False)

In [54]:
non_hit_tracks.to_csv('../data/processed/non_hit_tracks.csv', index=False)