# Getting current HOT-100 Data

In [1]:
import json
import pickle
import re
import time
from collections import OrderedDict

import pandas as pd
import requests
import spotipy
from bs4 import BeautifulSoup
from spotipy.oauth2 import SpotifyClientCredentials

try:  # pandas >= 1.0 exposes json_normalize at the top level
    from pandas import json_normalize
except ImportError:  # older pandas releases only provide it under pandas.io.json
    from pandas.io.json import json_normalize

**First, we scraped the song titles and artist names from the Billboard Hot 100 chart page.**

In [2]:
# Download the Billboard Hot 100 page and parse it for scraping.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops here on an HTTP error instead of parsing an error page as chart data.
hot_100_resp = requests.get("https://www.billboard.com/charts/hot-100", timeout=30)
hot_100_resp.raise_for_status()
hot_100_soup = BeautifulSoup(hot_100_resp.content, "html.parser")

In [3]:
# Narrow the parsed page down to the chart container, then collect one element
# per chart entry found inside it.
details = hot_100_soup.find("div", attrs={"class": "chart-details"})
each_song = details.find(
    "div", attrs={"class": "chart-list chart-details__left-rail"}
)  # not read again in the visible cells
items = details.find_all("div", attrs={"class": "chart-list-item"})

In [4]:
# Collect the title and artist of every entry found in the chart list.
titles = []
artists = []

for item in items:
    basic_info = item.find(
        "div", class_="chart-list-item__first-row chart-list-item__cursor-pointer"
    )
    wrap = basic_info.find("div", class_="chart-list-item__text-wrapper")

    # The scraped text is padded with newlines and, for some entries, spaces;
    # remove both so the names compare and search cleanly later on.
    title = wrap.find("div", class_="chart-list-item__title").text.replace("\n", "").strip()
    artist = wrap.find("div", class_="chart-list-item__artist").text.replace("\n", "").strip()

    titles.append(title)
    artists.append(artist)

**Because of the way the webpage is laid out, the `for` loop could only collect songs 2 through 100, so we had to grab the #1 song separately.**

In [5]:
# The #1 song is marked up separately from the chart-list items, so read it on
# its own by walking down to its details container.
num1 = (hot_100_soup.find("body")
        .find("main")
        .find("div")
        .find("div", class_="container container--no-background chart-number-one")
        .find("div", class_="chart-number-one__info ")
        .find("div", class_="chart-number-one__details"))

# Clean both fields the same way (drop newlines and surrounding whitespace).
num1_title = num1.find("div", class_="chart-number-one__title").text.replace("\n", "").strip()
num1_artist = num1.find("div", class_="chart-number-one__artist").text.replace("\n", "").strip()

# Prepend only when the lists do not already start with this entry, so that
# re-running the cell does not add the #1 song a second time.
if (titles[:1], artists[:1]) != ([num1_title], [num1_artist]):
    titles = [num1_title] + titles
    artists = [num1_artist] + artists

**One artist's name was scraped with a stray leading space, so we remove that extra whitespace.**

In [6]:
# Strip stray leading/trailing whitespace from every scraped artist name.
# Overwriting one fixed position with a literal name is only correct for the
# chart of a single week; stripping works for whatever the chart contains.
artists = [name.strip() for name in artists]

**Putting the `title` and `artist` into a data frame.**

In [7]:
# One row per chart entry, in chart order: song title and full artist credit.
top100_df = pd.DataFrame(dict(title=titles, artist=artists))

In [8]:
# Preview the first five scraped rows.
top100_df.head(n=5)

Unnamed: 0,title,artist
0,Sucker,Jonas Brothers
1,7 Rings,Ariana Grande
2,Please Me,Cardi B & Bruno Mars
3,Sunflower (Spider-Man: Into The Spider-Verse),Post Malone & Swae Lee
4,Without Me,Halsey


In [9]:
# Authenticate against the Spotify Web API with the client-credentials flow.
# Called without arguments, SpotifyClientCredentials reads the client id and
# secret from the SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment
# variables, so no secret is stored in the notebook.
# NOTE(review): the secret that used to be hard-coded in this cell should be
# treated as leaked and rotated in the Spotify developer dashboard.
client_credentials_manager = SpotifyClientCredentials()
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [10]:
# Preview the first five rows before deriving the simplified artist column.
top100_df.head(n=5)

Unnamed: 0,title,artist
0,Sucker,Jonas Brothers
1,7 Rings,Ariana Grande
2,Please Me,Cardi B & Bruno Mars
3,Sunflower (Spider-Man: Into The Spider-Verse),Post Malone & Swae Lee
4,Without Me,Halsey


**To make it easier to look up each song's Spotify ID, we search using only the first credited artist whenever a song has more than one.**

In [11]:
# Collaborations are credited with "Featuring", "&" or "X" between the names.
# Split only where one of those stands alone between whitespace, so an "X"
# that is part of a name (e.g. "Lil Nas X", "XXXTENTACION") is kept intact
# rather than being treated as a separator.
_COLLABORATOR_SEPARATOR = re.compile(r"\s+(?:Featuring|&|X)\s+")


def _primary_artist(artist):
    """Return the first credited artist of a chart entry, without padding."""
    return _COLLABORATOR_SEPARATOR.split(artist.strip(), maxsplit=1)[0].strip()


simple_artists = [_primary_artist(artist) for artist in top100_df.artist]
top100_df["simple_artist"] = simple_artists

In [12]:
# Preview the first five rows, now including the simple_artist column.
top100_df.head(n=5)

Unnamed: 0,title,artist,simple_artist
0,Sucker,Jonas Brothers,Jonas Brothers
1,7 Rings,Ariana Grande,Ariana Grande
2,Please Me,Cardi B & Bruno Mars,Cardi B
3,Sunflower (Spider-Man: Into The Spider-Verse),Post Malone & Swae Lee,Post Malone
4,Without Me,Halsey,Halsey


In [13]:
# NOTE(review): this wraps the existing "simple_artist" column in a Series and
# assigns it straight back, which appears to leave the frame unchanged —
# confirm it can be removed.
top100_df["simple_artist"] = pd.Series(top100_df.simple_artist)

**Extracting the IDs from the Spotify API.**

In [14]:
# Look up every chart entry on Spotify and keep the id of the first search hit.
# Iterating over the frame's own rows (instead of a fixed range of 100 labels)
# keeps the cell working when the scrape yields a different number of songs.
top100_track_ids = []
for title, simple_artist in zip(top100_df["title"], top100_df["simple_artist"]):
    # Field filters restrict the match to the track name and the first
    # credited artist; the trailing "*" is presumably a wildcard so that
    # longer artist credits still match.
    query = "track:" + title + " " + "artist:" + simple_artist + "*"
    results = sp.search(q=query, type="track")
    matches = results["tracks"]["items"]
    if not matches:
        # Name the offending query instead of failing with a bare
        # "list index out of range".
        raise IndexError("Spotify search returned no track for query: " + query)
    top100_track_ids.append(matches[0]["id"])

In [15]:
# Attach the Spotify track ids as an "id" column; the list was built by walking
# the frame's rows in order, so position i belongs to row i.
top100_df["id"] = top100_track_ids

In [16]:
# Preview the first five rows, now including the Spotify track id.
top100_df.head(n=5)

Unnamed: 0,title,artist,simple_artist,id
0,Sucker,Jonas Brothers,Jonas Brothers,4y3OI86AEP6PQoDE6olYhO
1,7 Rings,Ariana Grande,Ariana Grande,14msK75pk3pA33pzPVNtBF
2,Please Me,Cardi B & Bruno Mars,Cardi B,0PG9fbaaHFHfre2gUVo7AN
3,Sunflower (Spider-Man: Into The Spider-Verse),Post Malone & Swae Lee,Post Malone,3KkXRkHbMCARz0aVfEt68P
4,Without Me,Halsey,Halsey,5p7ujcrUXASCNwRaWNHR1C


**Extracting the audio features (e.g. danceability) of each song in the top 100 using the Spotify API.**

In [17]:
# Fetch the audio-feature record of every track id in the frame.  Iterating
# over the column (rather than a fixed range of 100 labels) keeps the cell
# working when the chart yields a different number of rows.
features = []
for track_id in top100_df.id:
    # The call returns a list; for a single id the record is its first element.
    feature = sp.audio_features(track_id)
    features.append(feature[0])

In [18]:
# Build a frame with one row per fetched feature record; the columns come from
# the keys of those records.
feature100_df = pd.DataFrame(features)

In [19]:
# Tag every feature row with the year of this chart.
CHART_YEAR = 2019
feature100_df["year"] = CHART_YEAR

In [20]:
# Attach the audio features to the chart rows by track id.
# De-duplicating the feature rows and using a left join guarantees exactly one
# output row per chart entry, in chart order, even when two entries resolved
# to the same Spotify id.  A plain inner merge would multiply those rows and
# misalign the index-based concat that combines the per-track frames later on.
top100_df = top100_df.merge(
    feature100_df.drop_duplicates(subset="id"), on="id", how="left"
)

In [21]:
# Preview the first five rows after merging in the audio features.
top100_df.head(n=5)

Unnamed: 0,title,artist,simple_artist,id,acousticness,analysis_url,danceability,duration_ms,energy,instrumentalness,...,loudness,mode,speechiness,tempo,time_signature,track_href,type,uri,valence,year
0,Sucker,Jonas Brothers,Jonas Brothers,4y3OI86AEP6PQoDE6olYhO,0.0436,https://api.spotify.com/v1/audio-analysis/4y3O...,0.846,181040,0.731,0.0,...,-5.027,0,0.064,137.947,4,https://api.spotify.com/v1/tracks/4y3OI86AEP6P...,audio_features,spotify:track:4y3OI86AEP6PQoDE6olYhO,0.933,2019
1,7 Rings,Ariana Grande,Ariana Grande,14msK75pk3pA33pzPVNtBF,0.578,https://api.spotify.com/v1/audio-analysis/14ms...,0.725,178640,0.321,0.0,...,-10.744,0,0.323,70.142,4,https://api.spotify.com/v1/tracks/14msK75pk3pA...,audio_features,spotify:track:14msK75pk3pA33pzPVNtBF,0.319,2019
2,Please Me,Cardi B & Bruno Mars,Cardi B,0PG9fbaaHFHfre2gUVo7AN,0.0642,https://api.spotify.com/v1/audio-analysis/0PG9...,0.747,200890,0.57,0.0,...,-6.711,1,0.081,133.992,4,https://api.spotify.com/v1/tracks/0PG9fbaaHFHf...,audio_features,spotify:track:0PG9fbaaHFHfre2gUVo7AN,0.65,2019
3,Sunflower (Spider-Man: Into The Spider-Verse),Post Malone & Swae Lee,Post Malone,3KkXRkHbMCARz0aVfEt68P,0.556,https://api.spotify.com/v1/audio-analysis/3KkX...,0.76,158040,0.479,0.0,...,-5.574,1,0.0466,89.911,4,https://api.spotify.com/v1/tracks/3KkXRkHbMCAR...,audio_features,spotify:track:3KkXRkHbMCARz0aVfEt68P,0.913,2019
4,Without Me,Halsey,Halsey,5p7ujcrUXASCNwRaWNHR1C,0.297,https://api.spotify.com/v1/audio-analysis/5p7u...,0.752,201661,0.488,9e-06,...,-7.05,1,0.0705,136.041,4,https://api.spotify.com/v1/tracks/5p7ujcrUXASC...,audio_features,spotify:track:5p7ujcrUXASCNwRaWNHR1C,0.533,2019


**Extracting the number of segments and sections for the top 100 songs as we did for our training data set.**

In [23]:
# For every track, record how many segments and how many sections its audio
# analysis contains.
rows = []
for track_id in top100_df["id"]:
    analysis = sp.audio_analysis(track_id)
    rows.append(
        {
            "num_segments": len(analysis["segments"]),
            "num_sections": len(analysis["sections"]),
        }
    )
df_lens = pd.DataFrame(rows)
    

**Across each song's segments we averaged several measurements, as we did for our training data: `loudness_start_avg` (dB), `loudness_max_time_avg` (seconds), `loudness_max_avg` (dB), `loudness_end_avg` (dB) and `duration_avg` (seconds).**

In [24]:
# Average the per-segment loudness and duration measurements of each track.
# Maps each output column to the segment field whose mean it holds.
_SEGMENT_AVERAGES = {
    "loudness_start_avg": "loudness_start",
    "loudness_max_time_avg": "loudness_max_time",
    "loudness_max_avg": "loudness_max",
    "loudness_end_avg": "loudness_end",
    "duration_avg": "duration",
}

audio_analysist_list = []
for identifier in top100_df.id:
    feats = sp.audio_analysis(identifier)
    # One row per segment, one column per segment field.
    segment_df = json_normalize(feats["segments"])
    audio_analysist_list.append(
        {
            column: segment_df[field].mean()
            for column, field in _SEGMENT_AVERAGES.items()
        }
    )
df_audio_analysis = pd.DataFrame(audio_analysist_list)
    

**Computing the number of key changes for each of the top 100 songs, as we did for our training data.**

In [25]:
# Count how often the key differs between consecutive sections of each track.
key_analysis_list = []
for identifier in top100_df.id:
    # A dedicated name keeps the `features` list used for the audio-feature
    # table intact, so earlier cells can still be re-run afterwards.
    analysis = sp.audio_analysis(identifier)
    section_keys = [section["key"] for section in analysis["sections"]]

    # Compare every section with the one before it; a track with zero or one
    # section yields 0 instead of raising an IndexError.
    key_changes = sum(
        1
        for previous, current in zip(section_keys, section_keys[1:])
        if current != previous
    )
    key_analysis_list.append({"key_changes": key_changes})

    # Short pause between consecutive API requests.
    time.sleep(0.1)

df_key_analysis = pd.DataFrame(key_analysis_list)
df_key_analysis.head()

Unnamed: 0,key_changes
0,6
1,1
2,5
3,3
4,7


**Combining all of our little data frames into one.**

In [26]:
# Column-wise concat aligns rows on the index, so this relies on all four
# frames sharing the same default 0..n-1 index in the same track order.
top100_df = pd.concat([top100_df, df_lens, df_audio_analysis, df_key_analysis], axis=1)

In [27]:
# Persist the combined frame.  The context manager closes the file handle as
# soon as the dump finishes instead of leaving it open for the kernel's lifetime.
with open("top100_df.pkl", "wb") as pickle_file:
    pickle.dump(top100_df, pickle_file)

In [28]:
# Preview the first five rows of the final combined frame.
top100_df.head(n=5)

Unnamed: 0,title,artist,simple_artist,id,acousticness,analysis_url,danceability,duration_ms,energy,instrumentalness,...,valence,year,num_sections,num_segments,duration_avg,loudness_end_avg,loudness_max_avg,loudness_max_time_avg,loudness_start_avg,key_changes
0,Sucker,Jonas Brothers,Jonas Brothers,4y3OI86AEP6PQoDE6olYhO,0.0436,https://api.spotify.com/v1/audio-analysis/4y3O...,0.846,181040,0.731,0.0,...,0.933,2019,12,732,0.247322,-60.0,-5.140011,0.052402,-16.998504,6
1,7 Rings,Ariana Grande,Ariana Grande,14msK75pk3pA33pzPVNtBF,0.578,https://api.spotify.com/v1/audio-analysis/14ms...,0.725,178640,0.321,0.0,...,0.319,2019,4,655,0.272733,-60.0,-12.369545,0.067081,-23.344505,1
2,Please Me,Cardi B & Bruno Mars,Cardi B,0PG9fbaaHFHfre2gUVo7AN,0.0642,https://api.spotify.com/v1/audio-analysis/0PG9...,0.747,200890,0.57,0.0,...,0.65,2019,11,820,0.244987,-60.0,-6.354572,0.059532,-16.06294,5
3,Sunflower (Spider-Man: Into The Spider-Verse),Post Malone & Swae Lee,Post Malone,3KkXRkHbMCARz0aVfEt68P,0.556,https://api.spotify.com/v1/audio-analysis/3KkX...,0.76,158040,0.479,0.0,...,0.913,2019,8,597,0.264724,-60.0,-6.413338,0.057229,-16.804481,3
4,Without Me,Halsey,Halsey,5p7ujcrUXASCNwRaWNHR1C,0.297,https://api.spotify.com/v1/audio-analysis/5p7u...,0.752,201661,0.488,9e-06,...,0.533,2019,11,732,0.275493,-60.0,-8.837766,0.062921,-18.74872,7
