In [72]:
import requests
from bs4 import BeautifulSoup as bs
import json
from IPython.display import clear_output
import numpy as np
import pandas as pd

from config import client_access_token
import lyricsgenius as genius
geniusAPI = genius.Genius(client_access_token)

Scraping the year end "Hot" charts for each genre. These charts factor in physical sales, radio airplay and streams and as they are the 100 most popular songs of the year should be a good represenation of each genre.

Not using the Latin or International charts as some of the songs not being in English would skew the results.

There is a "pop" specific chart on Billboard but using the overall "hot 100" chart for pop instead as the their pop songs chart only includes radio play so is different from the other charts I'm using. The year end chart also only has 50 songs Vs 100 for the others so using the overall hot 100 allows better consistency.

The charts being used are:
- https://www.billboard.com/charts/year-end/2017/hot-100-songs
- https://www.billboard.com/charts/year-end/2017/hot-rock-songs
- https://www.billboard.com/charts/year-end/2017/hot-country-songs
- https://www.billboard.com/charts/year-end/2017/hot-r-and-and-b-hip-hop-songs
- https://www.billboard.com/charts/year-end/2017/hot-dance-electronic--songs
- https://www.billboard.com/charts/year-end/2017/hot-christian-songs

# Scrape Charts

In [9]:
#different charts go back differing amounts of years
#They all go back to 2013 or without dance/elec they go back to at least 2006
yearAsString = "2017" 

charts = [{'name': 'Overall', 'urlTag':'100'},
          {'name': 'Rock', 'urlTag': 'rock'},
          {'name': 'Country', 'urlTag': 'country'},
          {'name': 'R&B/Hip-Hop', 'urlTag': 'r-and-and-b-hip-hop'},
          {'name': 'Dance/Electronic', 'urlTag': 'dance-electronic-'},          
          {'name': 'Christian', 'urlTag': 'christian'}]

#add full urls
for chart in charts:
    chart['url'] = ("https://www.billboard.com/charts/year-end/" + 
                         "/" + yearAsString + 
                         "/hot-" + chart['urlTag'] + "-songs")

#test the strings are being created right
for i in charts:
    print(i['url'])

https://www.billboard.com/charts/year-end//2017/hot-100-songs
https://www.billboard.com/charts/year-end//2017/hot-rock-songs
https://www.billboard.com/charts/year-end//2017/hot-country-songs
https://www.billboard.com/charts/year-end//2017/hot-r-and-and-b-hip-hop-songs
https://www.billboard.com/charts/year-end//2017/hot-dance-electronic--songs
https://www.billboard.com/charts/year-end//2017/hot-christian-songs


Testing to get chart entries from just one table first to get it working:

In [10]:
r = requests.get(charts[1]['url'])
soup = bs(r.content, "lxml")
chartEntries = soup.find_all("div", attrs={"class": "ye-chart-item__primary-row"})

#chart should have 100 entries in it
assert len(chartEntries) == 100

In [11]:
print(chartEntries[0])

<div class="ye-chart-item__primary-row" data-chart-info-url="/fe_data/charts/year-end/2017/hot-rock-songs/other-charts/1" data-date="2017">
<div class="ye-chart-item__rank">
1
</div>
<div class="ye-chart-item__image">
<img alt="" class="" sizes="(max-width: 1023px) 53px, (min-width: 1024px) 87px" src="https://charts-static.billboard.com/img/1840/12/imagine-dragons-hy6-53x53.jpg" srcset="https://charts-static.billboard.com/img/1840/12/imagine-dragons-hy6-53x53.jpg 53w, https://charts-static.billboard.com/img/2017/02/imagine-dragons-hy6-106x106.jpg 106w, https://charts-static.billboard.com/img/2017/02/imagine-dragons-hy6-87x87.jpg 87w, https://charts-static.billboard.com/img/2017/02/imagine-dragons-hy6-174x174.jpg 174w"/>
</div>
<div class="ye-chart-item__text">
<div class="ye-chart-item__title">
Believer
</div>
<div class="ye-chart-item__artist">
Imagine Dragons
</div>
</div>
<div class="ye-chart-item__expand-caret">
<span class="fa fa-chevron-down"></span>
<span class="fa fa-chevron-up

In [12]:
print("rank: ", int(chartEntries[0].find("div", attrs={"class": "ye-chart-item__rank"}).text))
print("song: ", chartEntries[0].find("div", attrs={"class": "ye-chart-item__title"}).text.strip())
print("artist: ", chartEntries[0].find("div", attrs={"class": "ye-chart-item__artist"}).text.strip())

rank:  1
song:  Believer
artist:  Imagine Dragons


In [13]:
def getChartEntries(url):
    '''
    Returns a list of dictionaries with the rank, title and artist of each song 
    from the chart on the passed url page
    '''
    
    r = requests.get(url)
    soup = bs(r.content, "lxml")
    chartEntries = soup.find_all("div", attrs={"class": "ye-chart-item__primary-row"})

    chart = []
    for entry in chartEntries:
        chart.append({'rank': int(entry.find("div", attrs={"class": "ye-chart-item__rank"}).text),
                      'song': entry.find("div", attrs={"class": "ye-chart-item__title"}).text.strip(),
                      
                      #replacing 'x' and 'X' with '&' as that's how genius.com has the names
                      #need to match so the lyrics be looked up later
                      'artist': entry.find("div", 
                                           attrs={"class": "ye-chart-item__artist"}).text.strip()\
                                              .replace(' x ', ' & ').replace(' X ', ' & ')
                     })
        
    return chart

In [14]:
#look at what songs are in the first chart
x = getChartEntries(charts[0]['url'])
for i in x:
    print(i)

{'rank': 1, 'song': 'Shape Of You', 'artist': 'Ed Sheeran'}
{'rank': 2, 'song': 'Despacito', 'artist': 'Luis Fonsi & Daddy Yankee Featuring Justin Bieber'}
{'rank': 3, 'song': "That's What I Like", 'artist': 'Bruno Mars'}
{'rank': 4, 'song': 'Humble.', 'artist': 'Kendrick Lamar'}
{'rank': 5, 'song': 'Something Just Like This', 'artist': 'The Chainsmokers & Coldplay'}
{'rank': 6, 'song': 'Bad And Boujee', 'artist': 'Migos Featuring Lil Uzi Vert'}
{'rank': 7, 'song': 'Closer', 'artist': 'The Chainsmokers Featuring Halsey'}
{'rank': 8, 'song': 'Body Like A Back Road', 'artist': 'Sam Hunt'}
{'rank': 9, 'song': 'Believer', 'artist': 'Imagine Dragons'}
{'rank': 10, 'song': 'Congratulations', 'artist': 'Post Malone Featuring Quavo'}
{'rank': 11, 'song': "Say You Won't Let Go", 'artist': 'James Arthur'}
{'rank': 12, 'song': "I'm The One", 'artist': 'DJ Khaled Featuring Justin Bieber, Quavo, Chance The Rapper & Lil Wayne'}
{'rank': 13, 'song': 'XO TOUR Llif3', 'artist': 'Lil Uzi Vert'}
{'rank':

Now get entries for all charts

In [16]:
for chart in charts:
    chart['entries'] = getChartEntries(chart['url'])

Check content is all there

In [18]:
#test all charts are the same length and have 100 entries
chartLengths = set()
for chart in charts:
    chartLengths.add(len(chart['entries']))
assert len(chartLengths) == 1, 'Should only be one value in set as all charts should have the same length.'
assert list(chartLengths)[0] == 100, 'All charts should have 100 entries'

for chart in charts:
    print('===================')
    print(chart['name'])
    print('===================')
    for song in chart['entries']:
        print(song)
    print('\n\n')

Overall
{'rank': 1, 'song': 'Shape Of You', 'artist': 'Ed Sheeran'}
{'rank': 2, 'song': 'Despacito', 'artist': 'Luis Fonsi & Daddy Yankee Featuring Justin Bieber'}
{'rank': 3, 'song': "That's What I Like", 'artist': 'Bruno Mars'}
{'rank': 4, 'song': 'Humble.', 'artist': 'Kendrick Lamar'}
{'rank': 5, 'song': 'Something Just Like This', 'artist': 'The Chainsmokers & Coldplay'}
{'rank': 6, 'song': 'Bad And Boujee', 'artist': 'Migos Featuring Lil Uzi Vert'}
{'rank': 7, 'song': 'Closer', 'artist': 'The Chainsmokers Featuring Halsey'}
{'rank': 8, 'song': 'Body Like A Back Road', 'artist': 'Sam Hunt'}
{'rank': 9, 'song': 'Believer', 'artist': 'Imagine Dragons'}
{'rank': 10, 'song': 'Congratulations', 'artist': 'Post Malone Featuring Quavo'}
{'rank': 11, 'song': "Say You Won't Let Go", 'artist': 'James Arthur'}
{'rank': 12, 'song': "I'm The One", 'artist': 'DJ Khaled Featuring Justin Bieber, Quavo, Chance The Rapper & Lil Wayne'}
{'rank': 13, 'song': 'XO TOUR Llif3', 'artist': 'Lil Uzi Vert'}


# Get Lyrics

In [19]:
#test getting lyrics for one song
print(charts[0]['entries'][0]['artist'])
print(charts[0]['entries'][0]['song'])
geniusAPI.search_song(charts[0]['entries'][0]['song'], 
                      charts[0]['entries'][0]['artist']).lyrics

Ed Sheeran
Shape Of You
Searching for "Shape Of You" by Ed Sheeran...
Done.


"[Verse 1]\nThe club isn't the best place to find a lover\nSo the bar is where I go\nMe and my friends at the table doing shots\nDrinking fast and then we talk slow\nAnd you come over and start up a conversation with just me\nAnd trust me I'll give it a chance now\nTake my hand, stop, put Van the Man on the jukebox\nAnd then we start to dance, and now I'm singing like\n\n[Pre-Chorus]\nGirl, you know I want your love\nYour love was handmade for somebody like me\nCome on now, follow my lead\nI may be crazy, don't mind me\nSay, boy, let's not talk too much\nGrab on my waist and put that body on me\nCome on now, follow my lead\nCome, come on now, follow my lead\n\n[Chorus]\nI'm in love with the shape of you\nWe push and pull like a magnet do\nAlthough my heart is falling too\nI'm in love with your body\nAnd last night you were in my room\nAnd now my bed sheets smell like you\nEvery day discovering something brand new\nI'm in love with your body\nOh—I—oh—I—oh—I—oh—I\nI'm in love with your b

In [20]:
def getLyric(song, artist):
    #returns lyrics for a song or "notFound"
    try: 
        lyrics = geniusAPI.search_song(song, artist).lyrics
    except:
        lyrics = False
    
    return lyrics

def getLyrics(charts):
    for chart in charts:
        i = 0
        for song in chart['entries']:
            i+=1
            clear_output()
            print('Chart: ', chart['name']) 
            print('Getting song', i, ':', song['song'])
            
            #try getting lyrics with name as is
            song['lyrics'] = getLyric(song['song'], song['artist'])
            
            #if lyrics not found it's normally because of the featuring artists
            #try splitting on different ways songs add featuring artists to the end of artist names
            #sometimes a combination is used so trying each individually
            artistSplits = ['Featuring', 'With', '&', '/', ',']
            for splitter in artistSplits:
                #if lyrics have been found they'll be a non-empty string which evaluates as true
                if song['lyrics']:
                    break
                    
                song['lyrics'] = getLyric(song['song'], 
                                          song['artist'].split(splitter)[0].strip())
                

            
            #saw a few not found songs with brackets in the name
            #like 'Bodak Yellow (Money Moves)' so try split on open bracket
            if not song['lyrics']:
                song['lyrics'] = getLyric(song['song'].split('(')[0].strip(), 
                                          song['artist'])
    return

- get Lyrics for all songs

In [21]:
getLyrics(charts)

Chart:  Christian
Getting song 100 : Tremble
Searching for "Tremble" by Mosaic MSC...
Specified song was not first result :(
Searching for "Tremble" by Mosaic MSC...
Specified song was not first result :(
Searching for "Tremble" by Mosaic MSC...
Specified song was not first result :(
Searching for "Tremble" by Mosaic MSC...
Specified song was not first result :(
Searching for "Tremble" by Mosaic MSC...
Specified song was not first result :(
Searching for "Tremble" by Mosaic MSC...
Specified song was not first result :(
Searching for "Tremble" by Mosaic MSC...
Specified song was not first result :(


- Check which songs lyrics weren't found for

In [22]:
def printNotFoundSongs(charts):
    totalNotFoundCount = 0
    for chart in charts:
        print("\n=================")
        print(chart['name'])
        print("=================")
        i=0
        for song in chart['entries']:
            if not song['lyrics']:
                i += 1
                totalNotFoundCount += 1
                print(i, ": (rank ", song['rank'], ")", song['song'], " - ", song['artist'])

    print("\n=================")
    print('Total songs lyrics not found for: ', totalNotFoundCount)

In [23]:
#print all the song's lyrics weren't found for
printNotFoundSongs(charts)


Overall
1 : (rank  60 ) Caroline  -  Amine

Rock
1 : (rank  8 ) Sucker For Pain  -  Lil Wayne, Wiz Khalifa & Imagine Dragons With Logic & Ty Dolla $ign Feat. & Ambassadors
2 : (rank  11 ) Human  -  Rag'n'Bone Man
3 : (rank  81 ) Vacation  -  The Dirty Heads

Country
1 : (rank  61 ) Somethin' I'm Good At  -  Brett Eldredge
2 : (rank  67 ) A Girl Like You  -  Easton Corbin
3 : (rank  91 ) California  -  Big & Rich

R&B/Hip-Hop
1 : (rank  37 ) Caroline  -  Amine
2 : (rank  79 ) Wokeuplikethis*  -  Playboi Carti Featuring Lil Uzi Vert
3 : (rank  88 ) F**k Love  -  XXXTENTACION Featuring Trippie Redd
4 : (rank  91 ) Pills And Automobiles  -  Chris Brown Featuring Yo Gotti, A Boogie Wit da Hoodie & Kodak Black

Dance/Electronic
1 : (rank  30 ) More Than You Know  -  Axwell & Ingrosso
2 : (rank  46 ) I Love You  -  Axwell & Ingrosso Featuring Kid Ink
3 : (rank  49 ) Rich Love  -  OneRepublic With Seeb
4 : (rank  57 ) Feel Good  -  Gryffin And Illenium Featuring Daya
5 : (rank  65 ) A Differe

Only 27 songs not found and less than 10 from any one chart. That's less than 5% missing from the 600 total songs. 

This is good enough for me and I'm going to move on instead of continuing to put in effort to get the last few songs.

# Save Off CSV of Results

For tidyness I'm saving the data now and further analysis will done in a seperate analysis workbook.

Create a table of the scrapped data and save off as a csv.

In [78]:
chartsDf = None
for chart in charts:
    tempDf = pd.DataFrame.from_dict(chart['entries'])
    tempDf['chart'] = chart['name']
    tempDf['chartURL'] = chart['url']
    
    if chartsDf is None:
        chartsDf = tempDf
    else:
        chartsDf = pd.concat([chartsDf, tempDf])

#resetting index as is currently 0-99 repeating 6 times
#after reset will just be 0-599
chartsDf.reset_index(inplace=True, drop=True)

In [79]:
#tidy up table

#reorder columns
chartsDf = chartsDf[['chart', 
                     'chartURL', 
                     'rank', 
                     'song', 
                     'artist', 
                     'lyrics']]

#value for missing lyrics is currently False, change to 
#np.nan as it's a more common value for missing measures
chartsDf.lyrics = chartsDf.lyrics.apply(lambda x: x if x else np.nan)

chartsDf

Unnamed: 0,chart,chartURL,rank,song,artist,lyrics
0,Overall,https://www.billboard.com/charts/year-end//201...,1,Shape Of You,Ed Sheeran,[Verse 1]\nThe club isn't the best place to fi...
1,Overall,https://www.billboard.com/charts/year-end//201...,2,Despacito,Luis Fonsi & Daddy Yankee Featuring Justin Bieber,"[Intro: Luis Fonsi & Daddy Yankee]\nAy, ¡Fonsi..."
2,Overall,https://www.billboard.com/charts/year-end//201...,3,That's What I Like,Bruno Mars,"[Verse 1]\nHey, hey, hey\nI got a condo in Man..."
3,Overall,https://www.billboard.com/charts/year-end//201...,4,Humble.,Kendrick Lamar,[Intro]\nNobody pray for me\nIt's been that da...
4,Overall,https://www.billboard.com/charts/year-end//201...,5,Something Just Like This,The Chainsmokers & Coldplay,[Verse 1: Chris Martin]\nI've been reading boo...
5,Overall,https://www.billboard.com/charts/year-end//201...,6,Bad And Boujee,Migos Featuring Lil Uzi Vert,"[Intro: Offset]\nYou know, young rich niggas\n..."
6,Overall,https://www.billboard.com/charts/year-end//201...,7,Closer,The Chainsmokers Featuring Halsey,"[Verse 1: Andrew Taggart]\nHey, I was doing ju..."
7,Overall,https://www.billboard.com/charts/year-end//201...,8,Body Like A Back Road,Sam Hunt,"[Verse 1]\nGot a girl from the south side, got..."
8,Overall,https://www.billboard.com/charts/year-end//201...,9,Believer,Imagine Dragons,[Verse 1]\nFirst things first\nI'ma say all th...
9,Overall,https://www.billboard.com/charts/year-end//201...,10,Congratulations,Post Malone Featuring Quavo,"[Intro: Post Malone]\nMm-mmm\nYah, yah\nMm-mmm..."


In [80]:
chartsDf.to_csv('charts_and_lyrics_2017.csv', index=False)