In [28]:
import requests
from bs4 import BeautifulSoup as bs
import json
import lyricwikia

Scraping the year end "Hot" charts for each genre. These charts factor in physical sales, radio airplay and streams and as they are the 100 most popular songs of the year should be a good represenation of each genre.

Not using the Latin or International charts as some of the songs not being in English would skew the results.

There is a "pop" specific chart on Billboard but using the overall "hot 100" chart for pop instead as the their pop songs chart only includes radio play. 

The charts being used are:
- https://www.billboard.com/charts/year-end/2017/hot-100-songs
- https://www.billboard.com/charts/year-end/2017/hot-rock-songs
- https://www.billboard.com/charts/year-end/2017/hot-country-songs
- https://www.billboard.com/charts/year-end/2017/hot-r-and-and-b-hip-hop-songs
- https://www.billboard.com/charts/year-end/2017/hot-dance-electronic--songs
- https://www.billboard.com/charts/year-end/2017/hot-christian-songs

In [29]:
#different charts go back differing amounts of years
#They all go back to 2013 or without dance/elec they go back to at least 2006
yearAsString = "2017" 

charts = [{'name': 'Overall', 'urlTag':'100'},
          {'name': 'Rock', 'urlTag': 'rock'},
          {'name': 'Country', 'urlTag': 'country'},
          {'name': 'R&B/Hip-Hop', 'urlTag': 'r-and-and-b-hip-hop'},
          {'name': 'Dance/Electronic', 'urlTag': 'dance-electronic-'},          
          {'name': 'Christian', 'urlTag': 'christian'}]

#add full urls
for chart in charts:
    chart['url'] = ("https://www.billboard.com/charts/year-end/" + 
                         "/" + yearAsString + 
                         "/hot-" + chart['urlTag'] + "-songs")

#test the strings are being created right
for i in charts:
    print(i['url'])

https://www.billboard.com/charts/year-end//2017/hot-100-songs
https://www.billboard.com/charts/year-end//2017/hot-rock-songs
https://www.billboard.com/charts/year-end//2017/hot-country-songs
https://www.billboard.com/charts/year-end//2017/hot-r-and-and-b-hip-hop-songs
https://www.billboard.com/charts/year-end//2017/hot-dance-electronic--songs
https://www.billboard.com/charts/year-end//2017/hot-christian-songs


Testing to get chart entries from just one table first to get it working:

In [31]:
r = requests.get(charts[1]['url'])
soup = bs(r.content, "lxml")
chartEntries = soup.find_all("div", attrs={"class": "ye-chart-item__primary-row"})

#chart should have 100 entries in it
assert len(chartEntries) == 100

In [33]:
print(chartEntries[0])

<div class="ye-chart-item__primary-row" data-chart-info-url="/fe_data/charts/year-end/2017/hot-rock-songs/other-charts/1" data-date="2017">
<div class="ye-chart-item__rank">
1
</div>
<div class="ye-chart-item__image">
<img alt="" class="" sizes="(max-width: 1023px) 53px, (min-width: 1024px) 87px" src="https://charts-static.billboard.com/img/1840/12/imagine-dragons-hy6-53x53.jpg" srcset="https://charts-static.billboard.com/img/1840/12/imagine-dragons-hy6-53x53.jpg 53w, https://charts-static.billboard.com/img/2017/02/imagine-dragons-hy6-106x106.jpg 106w, https://charts-static.billboard.com/img/2017/02/imagine-dragons-hy6-87x87.jpg 87w, https://charts-static.billboard.com/img/2017/02/imagine-dragons-hy6-174x174.jpg 174w"/> </div>
<div class="ye-chart-item__text">
<div class="ye-chart-item__title">
Believer
</div>
<div class="ye-chart-item__artist">
Imagine Dragons
</div>
</div>
<div class="ye-chart-item__expand-caret">
<span class="fa fa-chevron-down"></span>
<span class="fa fa-chevron-up

In [34]:
print("rank: ", int(chartEntries[0].find("div", attrs={"class": "ye-chart-item__rank"}).text))
print("song: ", chartEntries[0].find("div", attrs={"class": "ye-chart-item__title"}).text.strip())
print("artist: ", chartEntries[0].find("div", attrs={"class": "ye-chart-item__artist"}).text.strip())

rank:  1
song:  Believer
artist:  Imagine Dragons


In [35]:
def GetChartEntries(url):
    '''
    Returns a list of dictionaries with the rank, title and artist of each song 
    from the chart on the passed url page
    '''
    
    r = requests.get(url)
    soup = bs(r.content, "lxml")
    chartEntries = soup.find_all("div", attrs={"class": "ye-chart-item__primary-row"})

    chart = []
    for entry in chartEntries:
        chart.append({'rank': int(entry.find("div", attrs={"class": "ye-chart-item__rank"}).text),
                      'song': entry.find("div", attrs={"class": "ye-chart-item__title"}).text.strip(),
                      'artist': entry.find("div", attrs={"class": "ye-chart-item__artist"}).text.strip()
                     })
        
    return chart

In [36]:
x = GetChartEntries(charts[0]['url'])
for i in x:
    print(i)

{'rank': 1, 'song': 'Shape Of You', 'artist': 'Ed Sheeran'}
{'rank': 2, 'song': 'Despacito', 'artist': 'Luis Fonsi & Daddy Yankee Featuring Justin Bieber'}
{'rank': 3, 'song': "That's What I Like", 'artist': 'Bruno Mars'}
{'rank': 4, 'song': 'Humble.', 'artist': 'Kendrick Lamar'}
{'rank': 5, 'song': 'Something Just Like This', 'artist': 'The Chainsmokers & Coldplay'}
{'rank': 6, 'song': 'Bad And Boujee', 'artist': 'Migos Featuring Lil Uzi Vert'}
{'rank': 7, 'song': 'Closer', 'artist': 'The Chainsmokers Featuring Halsey'}
{'rank': 8, 'song': 'Body Like A Back Road', 'artist': 'Sam Hunt'}
{'rank': 9, 'song': 'Believer', 'artist': 'Imagine Dragons'}
{'rank': 10, 'song': 'Congratulations', 'artist': 'Post Malone Featuring Quavo'}
{'rank': 11, 'song': "Say You Won't Let Go", 'artist': 'James Arthur'}
{'rank': 12, 'song': "I'm The One", 'artist': 'DJ Khaled Featuring Justin Bieber, Quavo, Chance The Rapper & Lil Wayne'}
{'rank': 13, 'song': 'XO TOUR Llif3', 'artist': 'Lil Uzi Vert'}
{'rank':

Now get entries for all charts

In [37]:
for chart in charts:
    chart['entries'] = GetChartEntries(chart['url'])

Check content is all there

In [67]:
#test all charts are the same length and have 100 entries
chartLengths = set()
for chart in charts:
    chartLengths.add(len(chart['entries']))
assert len(chartLengths) == 1, 'Should only be one value in set as all charts should have the same length.'
assert list(chartLengths)[0] == 100, 'All charts should have 100 entries'

for chart in charts:
    print('===================')
    print(chart['name'])
    print('===================')
    for song in chart['entries']:
        print(song)
    print('\n\n')

Overall
{'rank': 1, 'song': 'Shape Of You', 'artist': 'Ed Sheeran'}
{'rank': 2, 'song': 'Despacito', 'artist': 'Luis Fonsi & Daddy Yankee Featuring Justin Bieber'}
{'rank': 3, 'song': "That's What I Like", 'artist': 'Bruno Mars'}
{'rank': 4, 'song': 'Humble.', 'artist': 'Kendrick Lamar'}
{'rank': 5, 'song': 'Something Just Like This', 'artist': 'The Chainsmokers & Coldplay'}
{'rank': 6, 'song': 'Bad And Boujee', 'artist': 'Migos Featuring Lil Uzi Vert'}
{'rank': 7, 'song': 'Closer', 'artist': 'The Chainsmokers Featuring Halsey'}
{'rank': 8, 'song': 'Body Like A Back Road', 'artist': 'Sam Hunt'}
{'rank': 9, 'song': 'Believer', 'artist': 'Imagine Dragons'}
{'rank': 10, 'song': 'Congratulations', 'artist': 'Post Malone Featuring Quavo'}
{'rank': 11, 'song': "Say You Won't Let Go", 'artist': 'James Arthur'}
{'rank': 12, 'song': "I'm The One", 'artist': 'DJ Khaled Featuring Justin Bieber, Quavo, Chance The Rapper & Lil Wayne'}
{'rank': 13, 'song': 'XO TOUR Llif3', 'artist': 'Lil Uzi Vert'}


In [89]:
#save off to a file to have a frozen backup incase website changes format in future
#with open('charts.json', 'w') as outfile:
#    json.dump(charts, outfile)