In [1]:
import json
import os
import re
import time
from contextlib import closing

import pandas as pd
from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException

In [2]:
def simple_get(url, timeout=30):
    """
    Source: https://realpython.com/python-web-scraping-practical-introduction/
    Attempts to get the content at `url` by making an HTTP GET request.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : float or tuple, optional
        Passed straight to `requests.get`. Without a timeout a stalled
        server would block the notebook indefinitely. Defaults to 30.

    Returns
    -------
    bytes or None
        The raw response body (`resp.content`) when `is_good_response`
        accepts the response, otherwise None. None is also returned, after
        logging through `log_error`, when the request raises a
        `RequestException` (timeouts included).
    """
    try:
        # closing() guarantees the streamed connection is released.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Source: https://realpython.com/python-web-scraping-practical-introduction/
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains "html" (compared case-insensitively). A missing or
    empty Content-Type header yields False instead of raising KeyError.
    """
    # .get() tolerates a missing header; `or ''` also covers a None value.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Source: https://realpython.com/python-web-scraping-practical-introduction/
    Report an error. For now the message is simply written to standard
    output; swap the body out to route errors elsewhere.
    """
    message = str(e)
    print(message)

def get_data(url, max_retries=None, delay=5):
    """
    Downloads `url` and returns the JSON object the page assigns to
    `window.UGAPP.store.page` inside one of its <script> tags.

    Any failure (no HTML returned, script tag not found, assignment not
    found, invalid JSON, ...) is reported and the whole fetch is retried
    after `delay` seconds.

    Parameters
    ----------
    url : str
        Page to fetch.
    max_retries : int or None, optional
        Maximum number of attempts. None (the default) keeps retrying
        forever. When the limit is reached the last error is re-raised.
    delay : float, optional
        Seconds to sleep between attempts. Defaults to 5.

    Returns
    -------
    The decoded JSON value (whatever `json.loads` produces for the page).
    """
    marker = re.compile(r'window\.UGAPP\.store\.page')
    attempt = 0
    while True:
        attempt += 1
        try:
            html = simple_get(url)
            if html is None:
                # simple_get already logged the cause; fail fast with a clear reason.
                raise ValueError('no HTML content returned')
            soup = BeautifulSoup(html, 'html.parser')
            # Data is stored as JSON on the page
            script = soup.find('script', text=marker)
            if script is None:
                raise ValueError('window.UGAPP.store.page script tag not found')

            # Removes unnecessary text around the JSON literal
            match = re.search(r'^\s*window\.UGAPP\.store\.page\s*=\s*({.*?})\s*;\s*$',
                              script.string, flags=re.DOTALL | re.MULTILINE)
            if match is None:
                raise ValueError('window.UGAPP.store.page assignment not found')
            return json.loads(match.group(1))
        except Exception as err:
            # `Exception` (not a bare except) so that interrupting the kernel
            # still stops the loop.
            print('Connection refused on server')
            print(url)
            print('Reason: {!r}'.format(err))
            if max_retries is not None and attempt >= max_retries:
                raise
            time.sleep(delay)

def get_tabs_data(url):
    """
    Loads the page JSON for `url` via `get_data` and returns the
    collection of tab entries stored under data -> data -> tabs.
    """
    page = get_data(url)
    return page['data']['data']['tabs']

def get_chords(url):
    """
    Scrapes and returns the sequence of chords on a tab page together
    with the capo fret.

    Parameters
    ----------
    url : str
        Address of the tab page.

    Returns
    -------
    tuple
        (chords, capo, data) where `chords` is a list of chord names in
        order of appearance (without the [ch]...[/ch] markers), `capo` is
        the value stored under data -> tab_view -> meta -> capo (0 when
        that entry is absent), and `data` is the full page JSON returned
        by `get_data`.
    """
    data = get_data(url)

    content = data['data']['tab_view']['wiki_tab']['content']

    # Groups: (open tag)(chord pitch)(bass note)(chord type)(bass note)(closing tag)
    # Raw strings keep the backslashes intact for the regex engine.
    # The accidental on a bass note is optional so that chords such as
    # "G/B" are kept; requiring it made the whole match fail for them.
    # The chord-type group is a character class: any run of these characters.
    pattern = (
        r"(\[ch\])"
        r"([A-G]+)"
        r"(\/[A-G]*[b#]?)*"
        r"([()?|#abdgijmsu\d]*)"
        r"(\/[A-G]*[b#]?)*"
        r"(\[\/ch\])"
    )
    matches = re.findall(pattern, content)

    # Keep (chord pitch)(bass note)(chord type)(bass note); drop the tags.
    cleaned_res = [''.join(groups[1:5]) for groups in matches]

    # Grabbing Capo info; fall back to 0 when the lookup cannot be made.
    try:
        capo = data['data']['tab_view']['meta']['capo']
    except (KeyError, TypeError):
        capo = 0

    return (cleaned_res, capo, data)
    
def get_genre(url):
    """
    Returns the genre recorded for the artist on the page at `url`
    (the value under data -> artist -> genre of the page JSON).
    """
    page = get_data(url)
    return page['data']['artist']['genre']

def get_multiple_pages(url, n):
    """
    Scrapes `n` consecutive result pages of `url` and returns all of
    their tab entries as one flat list, in page order.

    The page number is appended as an extra "&page=<k>" query parameter,
    with k running from 0 to n - 1.
    NOTE(review): numbering starts at 0 - confirm that matches the
    site's paging scheme.
    """
    page_suffix = "&page="
    # Lazily fetch one page at a time, in order.
    pages = (get_tabs_data(url + page_suffix + str(page_no)) for page_no in range(n))
    return [tab for page in pages for tab in page]

In [3]:
# Collect the tab listings from 20 pages of the chords explore view,
# ordered by total hits (descending).
tabs = get_multiple_pages(
    "https://www.ultimate-guitar.com/explore?order=hitstotal_desc&type[]=Chords",
    20,
)

In [4]:
# One list per output column; position k in every list describes the k-th tab.
songs, artist, acoustic = [], [], []
votes, ratings, tonality = [], [], []
capos, progression, date = [], [], []
views, favorites, comments = [], [], []
# Strumming-pattern columns: declared but never filled (see the disabled
# lines inside the loop).
denuminator, tempo = [], []
contributors = []

for i, song in enumerate(tabs):
    # Fetch the tab's own page for its chords, capo and detailed stats.
    chords, capo, data = get_chords(song['tab_url'])

    # Fields taken from the listing entry itself.
    songs.append(song['song_name'])
    artist.append(song['artist_name'])
    tonality.append(song['tonality_name'])
    votes.append(int(song['votes']))
    ratings.append(float(song['rating']))
    acoustic.append(int(song['recording']['is_acoustic']))

    # Fields taken from the tab page.
    progression.append(','.join(chords))
    capos.append(capo)
    date.append(int(data['data']['tab']['date']))

    tab_view = data['data']['tab_view']
    stats = tab_view['stats']
    views.append(int(stats['view_total']))
    favorites.append(int(stats['favorites_count']))
    comments.append(tab_view['comments'])
#     denuminator.append(json.loads(data['data']['tab_view']['encode_strummings'])['patterns'][0]['denuminator'])
#     tempo.append(json.loads(data['data']['tab_view']['encode_strummings'])['patterns'][0]['bpm'])
    contributors.append(len(tab_view['contributors']))

In [5]:
# Assemble the per-song lists into a single table.
# The key order below is the column order of the resulting frame.
df = pd.DataFrame(
    data={
        'song': songs,
        'artist': artist,
        'date': date,
        'votes': votes,
        'ratings': ratings,
        'views': views,
        'favorites': favorites,
        'comments': comments,
        'contributors': contributors,
        'acoustic': acoustic,
        'tonality': tonality,
        'capo': capos,
        'chords': progression,
    }
)

In [6]:
# Sanity check: (number of rows, number of columns) of the assembled frame.
df.shape

(1000, 13)

In [7]:
# Preview the first rows of the frame (head() shows five by default).
df.head()

Unnamed: 0,song,artist,date,votes,ratings,views,favorites,comments,contributors,acoustic,tonality,capo,chords
0,Hallelujah,Jeff Buckley,1121385600,29967,4.87396,26764899,303706,526,14,0,Db,1,"C,Am,C,Am,C,Am,C,Am,F,G,C,G,C,F,G,Am,F,G,E7,Am..."
1,All Of Me,John Legend,1369872001,14658,4.83602,22382892,340712,538,13,0,Em,1,"Em,C,G,D,Em,Cmaj7,G,D,Em,C,G,D,Em,Cmaj7,G,D,Em..."
2,Let Her Go,Passenger,1331596801,12024,4.84701,20743429,363720,392,24,0,Am,7,"G,F,G,Am,G,F,G,Am,G,F,G,Am,G,F,G,Am,F,C,G,Am,F..."
3,Im Yours,Jason Mraz,1521876577,9289,4.71162,18150659,283048,400,3,0,B,4,"G,D,Em,C,G,D,Em,C,G,D,Em,C,G,D,Em,C,G,D,Em,C,G..."
4,Perfect,Ed Sheeran,1488453515,20055,4.86194,18112916,308953,642,14,0,Ab,1,"G,G,Em,C,D,G,Em,C,D,G,Em,C,G,D,G,Em,C,D,Em,C,G..."


In [8]:
# Persist the scraped table without the index column.
# Create the target directory first so a missing `data/` folder does not
# throw away the whole scrape at the very last step.
output_path = 'data/top1000.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)