# UltimateGuitarTabs Scraper

The following notebook scrapes the top 1000 most popular tabs posted on https://www.ultimate-guitar.com/

After scraping, it stores the data found into a SQLite3 database ('../data/UltimateGuitarTabs.db') with the following tables:
1. Tab_Data (id, song, artist, is_acoustic, tab_url)
    - id: Unique tab ID 
    - song: Name of song
    - artist: Name of artist
    - is_acoustic: Song is played acoustically (1), otherwise (0)
    - tab_url: URL to tab
2. Artists (name, url)
    - name: Name of artist
    - url: URL to artist profile
3. Hits (id, num_hits, votes, rating)
    - id: Unique tab ID
    - num_hits: # of times tab visited
    - votes: Number of votes received
    - rating: Avg rating out of 5
4. Chords (id, song, artist, tonality, capo, chords)
    - id: Unique tab ID
    - song: Name of song
    - artist: Name of artist
    - tonality: Original key of song
    - capo: Fret to place capo
    - chords: Comma-separated string of the full chord progression

Originally scraped on June 5, 2018
Rescraped on March 5, 2021

In [1]:
import sqlite3
import re
import json
import pandas as pd
import time
from bs4 import BeautifulSoup

In [2]:
def parse_data_content_from_url(url, max_retries=5):
    """
    This function parses the content on the given URL webpage and grabs the "data-content"
    from the <div class='js-store'> tag, decoded from JSON.

    At most `max_retries` attempts are made, sleeping 5 seconds between attempts.
    Any failure while fetching, locating the tag, or decoding the JSON counts as a
    failed attempt.

    Returns the decoded "data-content" object.
    Raises RuntimeError (chained to the last underlying error) once every attempt
    has failed, instead of retrying forever or silently returning None.
    """
    last_error = None
    for attempt_num in range(1, max_retries + 1):
        print('Attempting to reach link: {}'.format(url))
        try:
            html = simple_get(url)
            soup = BeautifulSoup(html, 'html.parser')
            # soup.find() returns None when the tag is missing; the subscript then
            # raises TypeError, which is treated like any other failed attempt.
            return json.loads(soup.find("div", {'class': 'js-store'})['data-content'])
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C / KeyboardInterrupt still
            # stops a long-running scrape.
            last_error = err
            print('Connection refused on server for link: {}'.format(url))

        if attempt_num < max_retries:
            print('Retrying for attempt {0}/{1} in 5 seconds'.format(attempt_num + 1, max_retries))
            time.sleep(5)

    raise RuntimeError(
        'Could not parse data-content from {0} after {1} attempts'.format(url, max_retries)
    ) from last_error
            
            
def parse_chords_from_url(url):
    """
    Scrapes and returns the sequences of
    chords as a list as well as the fret number
    to place a capo.

    Returns a tuple (chords, capo): `chords` is a list of chord strings in the
    order they appear in the tab content, and `capo` is the value stored under
    the tab's meta 'capo' key, or 0 when that key cannot be read.
    """
    data = parse_data_content_from_url(url)
    tab_view = data['store']['page']['data']['tab_view']
    content = tab_view['wiki_tab']['content']

    # Matching groups (open tag)(chord pitch)(base note {0 or 1})(chord type)(base note {0 or 1})(closing tag)
    # Raw string so the regex escapes (\[, \/, \d) are not treated as (invalid)
    # Python string escapes; the pattern itself is unchanged.
    # NOTE(review): the fourth group is a single character class, not an
    # alternation of suffixes, and the base-note groups require a trailing b/#,
    # so e.g. "[ch]C/G[/ch]" is not matched. Left as-is to keep the scraped
    # output stable; confirm whether that is intended.
    pattern = r"(\[ch\])([A-G]+)(\/[A-G]*[b#])*([(?m)|(?m\d)|(?b\d)|(?#\d)|(?maj\d)|(?add\d)|(?sus\d)|(?aug)|(?aug\d)|(?dim)|(?dim\d)]*)(\/[A-G]*[b#])*(\[\/ch\])"
    matches = re.findall(pattern, content)

    # Grabbing groups (chord pitch)(base note)(chord type)(base note); the
    # opening and closing tags are dropped. A new list is built rather than
    # overwriting the match list in place.
    cleaned_res = [
        pitch + bass_before + chord_type + bass_after
        for _open, pitch, bass_before, chord_type, bass_after, _close in matches
    ]

    # Grabbing Capo info; fall back to 0 when the tab's meta carries no capo entry.
    try:
        capo = tab_view['meta']['capo']
    except (KeyError, TypeError):
        capo = 0

    return(cleaned_res, capo)
            
def get_metadata_from_top_page(url):
    """
    Loads one listing page and returns its
    (tabs, hits) entries from the page data.
    """
    listing = parse_data_content_from_url(url)['store']['page']['data']['data']
    return listing['tabs'], listing['hits']

def get_multiple_pages(url, n):
    """
    Scrapes listing pages 1 through n of the
    given URL and returns the combined
    (tabs, hits) lists in page order.
    """
    all_tabs, all_hits = [], []

    for page_number in range(1, n + 1):
        page_url = url + "&page=" + str(page_number)
        page_tabs, page_hits = get_metadata_from_top_page(page_url)
        all_tabs.extend(page_tabs)
        all_hits.extend(page_hits)

    return all_tabs, all_hits

def parse_tab_fields(tab_obj, hit_obj):
    """
    Flattens one tab entry and its matching hit entry into a single dict,
    scraping the tab's own page for its chords and capo along the way.
    The chords are stored as one comma-separated string.
    """
    tab_url = tab_obj['tab_url']
    chord_list, capo_fret = parse_chords_from_url(tab_url)

    return dict(
        tab_id=int(tab_obj['id']),
        song_name=tab_obj['song_name'],
        artist=tab_obj['artist_name'],
        tonality=tab_obj['tonality_name'],
        votes=int(tab_obj['votes']),
        rating=float(tab_obj['rating']),
        is_acoustic=int(tab_obj['recording']['is_acoustic']),
        tab_url=tab_url,
        artist_url=tab_obj['artist_url'],
        hit_id=int(hit_obj['id']),
        hit_num=int(hit_obj['hits']),
        chords=','.join(chord_list),
        capo=capo_fret,
    )

def create_sqlite_connector(db='UltimateGuitarTabs.db'):
    """
    Opens the SQLite database file `db` under ../data/ and
    returns the (connection, cursor) pair for it.
    """
    db_path = '../data/{}'.format(db)
    connection = sqlite3.connect(db_path)
    return connection, connection.cursor()

def drop_tables_if_exists(con, cur):
    """
    Drops the Tab_Data, Artists, Hits and Chords tables when present,
    committing after each statement.
    """
    drop_statements = (
        "DROP TABLE IF EXISTS Tab_Data;",
        "DROP TABLE IF EXISTS Artists;",
        "DROP TABLE IF EXISTS Hits;",
        "DROP TABLE IF EXISTS Chords;",
    )
    for statement in drop_statements:
        cur.execute(statement)
        con.commit()
    
def create_tables(con, cur):
    create_tabs = """
        CREATE TABLE Tab_Data (
            id integer primary key,
            song text,
            artist text,
            is_acoustic integer,
            tab_url text
        )"""
    cur.execute(create_tabs)
    con.commit()    

    create_artists = """
        CREATE TABLE Artists (
            name text,
            url text
        )"""
    cur.execute(create_artists)
    con.commit()


    create_hits = """
        CREATE TABLE Hits (
            id integer primary key, 
            num_hits integer, 
            votes integer,
            rating float
        )"""
    cur.execute(create_hits)
    con.commit()    

    create_chords = """
        CREATE TABLE Chords (
        id integer primary key,
        song text,
        artist text,
        tonality text,
        capo integer,
        chords text
        )"""
    cur.execute(create_chords)  
    con.commit()
    
def insert_data(tab_dict, con, cur):
    """
    Inserts one scraped tab into the Tab_Data, Artists, Hits and Chords tables.

    tab_dict: dict with the keys 'tab_id', 'song_name', 'artist', 'is_acoustic',
              'tab_url', 'artist_url', 'hit_num', 'votes', 'rating', 'tonality',
              'capo' and 'chords'. 'chords' may be an already comma-joined string
              or an iterable of chord strings.
    con:      sqlite3 connection that owns `cur`.
    cur:      cursor used to execute the inserts.

    All four inserts run in a single transaction: they are committed together,
    or rolled back together if any of them raises (the sqlite3 error propagates).
    """
    # Only join when given a sequence of chords. Joining an already-joined string
    # would put a comma between every character ("Am,G" -> "A,m,,,G").
    chords = tab_dict['chords']
    if not isinstance(chords, str):
        chords = ','.join(chords)

    # Treat a missing/None capo as "no capo" rather than failing in int().
    capo = int(tab_dict['capo'] or 0)

    # Placeholders let sqlite3 handle quoting, so scraped text containing
    # apostrophes (e.g. "Don't ...") can neither break nor alter the statement.
    statements = (
        ("insert into Tab_Data (id,song,artist,is_acoustic,tab_url) VALUES (?,?,?,?,?)",
         (tab_dict['tab_id'], tab_dict['song_name'], tab_dict['artist'],
          tab_dict['is_acoustic'], tab_dict['tab_url'])),
        ("insert into Artists (name, url) VALUES (?,?)",
         (tab_dict['artist'], tab_dict['artist_url'])),
        ("insert into Hits (id, num_hits, votes, rating) VALUES (?,?,?,?)",
         (tab_dict['tab_id'], tab_dict['hit_num'],
          tab_dict['votes'], tab_dict['rating'])),
        ("insert into Chords (id, song, artist, tonality, capo, chords) VALUES (?,?,?,?,?,?)",
         (tab_dict['tab_id'], tab_dict['song_name'],
          tab_dict['artist'], tab_dict['tonality'],
          capo, chords)),
    )

    # The connection context manager commits on success and rolls back on error.
    with con:
        for sql, params in statements:
            cur.execute(sql, params)
    
def scrape_ultimate_guitar(url='https://www.ultimate-guitar.com/explore?order=hitstotal_desc&type[]=Chords', pages=20):
    """
    Rebuilds the SQLite tables from scratch and fills them with the tabs listed
    on the first `pages` listing pages of `url`.

    url:   listing URL to scrape; "&page=<k>" is appended for each page.
    pages: number of listing pages to scrape.

    The existing tables are dropped before scraping starts. The database
    connection is closed when the function exits, including on error.
    """
    con, cur = create_sqlite_connector()
    try:
        print('Dropping sqlite tables')
        drop_tables_if_exists(con, cur)

        print('Creating sqlite tables')
        create_tables(con, cur)

        print('Parsing pages')
        # Honour the caller's arguments instead of a hard-coded URL and page count.
        tabs_list, hits_list = get_multiple_pages(url, pages)

        print('Parsing tabs for each page')
        for i, tab_obj in enumerate(tabs_list):
            print('Parsing tab #{}'.format(i))
            # Tabs and hits are paired by position in their respective lists.
            tab_dict = parse_tab_fields(tab_obj, hits_list[i])
            insert_data(tab_dict, con, cur)
    finally:
        con.close()

In [3]:
# Run the scrape with the default arguments; this drops and recreates the tables before inserting.
scrape_ultimate_guitar()

Dropping sqlite tables
Creating sqlite tables
Parsing pages
Attempting to reach link: https://www.ultimate-guitar.com/explore?order=hitstotal_desc&type[]=Chords&page=1
Connection refused on server for link: https://www.ultimate-guitar.com/explore?order=hitstotal_desc&type[]=Chords&page=1
Retrying for attempt 2/10 in 5 seconds
Attempting to reach link: https://www.ultimate-guitar.com/explore?order=hitstotal_desc&type[]=Chords&page=1
Connection refused on server for link: https://www.ultimate-guitar.com/explore?order=hitstotal_desc&type[]=Chords&page=1
Retrying for attempt 3/10 in 5 seconds
Attempting to reach link: https://www.ultimate-guitar.com/explore?order=hitstotal_desc&type[]=Chords&page=1
Connection refused on server for link: https://www.ultimate-guitar.com/explore?order=hitstotal_desc&type[]=Chords&page=1
Retrying for attempt 4/10 in 5 seconds
Attempting to reach link: https://www.ultimate-guitar.com/explore?order=hitstotal_desc&type[]=Chords&page=1
Connection refused on server

KeyboardInterrupt: 