In [2]:
# Open artist index
# Count how many pages there are
# Open the artist page
# 1. Check if there are music/artist tabs
# 1.1 If there are artist tabs ignore them (they probably exist elsewhere and will get picked up)
# 1.2 If there are music tabs ignore them (they probably exist elsewhere and will get picked up)
# 2. Check if there are multiple songs/tracks on the artist page
# 2.1 If it is a track, open the page, get info and write to csv (A single track will not contain a song duration)
# 2.2 if it is an album, open the page, get info and write to csv
# 3. Check if the artist page is itself the album page (the artist page will then contain only tracks and nothing else)
# 3.1 Get album information, and write it to a csv
# 4. Something bad most likely happened and you should skip and go to the next artist

##########################################################################################################################
##########################################################################################################################


##########################################################################################################################
# Define Functions to be Used
##########################################################################################################################
# GET_MAX_PAGES(soup)
# Get the maximum page value to use as the end point (This is probably a waste of a function since you only need it 1x)

def get_max_pages(soup):
    """Return the highest page number linked from the artist index page.

    Every ``<a rel="nofollow">`` element is inspected and its text is read as
    a page number. Links whose text is not an integer are ignored instead of
    aborting the whole scan.

    Args:
        soup: Parsed index page exposing ``find_all(name, attrs)``.

    Returns:
        The largest page number found, as an ``int``.

    Raises:
        ValueError: If no link with a numeric label is present.
    """
    page_numbers = []
    for link in soup.find_all("a", {"rel": "nofollow"}):
        try:
            page_numbers.append(int(link.text))
        except ValueError:
            # Not a page number; skip it rather than fail.
            continue

    if not page_numbers:
        raise ValueError("no numeric pagination links found on the index page")
    return max(page_numbers)


##########################################################################################################################
# GET_TAB_CHECK()
# This function will check if there are multiple tabs on the artist page.
# A return value of 0: at least one of the "/artists" or "/music" tab links is missing
# A return value of 1: both the "/artists" and the "/music" tab links are present

def get_tab_check(artist_soup):
    """Report whether an artist page carries both the artists and music tabs.

    Args:
        artist_soup: Parsed artist page exposing ``find(name, attrs)``. The
            parameter was previously misspelled, which made the body read a
            global of the same name instead of the argument.

    Returns:
        0 if the ``/artists`` link or the ``/music`` link (or both) is
        missing, 1 if both links are present.
    """
    artists_tab = artist_soup.find("a", {"href": "/artists"})
    music_tab = artist_soup.find("a", {"href": "/music"})

    # A single missing link is enough to treat the page as "no tabs".
    if artists_tab is None or music_tab is None:
        return 0
    return 1


##########################################################################################################################
# GET_SINGLE_ALBUM_DATA(soup)
# This function should be called when on an artists page and they only have a single album (multiple tracks)

def get_single_album_data(content_soup):
    """Collect the metadata of an album page (a release with a track list).

    Args:
        content_soup: Parsed album page exposing ``find`` and ``find_all``.

    Returns:
        A list ``[artist_name, album_name, artist_loc, album_date,
        artist_tags, artist_song, song_dur]`` where the last three entries
        are lists of strings. ``artist_loc`` and ``album_date`` are ``"N/A"``
        when missing, and a track row without a duration contributes
        ``"N/A"`` to ``song_dur``.

    Raises:
        AttributeError: If the artist name or album title element is absent.
    """
    # Artist name: drop newlines, then trim the surrounding whitespace.
    # (The trimmed value used to be computed and thrown away.)
    artist_name = content_soup.find("span", {"itemprop": "byArtist"}).text
    artist_name = artist_name.replace("\n", "").strip()

    # Album name
    album_name = content_soup.find("h2", {"class": "trackTitle"}).text
    album_name = album_name.replace("\n", "")

    # Artist location; a missing or empty element becomes "N/A"
    loc_tag = content_soup.find("span", {"class": "location secondaryText"})
    artist_loc = (loc_tag.text if loc_tag is not None else "") or "N/A"

    # Release date, converted from YYYYMMDD to MM/DD/YYYY
    try:
        raw_date = content_soup.find("meta", {"itemprop": "datePublished"})["content"]
        album_date = datetime.datetime.strptime(raw_date, "%Y%m%d").strftime("%m/%d/%Y")
    except (TypeError, KeyError, ValueError):
        # No <meta> element, no "content" attribute, or an unparsable value.
        album_date = "N/A"
    print(album_date)

    # Album tags
    artist_tags = [tag.text for tag in content_soup.find_all("a", {"class": "tag"})]

    # Song titles
    artist_song = [song.text for song in content_soup.find_all("span", {"itemprop": "name"})]

    # Song durations, one entry per track row
    song_dur = []
    for row in content_soup.find_all("div", {"class": "title"}):
        time_tag = row.find("span", {"class": "time secondaryText"})
        # Pass N/A if the song length is missing
        song_dur.append(time_tag.text if time_tag is not None else "N/A")

    return [artist_name, album_name, artist_loc, album_date, artist_tags, artist_song, song_dur]


##########################################################################################################################
# GET_SINGLE_TRACK_DATA(soup)
# This function should be called when on an artists page and they only have a single track instead of an album as a page

def get_single_track_data(content_soup):
    """Collect the metadata of a page that holds a single track.

    Args:
        content_soup: Parsed track page exposing ``find`` and ``find_all``.

    Returns:
        A list ``[artist_name, album_name, artist_loc, album_date,
        artist_tags, artist_song, song_dur]``. ``artist_song`` is the same
        string as ``album_name`` and ``song_dur`` is always ``"N/A"``.
        ``artist_loc`` and ``album_date`` are ``"N/A"`` when missing.

    Raises:
        AttributeError: If the artist name or track title element is absent.
    """
    # Artist name: drop newlines, then trim the surrounding whitespace
    artist_name = content_soup.find("span", {"itemprop": "byArtist"}).text
    artist_name = artist_name.replace("\n", "").strip()

    # Album name (for a single track this is the track title)
    album_name = content_soup.find("h2", {"class": "trackTitle"}).text
    album_name = album_name.replace("\n", "")

    # Artist location; a missing or empty element becomes "N/A"
    loc_tag = content_soup.find("span", {"class": "location secondaryText"})
    artist_loc = (loc_tag.text if loc_tag is not None else "") or "N/A"

    # Release date, converted from YYYYMMDD to MM/DD/YYYY
    try:
        raw_date = content_soup.find("meta", {"itemprop": "datePublished"})["content"]
        album_date = datetime.datetime.strptime(raw_date, "%Y%m%d").strftime("%m/%d/%Y")
    except (TypeError, KeyError, ValueError):
        # No <meta> element, no "content" attribute, or an unparsable value.
        album_date = "N/A"
    print(album_date)

    # Tags
    artist_tags = [tag.text for tag in content_soup.find_all("a", {"class": "tag"})]

    # The track name is kept identical to the album name, since the page
    # holds exactly one title.
    artist_song = album_name

    # TODO: the duration could not be located via BeautifulSoup on track
    # pages, so it is reported as unknown.
    song_dur = "N/A"

    return [artist_name, album_name, artist_loc, album_date, artist_tags, artist_song, song_dur]


##########################################################################################################################
# WRITE_TO_CSV(data)
# Self-explanatory

def write_to_csv(data, csv_writer=None):
    """Write one CSV row per song for a scraped release.

    Args:
        data: Sequence ``[artist, album, location, date, tags, songs,
            durations]``. ``songs`` is either a list of titles or a single
            title string; ``durations`` is either a list parallel to
            ``songs`` or a single string.
        csv_writer: Object with a ``writerow(dict)`` method. When omitted,
            the module-level ``writer`` is used.

    Returns:
        None. Nothing is written when ``songs`` is empty.
    """
    if csv_writer is None:
        csv_writer = writer

    artist, album, location, date, tags, songs, durations = data[:7]

    # Normalise both fields to lists so that a one-song album is written as
    # plain values rather than as the repr of a one-element list.
    songs = [songs] if isinstance(songs, str) else list(songs)
    durations = [durations] if isinstance(durations, str) else list(durations)

    for index, song in enumerate(songs):
        # Pages may list fewer durations than songs; pad with "N/A" instead
        # of failing part-way through the album.
        length = durations[index] if index < len(durations) else "N/A"
        csv_writer.writerow(
            {'Artist': artist, 'Album': album, 'Song': song, 'Length': length, 'Tags': tags, 'Date': date,
             'Location': location})


##########################################################################################################################
##########################################################################################################################

# Import all the necessary packages
import csv
import datetime
import string
import urllib.error
import urllib.parse
import urllib.request

import numpy as np
from bs4 import BeautifulSoup

##########################################################################################################################

# Start the main code

# First index page to scrape. NOTE(review): 2650 is presumably a resume point
# from an earlier run -- set to 1 for a full crawl.
START_PAGE = 2650


def _fetch_soup(url):
    """Download *url*, parse it with BeautifulSoup and close the response."""
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response, "html.parser")


def _scrape_artist_releases(artist_url, artist_soup):
    """Write every track/album of an artist without tabs to the CSV."""
    # Look for a grid with multiple albums/tracks
    post_check = artist_soup.find("ol", {"data-edit-callback": "/music_reorder"})

    if post_check is not None:
        # There are multiple posts: visit each one
        for n_posts in post_check.find_all("a", href=True):
            href = n_posts["href"]
            # Match whole path segments so that e.g. "/album/soundtrack"
            # is not mistaken for a track.
            if "/track/" in href:
                extract, marker = get_single_track_data, 1
            elif "/album/" in href:
                extract, marker = get_single_album_data, 2
            else:
                # Neither a track nor an album: nothing to record
                continue

            # urljoin copes with both relative and absolute hrefs
            content_soup = _fetch_soup(urllib.parse.urljoin(artist_url, href))
            print(marker)
            write_to_csv(extract(content_soup))
        return

    # There is one post, so open up the artist's releases page
    print(3)
    content_soup = _fetch_soup(urllib.parse.urljoin(artist_url, "/releases"))

    # Check if there are multiple tracks.
    # NOTE(review): this inspects the artist landing page rather than the
    # freshly fetched releases page -- confirm both show the same release.
    post_check2 = artist_soup.find("table", {"class": "track_list track_table"})

    if post_check2 is not None:
        # There are multiple tracks, so treat the page as an album
        data = get_single_album_data(content_soup)
    else:
        data = get_single_track_data(content_soup)
    write_to_csv(data)


artist_index_url = "https://bandcamp.com/artist_index"  # Set the url for all artists
soup = _fetch_soup(artist_index_url)  # Open the index and parse its html

max_pages = get_max_pages(soup)  # Find the maximum # pages so you know how long to run the for loop
currentDT = datetime.datetime.now().strftime("%Y-%m-%d_%H.%M.%S")

# Create/Open the CSV file you will be writing to.
# newline='' lets the csv module control line endings itself.
with open('Bandcamp_MASTER_' + str(currentDT) + '.csv', 'w', encoding='utf-8-sig', newline='') as csvfile:
    fieldnames = ['Artist', 'Album', 'Location', 'Song', 'Length', 'Tags', 'Date']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, lineterminator='\n')
    writer.writeheader()

    # Loop over every index page from START_PAGE up to and including max_pages
    for page in range(START_PAGE, max_pages + 1):
        # Build the query string from scratch each time; patching the previous
        # one character-by-character corrupts multi-digit page numbers.
        page_url = "?page=" + str(page)

        try:
            soup = _fetch_soup(artist_index_url + page_url)
            # Find the html section that includes all artist links
            artist_class = soup.find('ul', {"class": "item_list"})
            artist_links = artist_class.find_all('a', href=True)
        except Exception as err:
            print("Skipping index page", page, "-", err)
            continue

        for a_link in artist_links:
            artist_url = a_link["href"]  # the artist's url
            try:
                artist_soup = _fetch_soup(artist_url)
                print(artist_url)

                # Artists with tabs are skipped; they are expected to be
                # picked up elsewhere in the index.
                if get_tab_check(artist_soup) == 0:
                    _scrape_artist_releases(artist_url, artist_soup)
            except Exception as err:
                # Something went wrong with this artist: report it and move on
                # to the next artist instead of abandoning the whole page.
                print("Skipping artist", artist_url, "-", err)
                continue



https://martinparis.bandcamp.com
3
06/30/2011
https://whateverthatmeans.bandcamp.com
2
07/27/2016
2
12/19/2014
2
05/03/2014
2
05/03/2014
2
07/09/2011
2
04/03/2010
https://sexualreproductivesystems.bandcamp.com
3
02/05/2012
https://fairfaxalaska.bandcamp.com
3
06/29/2011
https://guerman.bandcamp.com
2
01/01/2016
2
09/02/2013
2
06/29/2011
https://katafuta.bandcamp.com
2
04/16/2014
2
03/29/2014
2
06/04/2013
https://magneticstripper.bandcamp.com
3
06/18/2009
https://simonkarolis.bandcamp.com
2
10/30/2013
1
06/02/2016
2
10/23/2014
2
12/25/2012
1
09/20/2012
https://nova-scotia.bandcamp.com
3
04/16/2009
https://dangrecords.bandcamp.com
2
05/15/2018
2
09/23/2017
2
08/26/2016
2
10/25/2016
2
07/21/2015
2
02/17/2012
2
03/16/2015
2
09/04/2012
2
01/01/2009
2
10/23/2012
2
03/24/2014
2
04/07/2012
2
05/01/2013
2
05/20/2012
2
10/15/2011
2
07/15/2011
2
06/19/2010
1
09/12/2011
1
09/06/2011
2
03/17/2012
https://ralphwaldo.bandcamp.com
3
12/12/1977
https://designbynv.bandcamp.com
2
05/03/2019
2
11/27/2018
