# Bandcamp Featured Album Web Scraping

This web-scraping project builds a dataset of Bandcamp's daily/weekly featured albums.

Bandcamp | Album of the Day: https://daily.bandcamp.com/album-of-the-day

In [1]:
import re
import requests
import numpy  as np
import pandas as pd

## Extract all html pages containing 30 albums per page

In [2]:
# Walk the paginated "Album of the Day" listing and keep the raw HTML of every page.
#
# page_num_list : starts at 1, then records the first `page=` number found on each fetched page
# html_list     : raw HTML of each listing page, in the order fetched
page_num_list = [1]
current_page_num = 1
html_list = []

while True:
    url = 'https://daily.bandcamp.com/album-of-the-day?page=' + str(current_page_num)
    response = requests.get(url, timeout=30)  # a stalled connection must not hang the notebook forever
    response.raise_for_status()               # fail loudly instead of storing an HTTP error page
    html = response.text
    html_list.append(html)
    current_page_num += 1

    # First `page=` number in the markup (on page 1 this is 2).
    # `\d+` rather than `\d{1,2}` so that page numbers >= 100 are read in full.
    page_link = re.search(r'(?<=page=)(\d+)', html)
    if page_link is None:
        # No pagination link at all: there is nothing further to follow.
        break
    page_num_list.append(int(page_link.group()))

    # Continue only while the linked page number keeps increasing.
    # NOTE(review): presumably the last page links only backwards, which ends the crawl - confirm.
    if page_num_list[-1] <= page_num_list[-2]:
        break

print("Number of Pages: ",len(html_list))

Number of Pages:  45


## Extract the URL path of each daily album

In [3]:
# Collect every album path found on the listing pages.
# dict.fromkeys drops duplicates while keeping first-seen order, replacing the
# quadratic `x not in album_path` scan and the side-effect list comprehension.
album_path = list(dict.fromkeys(
    path
    for html in html_list
    for path in re.findall(r'(/album-of-the-day/[a-z].+?)(?=")', html)
))

# Build the absolute URLs once, after all pages have been scanned.
album_url = ['https://daily.bandcamp.com' + x for x in album_path]

# Convert to a Numpy Array
album_url_array = np.array(album_url)

print(album_url_array[0:5])

['https://daily.bandcamp.com/album-of-the-day/michael-hurley-the-time-of-the-foxgloves-review'
 'https://daily.bandcamp.com/album-of-the-day/ehiorobo-joltjacket-review'
 'https://daily.bandcamp.com/album-of-the-day/sky-h1-azure-review'
 'https://daily.bandcamp.com/album-of-the-day/home-front-think-of-the-lie-review'
 'https://daily.bandcamp.com/album-of-the-day/bola-sete-samba-in-seattle-live-at-the-penthouse-review']


## Extract the HTML of each album-specific page

In [4]:
# Download the HTML of every album page.
# Pages are gathered in a plain list and converted once at the end: np.append
# copies the whole array on every call, which is quadratic over the full crawl.
album_html_pages = []

for album_page_url in album_url_array:
    # Timeout so that one unresponsive page cannot stall the whole crawl.
    # The status code is deliberately not checked: a failed page is still stored
    # here and is left for the extraction cells to handle.
    album_html_pages.append(requests.get(album_page_url, timeout=30).text)

# dtype=object stores each page as its own Python string; a fixed-width unicode
# array would pad every page to the length of the longest one.
album_html_array = np.array(album_html_pages, dtype=object)

print("Number of html pages: ",len(album_html_array))
print("-"*27)
album_html_array[0][0:600]

Number of html pages:  1322
---------------------------


'    \n\n<!DOCTYPE html>\n<html>\n\n<head>\n    <title>\n    Michael Hurley, “The Time Of The Foxgloves” | Bandcamp Daily\n</title>\n\n    \n    \n\n\n\n    <meta name="description" content="\nHomespun and celebratory, the folk icon&amp;#39;s newest album is a summer porch party translated to sound.\n">\n\n    \n        \n\n\n\n\n    <meta property="og:title" content="Michael Hurley, “The Time Of The Foxgloves”">\n\n\n    <meta property="og:description" content="Homespun and celebratory, the folk icon&#39;s newest album is a summer porch party translated to sound.">\n\n\n    <meta property="og:url" content="https://daily.ban'

## Extracting data points 
* Article Title (Artist and Album)
* Date Article Published
* Record Label of Artist
* Record Label Location
* Author of Article
* Album Tags (Genre)

In [5]:
# Article title: the raw <title>...</title> element of each album page,
# or "NA" when the page has no match. Markup is cleaned in a later cell.
title_pattern = re.compile(r'(<title>\s.*?)\s+(<\/title>)')

# Explicit None check instead of a bare `except:`, so that only "no match"
# becomes "NA" and genuine errors (or a keyboard interrupt) are not hidden.
title_matches = [title_pattern.search(page_html) for page_html in album_html_array]
title_array = np.array([m.group() if m is not None else "NA" for m in title_matches])

In [6]:
# Publication date: the text from "middot;" up to the closing </article tag,
# or "NA" when the page has no match. Markup is cleaned in a later cell.
date_pattern = re.compile(r'(middot;\s).*\s(\s.*</article)')

# Explicit None check instead of a bare `except:`, so that only "no match"
# becomes "NA" and genuine errors (or a keyboard interrupt) are not hidden.
date_matches = [date_pattern.search(page_html) for page_html in album_html_array]
date_array = np.array([m.group() if m is not None else "NA" for m in date_matches])

In [7]:
# Record label: the raw `class="artist-name"...` markup of each album page,
# or "NA" when the page has no match. Markup is cleaned in a later cell.
recordlabel_pattern = re.compile(r'(class="artist-name".*)(?=</a)')

# Explicit None check instead of a bare `except:`, so that only "no match"
# becomes "NA" and genuine errors (or a keyboard interrupt) are not hidden.
recordlabel_matches = [recordlabel_pattern.search(page_html) for page_html in album_html_array]
recordlabel_array = np.array([m.group() if m is not None else "NA" for m in recordlabel_matches])

In [8]:
# Record label location: the text between `location">` and the closing </div,
# or "NA" when the page has no match.
recordlabellocation_pattern = re.compile(r'(?<=location">)(\w.*)(?=</div)')

# Explicit None check instead of a bare `except:`, so that only "no match"
# becomes "NA" and genuine errors (or a keyboard interrupt) are not hidden.
recordlabellocation_matches = [
    recordlabellocation_pattern.search(page_html) for page_html in album_html_array
]
recordlabellocation_array = np.array(
    [m.group() if m is not None else "NA" for m in recordlabellocation_matches]
)

In [9]:
# Article author: the raw `contributors/...` markup of each album page,
# or "NA" when the page has no match. Markup is cleaned in a later cell.
author_pattern = re.compile(r'(contributors.*)(\w.+)(?=<)')

# Explicit None check instead of a bare `except:`, so that only "no match"
# becomes "NA" and genuine errors (or a keyboard interrupt) are not hidden.
author_matches = [author_pattern.search(page_html) for page_html in album_html_array]
author_array = np.array([m.group() if m is not None else "NA" for m in author_matches])

In [10]:
# Album tag (genre): the raw `article:tag" content="...` markup of each album page,
# or "NA" when the page has no match. Only the first match per page is kept.
tags_pattern = re.compile(r'article:tag.+(?=">)')

# Explicit None check instead of a bare `except:`, so that only "no match"
# becomes "NA" and genuine errors (or a keyboard interrupt) are not hidden.
tags_matches = [tags_pattern.search(page_html) for page_html in album_html_array]
tags_array = np.array([m.group() if m is not None else "NA" for m in tags_matches])

In [12]:
# Assemble the six scraped fields into one table: one row per album page,
# one column per field, in the order given by `column_names`.
column_names = ["Date","Title","Record_Label","Record_Label_Loc","Author","Tags"]
field_arrays = (
    date_array,
    title_array,
    recordlabel_array,
    recordlabellocation_array,
    author_array,
    tags_array,
)

# zip(*...) turns the per-field arrays into per-album rows.
data = np.array(list(zip(*field_arrays)))

df = pd.DataFrame(data, columns=column_names)
df.head(2)

Unnamed: 0,Date,Title,Record_Label,Record_Label_Loc,Author,Tags
0,"middot;\n December 17, 2021\n </article","<title>\n Michael Hurley, “The Time Of The ...","class=""artist-name""><a href=""https://michaelhu...","Astoria, Oregon","contributors/marc-masters"">Marc Masters","article:tag"" content=""Folk"
1,"middot;\n December 16, 2021\n </article","<title>\n Ehiorobo, “Joltjacket” | Bandcamp...","class=""artist-name""><a href=""https://ehiorobo....",New Jersey,"contributors/ann-derrick-gaillot"">Ann-Derrick ...","article:tag"" content=""Alternative"


In [13]:
# Clean Date Series
# Remove the leading "middot;" marker, trim, then drop everything from the
# first remaining line break onwards (the trailing "</article" fragment).
df["Date"] = (
    df['Date']
    .replace(r'middot;\n', '', regex=True)
    .str.strip()
    .replace(r'(\n .*)', '', regex=True)
)

In [14]:
# Clean Title Series
# Remove the opening tag and trim first, then delete each remaining fragment in turn.
cleaned_title = df['Title'].replace(r'(<title>)\n', '', regex=True).str.strip()

title_fragments = (
    r'( \| Bandcamp Daily\n</title>)',  # site suffix plus closing tag
    r'(amp;)',                          # tail of an escaped ampersand
    r'(Album of the Day: )',
    r'(Album of the Day, )',
    r'(Album of the Week: )',
)
for fragment in title_fragments:
    cleaned_title = cleaned_title.replace(fragment, '', regex=True)

df['Title'] = cleaned_title

In [15]:
# Clean Record Label Series
# Keep the text from the first "true" onwards (rows without it become NaN),
# then remove the `true">` marker itself.
label_from_marker = df['Record_Label'].str.extract(r'(true.*)', expand=False)
df['Record_Label'] = label_from_marker.replace(r'(true">)', '', regex=True)

In [16]:
# Clean Author Series
# Keep the text from the first ">" onwards (rows without it become NaN),
# then remove every ">" character.
author_from_bracket = df['Author'].str.extract(r'(>.*)', expand=False)
df['Author'] = author_from_bracket.replace(r'(>)', '', regex=True)

In [17]:
# Clean Tags Series
# Keep the text from the first "=" onwards (rows without it become NaN),
# then remove the quote and "=" characters left over from the attribute markup.
cleaned_tags = df['Tags'].str.extract(r'(=.*)', expand=False)
for leftover in (r"(')", r'(")', r'(=)'):
    cleaned_tags = cleaned_tags.replace(leftover, '', regex=True)

# Decode the HTML-escaped ampersand so the genre reads "R&B/Soul" rather than
# "R&amp;B/Soul" in the final data set.
df['Tags'] = cleaned_tags.str.replace('&amp;', '&', regex=False)

In [18]:
# Split Article Title into Artist and Album series
# Split on the first comma only: text before it is the artist, the rest is the album.
# Titles without a comma get no album value; a later cell fills those in.
title_parts = df['Title'].str.split(',', n=1, expand=True)
df[['Artist', 'Album']] = title_parts

In [19]:
# Clean Album series values
# Where the title had no comma, the split produced no album part; reuse the
# artist text for those rows. fillna covers both None and NaN and addresses
# the columns by name instead of by position.
df['Album'] = df['Album'].fillna(df['Artist'])

In [20]:
# If Artist = Album, then Artist should take Record Label Value
# Vectorised with named columns instead of a row loop over hard-coded positions,
# so the cell keeps working if the column order changes.
artist_equals_album = df['Artist'] == df['Album']
df.loc[artist_equals_album, 'Artist'] = df.loc[artist_equals_album, 'Record_Label']

In [21]:
# If Record Label = Artist, they are unsigned (Independent Artist)
# Vectorised with named columns instead of a row loop over hard-coded positions.
# Rows where either value is missing never compare equal and are left unchanged.
label_is_artist = df['Artist'] == df['Record_Label']
df.loc[label_is_artist, 'Record_Label'] = "Independent Artist"

In [22]:
# Clean Album Series
# Remove the opening and closing curly quotation marks around album names.
for quote_mark in (r'“', r'”'):
    df['Album'] = df['Album'].replace(quote_mark, '', regex=True)

In [23]:
# Row is ALL NaN
# Reassign rather than drop in place, and ignore an already-missing label, so
# re-running this cell does not raise KeyError.
# NOTE(review): 586 is specific to this scrape; a fresh scrape may put the
# empty row at a different index - confirm before reusing.
df = df.drop(index=586, errors="ignore")

df.isnull().any()

Date                False
Title               False
Record_Label         True
Record_Label_Loc    False
Author               True
Tags                 True
Artist               True
Album               False
dtype: bool

In [24]:
# Reorder DataFrame
# Title is dropped here; Artist and Album replace it.
# .copy() makes the result an independent frame, so later column assignments
# do not trigger SettingWithCopyWarning.
df = df[["Date","Album","Artist","Tags","Record_Label","Record_Label_Loc","Author"]].copy()

In [25]:
# Trim leading and trailing whitespace from every column.
for column_name in df.columns:
    trimmed = df[column_name].str.strip()
    df[column_name] = trimmed

# Clean Data Set

In [26]:
# Preview the first five rows of the cleaned data set.
df.head(n=5)

Unnamed: 0,Date,Album,Artist,Tags,Record_Label,Record_Label_Loc,Author
0,"December 17, 2021",The Time Of The Foxgloves,Michael Hurley,Folk,Independent Artist,"Astoria, Oregon",Marc Masters
1,"December 16, 2021",Joltjacket,Ehiorobo,Alternative,Independent Artist,New Jersey,Ann-Derrick Gaillot
2,"December 15, 2021",Azure,SKY H1,Ambient,Independent Artist,"Brussels, Belgium",Joe Muggs
3,"December 14, 2021",Think of the Lie,Home Front,Punk,LA VIDA ES UN MUS DISCOS,"London, UK",Jes Skolnik
4,"December 13, 2021",Samba in Seattle: Live at the Penthouse 1966​-...,Bola Sete,World,Tompkins Square,"San Francisco, California",Andy Beta


In [27]:
# Number of featured albums per tag, most common first.
albums_per_tag = df.groupby("Tags")["Album"].count()
albums_per_tag.sort_values(ascending=False)

Tags
Alternative     199
Electronic      171
Jazz             92
Hip-Hop/Rap      85
Experimental     82
Rock             68
World            62
Metal            58
Pop              48
R&amp;B/Soul     47
Punk             47
Ambient          38
Folk             20
Funk             17
Classical         9
Latin             7
Reggae            7
Country           4
Soundtrack        3
Devotional        2
Comedy            1
Blues             1
Acoustic          1
Name: Album, dtype: int64