<a href="https://colab.research.google.com/github/MikolajKasprzyk/metal_archives_statistics/blob/main/metal_archives_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup for Google Colab: install the browsers/drivers and
# selenium used by the scraping cells below.
# Refresh the apt package index before installing anything.
!apt update
# NOTE(review): chromium-chromedriver is installed here, but the visible cells
# only drive Firefox - confirm whether this package is still needed.
!apt install chromium-chromedriver
!pip install selenium
# NOTE(review): second package-index refresh; looks redundant with the
# `apt update` above - confirm before removing.
!apt-get update
!apt install firefox-geckodriver
# Copy geckodriver and firefox into /usr/bin, which is where the scraping
# cells point the webdriver ('/usr/bin/geckodriver', '/usr/bin/firefox').
!cp /usr/lib/geckodriver /usr/bin
!cp /usr/lib/firefox /usr/bin

import string
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium import webdriver
import pandas as pd
import numpy as np
import re
import datetime
import pickle
import logging
import traceback

In [None]:
# SET PATH TO CURRENT FOLDER
path = '/content/drive/My Drive/Colab Notebooks/metal_archives/'

# SCRAPING LIST OF BAND'S NAMES AND URLS
#
# Result of this cell: `bands_urls`, a dataframe with the columns
# 'band_name' and 'band_url', one row per link found in the alphabetical
# band lists of Metal Archives.

# Because scraping takes time and occasionally fails, the result is cached in
# a file. Read the dataframe with band names and urls if it already exists:
try:
    bands_urls = pd.read_pickle(path + 'bands_urls.pkl')

# If the cached dataframe does not exist, start scraping.
# On Metal Archives there is an alphabetical list of bands. The url starts
# with what you see below and ends with a letter A-Z (or another token).
except FileNotFoundError as e:
    logging.error(f"File not found: {e.filename}")

    # beginning of the url with the alphabetical list - website specific
    list_url = "https://www.metal-archives.com/lists/"
    # the url ends with a letter A-Z depending on the first letter of the
    # band's name, which is specific for this website
    alphabet = list(string.ascii_uppercase)
    # besides letters there are urls for bands starting with other characters
    alphabet.extend(["NBR", "~"])

    # Set up the firefox webdriver for Google Colab. The browser is started
    # only when scraping is really needed, so a cached run does not leave an
    # idle Firefox process behind.
    binary = '/usr/bin/firefox'
    options = webdriver.FirefoxOptions()
    options.binary_location = binary
    options.add_argument('start-maximized')
    options.add_argument('--headless')
    # Selenium 4 takes the driver path through a Service object; the old
    # `executable_path` keyword of webdriver.Firefox is no longer accepted.
    driver = webdriver.Firefox(
        options=options,
        service=Service(executable_path='/usr/bin/geckodriver'))

    # SCRAPING NAMES AND URLS OF BANDS
    # Rows are collected in a plain list and turned into a dataframe once at
    # the end; growing a dataframe row by row copies it on every insert.
    scraped_rows = []
    try:
        # loop through all alphabetical urls
        for letter in alphabet:
            driver.get(list_url + letter)  # open website in the browser
            time.sleep(3)  # loading the website's content takes some time

            while True:
                # every link in the table body is a band name with its url
                for band in driver.find_elements(By.XPATH,
                                                 "//tbody/tr/td/a"):
                    scraped_rows.append(
                        (band.text, band.get_attribute("href")))

                # Every letter has several pages of bands. find_elements
                # returns an empty list when no element matches this exact
                # class, so no exception handling is needed to detect the
                # last page.
                next_buttons = driver.find_elements(
                    By.XPATH, "//a[@class='next paginate_button']")
                if not next_buttons:
                    # no more pages for this letter
                    break
                # there is a next page button, click it
                next_buttons[0].click()
                time.sleep(3)
    finally:
        # always release the browser, also when scraping fails half way
        driver.quit()

    # create dataframe with the url of every band in the alphabetical list;
    # with this list the data can be scraped piece by piece, tested easier etc
    bands_urls = pd.DataFrame(scraped_rows,
                              columns=['band_name', 'band_url'])

    # write dataframe to a pickle file to load it later - it took a long
    # time to scrape
    bands_urls.to_pickle(path + 'bands_urls.pkl')


In [None]:
# SCRAPING INFO ABOUT BANDS USING DATAFRAME WITH URLS CREATED ABOVE
#
# Result of this cell: `bands_info_df`, persisted to 'bands_info_df.pkl',
# holding one row per band url with the scraped band details, the list of
# artists and the discography table.

bands_urls.drop_duplicates(inplace=True)

# file used to persist progress so scraping can continue piece by piece
info_pickle = path + 'bands_info_df.pkl'
# number of consecutive failed bands after which the problem is assumed to be
# bigger than a single url and the webdriver is restarted
MAX_CONSECUTIVE_ERRORS = 2
# progress is written to file every SAVE_EVERY bands
SAVE_EVERY = 10


def _save_progress(saved_df, pending_df, target):
    """Append pending_df to saved_df, pickle the result to target, return it."""
    merged = pd.concat([saved_df, pending_df])
    merged.to_pickle(target)
    return merged


# The whole thing is in a while loop that ends when all urls were visited,
# because it is sometimes necessary to restart the webdriver.
scraping_finished = False
while not scraping_finished:
    print('Restarting webdriver...')
    # counts consecutive errors to determine whether the problem is a single
    # url or something more
    error_count = 0
    # setting up firefox webdriver for Google Colab
    binary = '/usr/bin/firefox'
    options = webdriver.FirefoxOptions()
    options.binary_location = binary
    options.add_argument('start-maximized')
    options.add_argument('--headless')
    # Selenium 4 takes the driver path through a Service object; the old
    # `executable_path` keyword of webdriver.Firefox is no longer accepted.
    driver = webdriver.Firefox(
        options=options,
        service=Service(executable_path='/usr/bin/geckodriver'))

    try:
        # read saved file if it exists, it is used to scrape piece by piece
        try:
            bands_info_df = pd.read_pickle(info_pickle)
            # the last band's info can be corrupted because the scraping
            # loop was interrupted, so it is dropped and scraped again
            bands_info_df.drop(bands_info_df.tail(1).index, inplace=True)
        except FileNotFoundError as e:
            logging.error(f"File not found: {e.filename}")
            # if there is no dataframe saved, create a new empty one with the
            # column needed to compute bands_missing_info
            bands_info_df = pd.DataFrame(columns=['band_url'])

        # SCRAPING BAND INFO
        try:
            # bands_missing_info = bands_urls minus the urls that are already
            # present in bands_info_df (read from file)
            bands_missing_info = bands_urls[
                ~bands_urls['band_url'].isin(bands_info_df['band_url'])]
            # Create df with these two columns up front, as a list cannot be
            # assigned to a cell while creating its column at the same time
            bands_info_scraped = pd.DataFrame(columns=['artists', 'discog'])

            for index, row in bands_missing_info.iterrows():
                band_name = row['band_name']
                band_url = row['band_url']

                # name and url are stored before scraping, so a band whose
                # page cannot be parsed still gets a (partial) row
                bands_info_scraped.at[index, 'band_name'] = band_name
                bands_info_scraped.at[index, 'band_url'] = band_url

                try:
                    # INFO SCRAPING
                    driver.get(band_url)
                    # needs to load page, also M-A asks 3s delay for scraping
                    time.sleep(3)
                    # press the button to show table with discography
                    driver.find_element(By.ID, "ui-id-6").click()
                    # get html content
                    page_source = driver.page_source
                    soup = BeautifulSoup(page_source, "lxml")

                    # the tag with all the data, as a list of <dd> entries
                    band_stats = soup.find(id="band_info").find_all("dd")

                    # Values are assigned one by one on purpose: if a later
                    # section of the page is missing, the fields scraped so
                    # far stay in the row. Entry 1 of band_stats is not used.
                    bands_info_scraped.at[index, 'country'] = \
                        band_stats[0].text.strip()
                    bands_info_scraped.at[index, 'status'] = \
                        band_stats[2].text.strip()
                    bands_info_scraped.at[index, 'formed_in'] = \
                        band_stats[3].text.strip()
                    bands_info_scraped.at[index, 'genre'] = \
                        band_stats[4].text.strip()
                    bands_info_scraped.at[index, 'lyrical_themes'] = \
                        band_stats[5].text.strip()
                    bands_info_scraped.at[index, 'current_label'] = \
                        band_stats[6].text.strip()
                    bands_info_scraped.at[index, 'years_active'] = \
                        band_stats[7].text.strip()

                    # ARTISTS
                    artists = soup.find(id="band_tab_members_current")
                    artist_tags = artists.find_all('a', class_='bold')
                    bands_info_scraped.at[index, 'artists'] = [
                        artist.text for artist in artist_tags]

                    # ALBUMS
                    # every row of the discography table becomes a list of
                    # its stripped cell texts
                    table = soup.find('table', class_='display discog')
                    bands_info_scraped.at[index, 'discog'] = [
                        [col.text.strip()
                         for col in table_row.find_all('td')]
                        for table_row in table.find_all('tr')]

                except Exception:
                    logging.error(traceback.format_exc())
                    error_count += 1
                    print('Problematic url:', str(band_url))
                    print('Index:  ', index)
                else:
                    # The counter is reset only after a success. Resetting
                    # it before the attempt would keep it from ever
                    # exceeding 1, so the restart below could never trigger.
                    error_count = 0

                if error_count >= MAX_CONSECUTIVE_ERRORS:
                    # several bands in a row failed: leave the for loop and
                    # restart the webdriver (beginning of the while loop)
                    break

                # save to file every SAVE_EVERY bands
                if len(bands_info_scraped) % SAVE_EVERY == 0:
                    bands_info_df = _save_progress(
                        bands_info_df, bands_info_scraped, info_pickle)
                    # clear bands_info_scraped for the next batch
                    bands_info_scraped = pd.DataFrame(
                        columns=['artists', 'discog'])
            else:
                # the for loop ran to the end (or there was nothing left to
                # scrape), so every missing band has been visited
                scraping_finished = True

            # Save what is still pending, both when finished and before a
            # restart. The next pass drops only the last saved row, so every
            # restart makes progress, and a failure on the very last band no
            # longer loses the unsaved batch.
            if len(bands_info_scraped) > 0:
                bands_info_df = _save_progress(
                    bands_info_df, bands_info_scraped, info_pickle)
                bands_info_scraped = pd.DataFrame(
                    columns=['artists', 'discog'])

        except Exception:
            logging.error(traceback.format_exc())
            # something outside a single band failed: start over with a
            # fresh webdriver and the data saved so far
            scraping_finished = False
    finally:
        # always close the browser before a restart or on exit, otherwise
        # every pass leaves another Firefox process running
        try:
            driver.quit()
        except Exception:
            logging.error(traceback.format_exc())