TOOLS

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import re
#re is used below to match the festival links; pandas is imported in case you would like to visualise the data as a test

MAKE CSV

In [2]:
#Here you create the CSV file that stores the data you scrape later.
#You can rename the CSV to match the dataset you are scraping.
#The field names can be changed too, but they must then be changed everywhere else in the Notebook.
#Mode 'w' starts a fresh file, so re-running this cell never adds a second header row.
#newline='' lets the csv module control the line endings (avoids blank rows on Windows).
with open('Top100Benelux.csv', 'w', encoding="utf-8", newline='') as csvfile:
    fieldnames = ['name', 'country', 'genres', 'lineup_2017', 'lineup_2018', 'lineup_2019', 'other_lineups']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

ADD URL

In [3]:
#Here you add the URL that you will be scraping. This cell also creates the page soup.
URL = 'https://www.electronic-festivals.com/festivals/top100-ranking/benelux'

#The timeout keeps the cell from hanging forever on an unresponsive server (30 seconds is a generous, arbitrary limit).
#raise_for_status() stops with an error on a 4xx/5xx response instead of silently parsing an error page.
page = requests.get(URL, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")

GET TOP100

In [4]:
#The structure of the website made it awkward to tell the code to visit each festival link separately.
#The href of every festival link, however, already holds the path extension of that festival's page.
#This cell gathers those extensions; a later cell sticks them onto the base URL to build the pages to visit.
event_links = soup.find_all('a', attrs={'href': re.compile("/event")})
addons = [anchor.get('href') for anchor in event_links]

print(addons)

['/event/amsterdam-dance-event', '/event/tomorrowland', '/event/tomorrowland-weekend-2', '/event/mysteryland', '/event/defqon1', '/event/awakenings-festival', '/event/intents-festival', '/event/decibel-outdoor-festival', '/event/the-qontinent', '/event/extrema-outdoor-belgium', '/event/emporium-festival', '/event/rebirth-festival', '/event/supersized-kingsday', '/event/amsterdam-open-air', '/event/dgtl-festival', '/event/dominator-the-hardcore-festival', '/event/masters-of-hardcore-nl', '/event/sunrise-festival-belgium', '/event/loveland-festival', '/event/we-are-electric', '/event/dream-village', '/event/freshtival', '/event/so-whappy', '/event/7th-sunday-festival', '/event/airforce-festival', '/event/wish-outdoor-netherlands', '/event/dekmantel-festival', '/event/welcome-to-the-future', '/event/dreamfields', '/event/dourfestival', '/event/free-festival', '/event/phoenix-the-hardest-outdoor-festival', '/event/daydream-festival-netherlands', '/event/pukkelpop', '/event/reverze', '/even

FUNCTION FOR COLLECTING INFO

In [5]:
#This cell defines the main function used to scrape the Top100, plus two small helpers it relies on.
#Several steps print what they found so you can check the information is being collected;
#the print() calls can be commented out if they are not needed.
def _last_stripped_text(elements):
    """Return the whitespace-stripped text of the last element, or '' if there are none."""
    return elements[-1].text.strip() if elements else ''


def _split_lineups_by_year(editions):
    """Sort the artist lists of the scraped editions into year buckets.

    Args:
        editions: list of dicts with a 'date' string and an 'artists' list.

    Returns:
        A tuple (lineup2017, lineup2018, lineup2019, other_lineups) of lists.
    """
    lineup2017 = []
    lineup2018 = []
    lineup2019 = []
    other_lineups = []
    for edition in editions:
        #NOTE(review): a later edition of the same year replaces the earlier one,
        #whereas editions of any other year are accumulated - confirm this is intended.
        if '2017' in edition['date']:
            lineup2017 = edition['artists']
        elif '2018' in edition['date']:
            lineup2018 = edition['artists']
        elif '2019' in edition['date']:
            lineup2019 = edition['artists']
        else:
            other_lineups += edition['artists']
    return lineup2017, lineup2018, lineup2019, other_lineups


def collect_info(url, csvfile):
    """Scrape one festival page and append its details as a row to a CSV file.

    The name, country, genres and lineups are read from the page, printed
    (name, country and genres only) and written as a single row.

    Args:
        url: address of the festival page to scrape, as a string.
        csvfile: path of the CSV file to append to, as a string.

    Returns:
        None.

    Raises:
        requests.exceptions.RequestException: if the request times out, fails,
            or the server answers with a 4xx/5xx status.
    """
    #Use the `url` argument rather than the notebook-level URL variable,
    #so the function always scrapes the page it was asked for.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    #NAME
    #If the page holds several <h1> tags the last one is used.
    name = _last_stripped_text(soup.find_all('h1'))
    print(name)

    #COUNTRY
    country = _last_stripped_text(soup.find_all('div', {'class': 'country-name'}))
    print(country)

    #GENRES
    #The site groups similar genres together and separates them with ' // ',
    #so the text is split on that separator to get a clean list of genres.
    #When no genre block is found, genres stays an empty string.
    genres = ''
    for genre_block in soup.find_all('div', {'class': 'genrelist'}):
        genres = genre_block.text.strip().split(' // ')
    print(genres)

    #LINEUP
    #The HTML groups the date of a festival together with its artists.
    #For every date block we walk over the children of its parent: the text of each <div> child
    #is added to the date, and the text of each <a> child is collected as an artist.
    editions = []
    for date_div in soup.find_all('div', {'class': 'field-date'}):
        edition = {"date": "", "artists": []}
        for el in date_div.parent:
            if el.name == "div":
                edition['date'] += el.get_text()
            elif el.name == "a":
                edition['artists'].append(el.get_text())
        editions.append(edition)

    #Afterwards the artists are divided across lists based on the year the festival took place.
    lineup2017, lineup2018, lineup2019, other_lineups = _split_lineups_by_year(editions)

    #WRITE TO CSV
    #The data is appended to the csv file.
    #It is important to change the names of the fields here in case they were changed earlier on.
    #newline='' lets the csv module control the line endings (avoids blank rows on Windows).
    with open(csvfile, 'a', encoding="utf-8", newline='') as outfile:
        fieldnames = ['name', 'country', 'genres', 'lineup_2017', 'lineup_2018', 'lineup_2019', 'other_lineups']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writerow({'name': name, 'country': country, 'genres': genres, 'lineup_2017': lineup2017, 'lineup_2018': lineup2018, 'lineup_2019': lineup2019, 'other_lineups': other_lineups})

    return

SCRAPE

In [6]:
#This cell runs collect_info once for every extension gathered in addons,
#scraping each festival page of a top100 list on electronic-festivals.com.
#Each result is added to the CSV file you created earlier.
for add in addons:
    #The module-level URL is rebound on every pass, as well as being passed as the first argument.
    URL = link = 'https://www.electronic-festivals.com' + add
    csvfile = 'Top100Benelux.csv'
    collect_info(URL, csvfile)

Amsterdam Dance Event
Netherlands
['Electro', 'POP', 'Techno', 'Tech House', 'House', 'Deep House', 'Dubstep', 'Trap', 'Bass', 'Big Room', 'EDM', 'Trance', 'Drum & Bass', 'Chillout', 'Down Tempo', 'Hardcore', 'Hardstyle', 'Goa', 'Psytrance', 'Experimental']
Tomorrowland
Belgium
['Techno', 'Tech House', 'Big Room', 'EDM', 'Trance', 'House', 'Deep House', 'Drum & Bass', 'Dubstep', 'Trap', 'Bass', 'Hardcore', 'Hardstyle', 'Goa', 'Psytrance']
Tomorrowland - Weekend 2
Belgium
['Techno', 'Tech House', 'Big Room', 'EDM', 'Trance', 'House', 'Deep House', 'Drum & Bass', 'Dubstep', 'Trap', 'Bass', 'Hardcore', 'Hardstyle', 'Goa', 'Psytrance']
Mysteryland
Netherlands
['Big Room', 'EDM', 'Techno', 'Tech House', 'Hardcore', 'Hardstyle', 'Dubstep', 'Trap', 'Bass', 'House', 'Deep House', 'Chillout', 'Down Tempo', 'Drum & Bass', 'Trance']
Defqon1
Netherlands
['Hardcore', 'Hardstyle']
Awakenings Festival
Netherlands
['Techno', 'Tech House', 'House', 'Deep House']
Intents Festival
Netherlands
['Hardcore'