### Context

- Scrape the most popular cocktails for the years 2001-2022 from the website LEO's Bar, together with their respective ingredient lists.
- The data can also capture seasonality, because rankings are available for each month.
- http://bar.leo.org/hl/

In [116]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re

In [2]:
def get_html(html, timeout=30):
    """Download a web page and return it parsed by BeautifulSoup.

    Parameters
    ----------
    html : str
        URL of the page to fetch (despite the name, this is the address,
        not the markup itself).
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the scrape
        indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in ``"html.parser"``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.RequestException
        On connection errors or when the timeout expires.
    """
    response = requests.get(html, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as if they were data.
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

In [3]:
# Every yearly page lives at "http://bar.leo.org/hl/<four-digit year>".
# Build the URLs for 2001 through 2022 (range's upper bound is exclusive).
year_urls = [f"http://bar.leo.org/hl/{year}" for year in range(2001, 2023)]

In [None]:
# Exploratory fetch: collect every <table> element of the 2001 page.
tables = get_html("http://bar.leo.org/hl/2001").find_all("table")

In [140]:
def table_to_df(table_html, year):
    """Convert one ranking table into a DataFrame.

    Parameters
    ----------
    table_html : callable
        A parsed ``<table>`` element; calling it with a tag name ("tr",
        "td") must return the matching child elements, each exposing
        ``.text``.
    year : object
        Value stored unchanged in the "year" column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns "name", "visitors", "place", "month" and "year", one row
        per group of three body cells. Cell values are kept as the raw
        text of the cells.

    Raises
    ------
    IndexError
        If the first row's text is empty, or if the number of body cells
        is not a multiple of three.
    """
    # The month is taken as the last whitespace-separated token of the
    # first row's text.
    first_row = table_html("tr")[0]
    month = first_row.text.split()[-1]

    # Drop the first three and the last three cells (presumably header and
    # footer cells -- confirm against the page layout).
    cells = table_html("td")[3:-3]

    # Consecutive cells form one record: place, name, visitor count.
    records = [
        (cells[start].text, cells[start + 1].text, cells[start + 2].text)
        for start in range(0, len(cells), 3)
    ]
    n_records = len(records)

    return pd.DataFrame(
        {
            "name": [record[1] for record in records],
            "visitors": [record[2] for record in records],
            "place": [record[0] for record in records],
            "month": [month] * n_records,
            "year": [year] * n_records,
        }
    )

In [135]:
def scrape(year_urls):
    """Scrape every table of every yearly page into one DataFrame.

    Parameters
    ----------
    year_urls : iterable of str
        Page URLs whose last path segment is the year,
        e.g. "http://bar.leo.org/hl/2001".

    Returns
    -------
    pandas.DataFrame
        Columns "place", "name", "year", "month", "visitors". The rows of
        each table keep their own 0-based index, so index labels repeat
        across tables. An empty frame with these columns is returned when
        nothing was scraped.
    """
    columns = ["place", "name", "year", "month", "visitors"]

    # Collect the per-table frames and concatenate once at the end;
    # concatenating inside the loop would re-copy all previous rows each time.
    frames = []

    for url in year_urls:
        # The year is the last path segment; tolerate a trailing slash.
        year = url.rstrip("/").rsplit("/", 1)[-1]

        # Skip the first two and last two <table> elements of the page
        # (presumably layout tables -- confirm against the site).
        tables = get_html(url)("table")[2:-2]

        frames.extend(table_to_df(table, year) for table in tables)

    if not frames:
        return pd.DataFrame(columns=columns)

    # Select the columns explicitly to keep the documented column order.
    return pd.concat(frames)[columns]

In [137]:
# Run the scraper over all yearly URLs; this issues one HTTP request per URL.
data = scrape(year_urls)

260


In [146]:
# Work on a copy so the raw scrape result in `data` stays untouched.
data_mod = data.copy()
# Rows whose "month" came out as "PlatzDrinkBesucher" are relabelled "total"
# (presumably tables without a month in their first row -- confirm on the site).
is_header_label = data_mod["month"] == "PlatzDrinkBesucher"
data_mod["month"] = data_mod["month"].mask(is_header_label, "total")

In [148]:
# Persist the cleaned table as "data.csv" (relative path, default to_csv options).
data_mod.to_csv("data.csv")