In [32]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime


In [69]:
# Listing page that links to every event detail page
EVENTS_MADEIRA_URL = 'https://eventsmadeira.com/en/event-listing/'
# Identifier written to the 'sourceId' column for every event scraped from this site
EVENTS_MADEIRA_ID = 1
# Seconds to wait for the server before giving up; without a timeout requests can block forever
REQUEST_TIMEOUT = 30

# Browser-like User-Agent sent with every request made by this notebook
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
response = requests.get(EVENTS_MADEIRA_URL, headers=headers, timeout=REQUEST_TIMEOUT)
# Fail loudly on 4xx/5xx instead of silently parsing an error page into zero events
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')
# One element per event card on the listing page
divs = soup.find_all('div', class_='gt-event-style-3')


In [62]:
def get_data_text(result):
    """Return the text of a scraped element, merging its `li` children into one string.

    Args:
        result: Parsed element exposing ``find_all`` and ``text``
            (a BeautifulSoup tag in this notebook).

    Returns:
        str: When the element holds more than one `li`, the distinct item texts
        joined with ``', '`` in the order they first appear; otherwise
        ``result.text`` unchanged.
    """
    list_items_text = [li.text for li in result.find_all('li')]
    if len(list_items_text) > 1:
        # dict.fromkeys drops repeated items but keeps first-seen order;
        # going through set() made the order differ from run to run.
        return ', '.join(dict.fromkeys(list_items_text))
    return result.text

def fill_datetime(info_container, data):
    """Append the event's start/end date and time to ``data``.

    Looks up the ``gt-start-date`` and ``gt-end-date`` list items inside
    ``info_container`` and parses the text of their ``gt-inner`` div, which is
    expected as ``dd/mm/YYYY HH:MM`` or as a date-only ``dd/mm/YYYY``.

    Args:
        info_container: Parsed element holding the event detail list items.
        data: Dict of lists with the keys ``start_date``, ``start_time``,
            ``end_date`` and ``end_time``; one value is appended to each.
            A missing or empty field is stored as ``'-'`` in both its columns;
            a date-only value gets the time ``00:00:00``.

    Raises:
        ValueError: If a non-empty value matches neither format. Nothing has
            been appended to ``data`` when this happens.
    """
    datetime_labels = [('start_date', 'start_time', 'gt-start-date'), ('end_date', 'end_time', 'gt-end-date')]

    # Parse everything first so a bad value cannot leave ``data`` half-filled
    parsed = []
    for date_label, time_label, class_name in datetime_labels:
        datetime_scrapped = info_container.find('li', {'class': class_name})
        inner = datetime_scrapped.find('div', {'class': 'gt-inner'}) if datetime_scrapped is not None else None
        # Collapse surrounding/repeated whitespace so markup indentation cannot break parsing
        parts = inner.text.split() if inner is not None else []

        if not parts:
            parsed.append((date_label, time_label, '-', '-'))
            continue

        # More than one token means a time part follows the date
        # (the previous "> 0" check was always true, so date-only values crashed)
        datetime_format = '%d/%m/%Y %H:%M' if len(parts) > 1 else '%d/%m/%Y'
        datetime_obj = datetime.strptime(' '.join(parts), datetime_format)
        parsed.append((date_label, time_label, datetime_obj.date(), datetime_obj.time()))

    for date_label, time_label, date_value, time_value in parsed:
        data[date_label].append(date_value)
        data[time_label].append(time_value)

def fill_description(page_content, data):
    """Append the event description text to ``data['description']``.

    The text of every child node yielded by iterating the ``gt-content`` div
    is joined and whitespace-normalised into a single line.

    Args:
        page_content: Parsed element that may contain a ``gt-content`` div.
        data: Dict of lists; one string is appended to ``data['description']``.
            ``'-'`` is stored when the div is missing or holds no text.
    """
    description = page_content.find('div', {'class': 'gt-content'})
    if description is None:
        data['description'].append('-')
        return

    raw_text = ' '.join(child.text for child in description)
    # split()/join turns newlines and runs of spaces into single spaces; simply
    # deleting '\n' glued words together and left leading/double spaces behind.
    data['description'].append(' '.join(raw_text.split()) or '-')

def fill_event_data(url, data, timeout=30):
    """Download one event detail page and append its fields to ``data``.

    Args:
        url: Address of the event detail page.
        data: Dict of lists with the keys ``title``, ``address``, ``location``,
            ``categories``, ``description`` and the date/time keys filled by
            ``fill_datetime``; one value is appended to each on success.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        ``'success'`` when the page was scraped, ``None`` when the page lacks
        the ``gt-content-detail-box`` or ``gt-page-content`` container (in that
        case ``data`` is left untouched).

    Network errors raised by ``requests.get`` are not caught here.
    """
    # ``headers`` is the notebook-level request header dict
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # get parts of page
    title_bar = soup.find('div', {'class': 'gt-page-title-bar'})
    page_content = soup.find('div', {'class': 'gt-page-content'})
    info_container = soup.find('div', {'class': 'gt-content-detail-box'})

    if info_container is None or page_content is None:
        return None

    def inner_div(class_name):
        """Return the gt-inner div of the info-box item with ``class_name``, or None."""
        item = info_container.find('li', {'class': class_name})
        return item.find('div', {'class': 'gt-inner'}) if item is not None else None

    # Work out every plain field before touching ``data`` so that a missing
    # nested node falls back to '-' instead of raising after some columns
    # were already appended.
    heading = title_bar.find('h1') if title_bar is not None else None
    address_div = inner_div('gt-address')
    location_div = inner_div('gt-locations')
    categories_div = inner_div('gt-categories')

    title = heading.text if heading is not None else '-'
    address = address_div.text if address_div is not None else '-'
    location = get_data_text(location_div) if location_div is not None else '-'
    categories = get_data_text(categories_div) if categories_div is not None else '-'

    fill_datetime(info_container, data)
    fill_description(page_content, data)

    data['title'].append(title)
    data['address'].append(address)
    data['location'].append(location)
    data['categories'].append(categories)
    return 'success'


In [67]:
# Columns of the resulting table; every scraped event appends one value to each
fields = ["title", "start_date", "end_date", "start_time", "end_time",  "location", "categories", 'address', 'description', 'sourceLink', 'sourceId']

data = {field: [] for field in fields}

for div in divs:
    # Resolve the detail-page link step by step so one malformed card is
    # skipped instead of aborting the whole scrape with an AttributeError/KeyError
    title_div = div.find('div', class_='gt-title')
    link = title_div.find('a') if title_div is not None else None
    url = link.get('href') if link is not None else None
    if not url:
        continue

    result = fill_event_data(url, data)
    # Only record the source columns when the detail page was actually scraped,
    # which keeps them aligned with the other columns
    if result:
        data['sourceLink'].append(url)
        data['sourceId'].append(EVENTS_MADEIRA_ID)

# Every column must have collected the same number of values
assert len({len(column) for column in data.values()}) == 1, "scraped columns have different lengths"


In [70]:
# Sanity check: report how many values each column of ``data`` collected
for key in data:
    value = data[key]
    print(f"Length of {key}: {len(value)}")


Length of title: 76
Length of start_date: 76
Length of end_date: 76
Length of start_time: 76
Length of end_time: 76
Length of location: 76
Length of categories: 76
Length of address: 76
Length of description: 76
Length of sourceLink: 76
Length of sourceId: 76


In [71]:
# Turn the scraped columns into a table (one row per event) and display it
df = pd.DataFrame(data=data)
df


Unnamed: 0,title,start_date,end_date,start_time,end_time,location,categories,address,description,sourceLink,sourceId
0,Arrival of Santa Claus’s Allegoric Parade,-,-,-,-,Câmara de Lobos,Christmas,-,"In the 2019 Christmas season, Câmara de Lobos...",https://eventsmadeira.com/en/event/arrival-of-...,1
1,Rampa dos Barreiros,-,-,-,-,Funchal,"Antiques, Sports",-,Organized by Clube de Automóveis Clássicos da...,https://eventsmadeira.com/en/event/rampa-dos-b...,1
2,Nativity scenes around the island,-,-,-,-,"Funchal, Santana, Câmara de Lobos, Porto Moniz...",Christmas,-,A nativity scene is a spontaneous creation wh...,https://eventsmadeira.com/en/event/nativity-sc...,1
3,“Waterline” exhibition,2023-06-20,2028-06-20,10:00:00,18:00:00,Funchal,Culture,Museu de História Natural,The Natural History Museum of Funchal hosts t...,https://eventsmadeira.com/en/event/waterline-e...,1
4,Conventual sweets Route,2023-11-03,2023-11-24,10:00:00,13:00:00,Funchal,Gastronomy,Convemto Santa Clara,The Santa Clara Convent is the starting point...,https://eventsmadeira.com/en/event/conventual-...,1
...,...,...,...,...,...,...,...,...,...,...,...
71,Madeira Wine Lounge,2024-08-29,2024-09-15,18:00:00,23:00:00,Funchal,Gastronomy,-,"The Madeira Wine Lounge, in Praça do Povo, is...",https://eventsmadeira.com/en/event/madeira-win...,1
72,Concerts in the Vineyards,2024-09-07,2024-09-15,17:00:00,18:00:00,"São Vicente, Calheta, Ribeira Brava","Music, Gastronomy",-,A series of musical concerts taking place in ...,https://eventsmadeira.com/en/event/concerts-in...,1
73,Columbus Festival,2024-09-19,2024-09-22,15:00:00,23:00:00,Porto Santo,"Antiques, Official Poster",-,Columbus festival it’s just one of the many g...,https://eventsmadeira.com/en/event/columbus-fe...,1
74,World Tourism Day,2024-09-27,2024-09-27,09:00:00,00:00:00,"Funchal, Santana, Porto Moniz, Ribeira Brava, ...",Other,-,"Madeira, whilst tourist destination par exc...",https://eventsmadeira.com/en/event/world-touri...,1
