In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime, timedelta
from dateutil.parser import parse
import locale


In [2]:
COMMA = ','
TODAY = 'Today'
SPACE = ' '
DASH = ' - '

In [29]:
def parse_weekday(date):
    weekday = date.split(' ')[0]
    time = date.split(' ')[2]
    hour = int(time.split(':')[0])
    minute = int(time.split(':')[1])

    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    target_day = days.index(weekday)

    now = datetime.now()

    days_to_next_target_day = (target_day - now.weekday() + 7) % 7
    next_target_day = now + timedelta(days=days_to_next_target_day)

    next_target_day_at_specific_time = next_target_day.replace(hour=hour, minute=minute, second=0, microsecond=0)

    return next_target_day_at_specific_time


def parse_date(date): 
    if COMMA in date: 
        date_object = datetime.strptime(date, "%a, %b %d, %I:%M %p").replace(year=datetime.now().year)
    else: 
        if TODAY in date: 
            hour = int(date.split(' ')[2].split(':')[0])
            minute = int(date.split(' ')[2].split(':')[1])
            date_object = datetime.now().replace(hour=hour, minute=minute, second=0, microsecond=0)
        else:
            date_object = parse_weekday(date)
            
    return date_object

def parse_address(address):
    location = address.find('p', class_='location-info__address-text')
    event_location = location.text
    # remove button and location elements from div with address
    address.find('div', class_='map-button-toggle').decompose()
    location.decompose()
    event_address = address.text.strip()
    return event_address, event_location

def parse_event_date(event_date):
    locale.setlocale(locale.LC_TIME, 'en_US.UTF-8')
    if ' · ' in event_date:
        # format Saturday, August 3 · 6 - 11:59pm WEST
        if COMMA in event_date.split(' · ')[0]:
            event_month = event_date.split(' · ')[0].split(SPACE)[1]
            event_day = event_date.split(' · ')[0].split(SPACE)[2]
            event_month_end = event_date.split(' · ')[0].split(SPACE)[1]
            event_day_end = event_date.split(' · ')[0].split(SPACE)[2]
            event_start_time = event_date.split(' · ')[1].split(' - ')[0]
            event_end_time = event_date.split(' · ')[1].split(' - ')[1].split(SPACE)[0]
        # format August 20 · 7pm - August 21 · 2am WEST
        else:
            event_month = event_date.split(' · ')[0].split(SPACE)[0]
            event_day = event_date.split(' · ')[0].split(SPACE)[1]
            event_month_end = event_date.split(' · ')[1].split(DASH)[1].split(SPACE)[0]
            event_day_end = event_date.split(' · ')[1].split(DASH)[1].split(SPACE)[1]
            event_start_time = event_date.split(' · ')[1].split(' - ')[0]
            event_end_time = event_date.split(' · ')[2].split(SPACE)[0]

        time_postfix = None

        # print(event_month, event_day, event_start_time)
        # print(event_month_end, event_day_end, event_end_time)


        if 'am' in event_start_time or 'pm' in event_start_time:
            time_postfix = event_start_time[-2:]
            event_start_time = event_start_time[:-2]
        
        if 'am' in event_end_time or 'pm' in event_end_time:
            time_postfix = event_end_time[-2:]
            event_end_time = event_end_time[:-2]
        
        dt_string = event_month + SPACE + event_day + SPACE + str(datetime.now().year) + SPACE + event_start_time + SPACE + (time_postfix.upper() if time_postfix else "PM")
        dt_string_end = event_month_end + SPACE + event_day_end + SPACE + str(datetime.now().year) + SPACE + event_end_time + SPACE + (time_postfix.upper() if time_postfix else "PM")
        if ":" in event_start_time:
            date_object = datetime.strptime(dt_string, '%B %d %Y %I:%M %p')
        else:
            date_object = datetime.strptime(dt_string, '%B %d %Y %I %p')
        
        if ":" in event_end_time:
            date_object_end = datetime.strptime(dt_string_end, '%B %d %Y %I:%M %p')
        else:
            date_object_end = datetime.strptime(dt_string_end, '%B %d %Y %I %p')
    else:
        try:
            print(event_date.split(' - ')[0], event_date.split(' - ')[1])
            date_object = datetime.strptime(event_date.split(' - ')[0], "%a, %d %b %Y %H:%M")
            print(len(event_date.split(' - ')[1]) < 12)
            if len(event_date.split(' - ')[1]) < 12: 
                time = event_date.split(' - ')[1].split(SPACE)[0] + ':00'
                print(date_object)
                date_object = date_object.replace(hour=int(time.split(':')[0]), minute=int(time.split(':')[1]), second=0, microsecond=0)
                print(date_object)
                date_object_end = date_object
            else:
                date_object_end = datetime.strptime(event_date.split(' - ')[1][:-5], "%a, %d %b %Y %H:%M") if 'WEST' in event_date else datetime.strptime(event_date.split(' - ')[1][:-4], "%a, %d %b %Y %H:%M") if 'WET' in event_date or "EDT" in event_date else datetime.strptime(event_date.split(' - ')[1], "%a, %d %b %Y %H:%M")
        except:
            locale.setlocale(locale.LC_TIME, 'pt_PT.UTF-8')
            date_format = '%a, %d de %b de %Y %H:%M'
            if DASH in event_date:
                date_object = datetime.strptime(event_date.split(' - ')[0], date_format)
                date_object_end = datetime.strptime(event_date.split(' - ')[1][:-5], date_format) if 'WEST' in event_date else datetime.strptime(event_date.split(' - ')[1], date_format)
            else:
                date_object = datetime.strptime(event_date, date_format)
                date_object_end = date_object
            

    # event_end_time = event_date.split(' · ')[1].split(' - ')[1].split(SPACE)[0]
    print(event_date)
    print(date_object, date_object_end)
    return date_object, date_object_end

# print(parse_event_date('Saturday, July 27 · 3 - 11:30pm WEST'))
# print(parse_event_date('August 1 · 7pm - August 2 · 1am WEST'))
# print(parse_event_date('Fri, 29 Nov 2024 21:00 - Mon, 2 Dec 2024 23:30 WET'))
# print(parse_event_date('Thursday, July 18 · 6:30 - 9:30pm EDT'))
# print(parse_event_date('sáb, 10 de ago de 2024 22:00'))
# print(parse_event_date('sáb, 10 de ago de 2024 22:00 - dom, 11 de ago de 2024 03:00 WEST'))
# print(parse_date('Sat, Jul 27, 3:00 PM'))
# print(parse_date('Thursday at 4:00 PM'))

# print(parse_event_date('August 1 · 7pm - August 2 · 1am WEST'))
# print(parse_event_date('sáb, 10 de ago de 2024 22:00 - dom, 11 de ago de 2024 03:00 WEST'))
# print(parse_event_date('Saturday, August 3 · 6 - 11:59pm WEST'))
# print(parse_event_date('August 20 · 7pm - August 21 · 2am WEST'))
# print(parse_event_date('Fri, 29 Nov 2024 21:00 - Mon, 2 Dec 2024 23:30 WET'))
# print(parse_event_date('Thursday, August 15 · 6:30 - 9:30pm EDT'))
# print(parse_event_date('Tue, 6 Aug 2024 12:30 - 13:30 WEST'))


In [4]:
# todo: think about what to do with pagination
# todo: think about do we need EventCardUrgencySignal__label - going fast, sold out, almost full etc
EVENTBRITE_ID = 2 
EVENTBRITE_URL = 'https://www.eventbrite.com/d/portugal--ilha-da-madeira--85687345/all-events/'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
response = requests.get(EVENTBRITE_URL, headers=headers)
time.sleep(5)

soup = BeautifulSoup(response.text, 'html.parser')
events = soup.find_all('div', class_='discover-search-desktop-card discover-search-desktop-card--hiddeable')

events


[<div class="discover-search-desktop-card discover-search-desktop-card--hiddeable"><section class="DiscoverHorizontalEventCard-module__cardWrapper___2_FKN"><div class="Container_root__4i85v NestedActionContainer_root__1jtfr event-card event-card__horizontal horizontal-event-card__action-visibility" style="--ContainerBgColor:#fff;--ContainerBorderRadius:16px;--ContainerElevationFocusWithin:0px 2px 8px rgba(30, 10, 60, 0.06), 0px 4px 12px rgba(30, 10, 60, 0.08);--ContainerElevationHover:0px 2px 8px rgba(30, 10, 60, 0.06), 0px 4px 12px rgba(30, 10, 60, 0.08);--ContainerPadding:16px"><div data-testid="event-card-tracking-layer" style="position:absolute;top:0px;left:0px;height:100%;pointer-events:none;width:100%"></div><section class="horizontal-event-card__column" style="--HorizontalCardColumnMarginRight:24px"><a aria-label="View BEAT BREAKER" class="event-card-link" data-event-category="music" data-event-id="952196313807" data-event-location="Funchal, Madeira" data-event-paid-status="paid

In [30]:
fields = ["title", "start_date", "end_date", "location", 'address', 'description', 'short_description', 'source_link', 'source_id', 'image_link', 'organizer', 'price', 'last_inserted_at', 'last_updated_at']

data = {field: [] for field in fields}
event_links = []

for event in events: 
    event_card = event.find('a', class_='event-card-link')
    event_link = event_card.get('href')

    event_links.append(event_link)

In [31]:
import json

def get_price(soup):
    # Find all script tags and then find the one containing '__SERVER_DATA__'
    script_tags = soup.find_all('script')
    script_tag = None
    for tag in script_tags:
        if '__SERVER_DATA__' in tag.text:
            script_tag = tag
            break
    if script_tag:
        # Extract the JSON string
        json_str = script_tag.string.split('=', 1)[1].strip()
        # Load the JSON string into a Python dictionary
        data = json.loads(json_str[:-1])
        is_free = data['event_listing_response']['tickets']['ticketClasses'][0]['characteristics']['isFree']
        if is_free:
            minimum_ticket_price_rounded = 'Free'
        else:
            # Access the minimumTicketPriceRounded value
            minimum_ticket_price_rounded = data['event_listing_response']['tickets']['ticketClasses'][0]['cost']['display']
    else:
        minimum_ticket_price_rounded = None
    
    return minimum_ticket_price_rounded

def get_category(soup):
    # Find all script tags and then find the one containing '__SERVER_DATA__'
    script_tags = soup.find_all('script')
    script_tag = None
    for tag in script_tags:
        if '__SERVER_DATA__' in tag.text:
            script_tag = tag
            break
    if script_tag:
        # Extract the JSON string
        json_str = script_tag.string.split('=', 1)[1].strip()
        # Load the JSON string into a Python dictionary
        data = json.loads(json_str[:-1])
        category = data['event']['category']
    else:
        category = 'Other'
    return category


In [33]:
fields = ["title", "start_date", "end_date", "location",  'address', 'description', 'short_description', 'source_link', 'source_id', 'image_link', 'organizer', 'price', 'last_inserted_at', 'last_updated_at']
data = {field: [] for field in fields}

for link in event_links:
    print(link)
    res = requests.get(link, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')

    event_title = soup.find('h1', class_='event-title').text
    event_price = soup.find('div', class_='conversion-bar__panel-info').text if soup.find('div', class_='conversion-bar__panel-info') else get_price(soup)

    address = soup.find('div', class_='location-info__address')
    event_address, event_location = parse_address(address)

    date = soup.find('span', class_='date-info__full-datetime').text
    print(date)
    event_start_date, event_end_date = parse_event_date(date)

    event_description = soup.find('div', class_='event-description__content')
    event_short_description = None

    event_image_link = soup.find('div', class_='event-hero').find('picture').find('img').get('src')

    event_organizer = soup.find('a', class_='descriptive-organizer-info-mobile__name-link').text if soup.find('a', class_='descriptive-organizer-info-mobile__name-link') else None
    event_source_link = link
    event_source_id = EVENTBRITE_ID
    
    event_last_inserted_at = datetime.now()
    event_last_updated_at = datetime.now()

    event_categories = get_category(soup)

    data['title'].append(event_title)
    data['address'].append(event_address)
    data['location'].append(event_location)
    data['price'].append(event_price)
    data['start_date'].append(event_start_date)
    data['organizer'].append(event_organizer)

    data['end_date'].append(event_end_date)
    data['description'].append(event_description)
    data['short_description'].append(event_short_description)
    # data['categories'].append(event_categories)
    data['image_link'].append(event_image_link)

    data['source_link'].append(event_source_link)
    data['source_id'].append(event_source_id)
    data['last_inserted_at'].append(event_last_inserted_at)
    data['last_updated_at'].append(event_last_updated_at)

    # print(event_title, event_price, event_address, event_location, event_start_date, event_image_link, event_organizer, event_source_link, event_source_id, event_last_inserted_at, event_last_updated_at)
    print(event_description)


https://www.eventbrite.pt/e/beat-breaker-tickets-952196313807?aff=ebdssbdestsearch
Saturday, August 3 · 6 - 11:59pm WEST
Saturday, August 3 · 6 - 11:59pm WEST
2024-08-03 18:00:00 2024-08-03 23:59:00
<div class="has-user-generated-content event-description__content event-description__content--expanded" id="event-description"><div class="eds-l-mar-vert-6 eds-l-sm-mar-vert-4 eds-text-bm structured-content-rich-text"><div class="eds-text--left"><p>Three House is partenering with 100horas and presenting a mini summer session on the 3rd of August between 6pm and 01 am.</p><p>Join us for an unforgettable evening at Three House Rooftop, where the vibrant energy of electronic music meets the breathtaking beauty of the 360º views. Grab a cocktail and enjoy as the regional and international DJs spin electrifying tracks that will keep you moving all night long.</p><p><br/></p><p>Artists will be announced soon.</p></div></div><div class="" data-testid="image-content"><div class="eds-l-mar-vert-6 ed

In [34]:
for key, value in data.items():
    print(f"Length of {key}: {len(value)}")

Length of title: 10
Length of start_date: 10
Length of end_date: 10
Length of location: 10
Length of address: 10
Length of description: 10
Length of short_description: 10
Length of source_link: 10
Length of source_id: 10
Length of image_link: 10
Length of organizer: 10
Length of price: 10
Length of last_inserted_at: 10
Length of last_updated_at: 10


In [35]:
df = pd.DataFrame(data)

df

Unnamed: 0,title,start_date,end_date,location,address,description,short_description,source_link,source_id,image_link,organizer,price,last_inserted_at,last_updated_at
0,BEAT BREAKER,2024-08-03 18:00:00,2024-08-03 23:59:00,THREE HOUSE HOTEL,2 Rua Brigadeiro Oudinot 9060-209 Funchal Port...,[[[<p>Three House is partenering with 100horas...,,https://www.eventbrite.pt/e/beat-breaker-ticke...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,Three House Hotel,€8 – €15,2024-08-03 08:18:47.647312,2024-08-03 08:18:47.647313
1,Summer Night Dance - Летний вечер & Танцы,2024-08-10 22:00:00,2024-08-11 03:00:00,NOMADE Restaurante,108 Rua Imperatriz Dona Amelia 9000-018 Funcha...,[[[<p>Summer Night Dance - Летний вечер &amp; ...,,https://www.eventbrite.de/e/bilhetes-summer-ni...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,Code coach,10.00 EUR,2024-08-03 08:18:49.037089,2024-08-03 08:18:49.037092
2,ILLUZION DANSU - Jin Garden (Funchal),2024-08-20 07:00:00,2024-08-21 02:00:00,Jin Garden,3 Avenida Sá Carneiro #2º 9000-017 Funchal Por...,[[[<p>Get ready for ILLUZION TECHNO on August ...,,https://www.eventbrite.pt/e/illuzion-dansu-jin...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,ILLUZION TECHNO,From €11.89,2024-08-03 08:18:50.363918,2024-08-03 08:18:50.363919
3,´MADEIRADiG 2024,2024-11-29 21:00:00,2024-12-02 23:30:00,MUDAS - Museu de Arte Contemporânea da Madeira,Estrada Simão Gonçalves Câmara 37 Ilha da Made...,[[[<p><strong><strong>MADEIRADiG is back for 2...,,https://www.eventbrite.co.uk/e/madeiradig-2024...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,MADEIRADiG - Festival for Digital Music & Art,€20,2024-08-03 08:18:51.953884,2024-08-03 08:18:51.953894
4,NÃO CONSIGO SER CORRUPTO - FÓRUM MACHICO,2024-09-07 21:00:00,2024-09-07 23:00:00,Auditorium Forum Machico,Praia de Machico 9200-108 Machico Portugal,[[[<p>O Faustino é político. todos os político...,,https://www.eventbrite.pt/e/nao-consigo-ser-co...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,4Litro,10.00 EUR,2024-08-03 08:18:52.700292,2024-08-03 08:18:52.700293
5,Concertos L - Hatis Noit,2024-08-14 22:00:00,2024-08-14 23:30:00,Estalagem da Ponta do Sol,"Quinta da Rocinha, 06 9360 -529 Ponta do Sol P...",[[[<p>Hatis Noit é uma vocalista japonesa vind...,,https://www.eventbrite.com/e/concertos-l-hatis...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,Estalagem da Ponta do Sol,Free,2024-08-03 08:18:54.023966,2024-08-03 08:18:54.023968
6,Concertos L - André Santos convida Teresinha L...,2024-08-21 22:00:00,2024-08-21 23:30:00,Estalagem da Ponta do Sol,"Quinta da Rocinha, 06 9360 -529 Ponta do Sol P...","[[[<p>“Este ano, para o palco mágico da Estala...",,https://www.eventbrite.com/e/concertos-l-andre...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,Estalagem da Ponta do Sol,Free,2024-08-03 08:18:55.287176,2024-08-03 08:18:55.287177
7,Community Meditation & Inspiration Sessions,2024-08-06 13:30:00,2024-08-06 13:30:00,Sangha Cowork Funchal,98 Rua da Casa Branca 9004-535 Funchal Portugal,[[[<p>🌿 <strong><strong>Community Meditation a...,,https://www.eventbrite.pt/e/community-meditati...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,Sangha Cowork,Free,2024-08-03 08:18:56.906879,2024-08-03 08:18:56.906886
8,Free trial day,2024-08-05 21:30:00,2024-08-05 18:30:00,Sangha Cowork Funchal,98 Rua da Casa Branca 9004-535 Funchal Portugal,[[[<p>We know there are always a lot of consid...,,https://www.eventbrite.pt/e/free-trial-day-tic...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,Sangha Cowork,Free,2024-08-03 08:18:59.088715,2024-08-03 08:18:59.088716
9,Beach REIA (Monthly Real Estate Meeting) Only ...,2024-08-15 18:30:00,2024-08-15 21:30:00,Hooter's John's Pass,"192 Johns Pass, Boardwalk Pl W, Madeira Beach,...",[[[<p><strong>Join us for our monthly real est...,,https://www.eventbrite.com/e/beach-reia-monthl...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,"Derick O, Mark L, and Rex G",Free,2024-08-03 08:19:00.586332,2024-08-03 08:19:00.586333


In [None]:
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.chrome.service import Service

# from time import sleep
# service = Service(executable_path='/Users/valeryiachyzhova/Desktop/chromedriver-mac-x64/chromedriver')
# options = webdriver.ChromeOptions()
# driver = webdriver.Chrome(service=service, options=options)

# driver.get('https://www.eventbrite.com/e/kaue-tickets-940285929527?aff=ebdssbdestsearch')
# sleep(100)

# page_source = driver.page_source
# soup = BeautifulSoup(page_source, 'html.parser')

# print(soup)

# driver.quit()