In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime, timedelta
from dateutil.parser import parse
import locale


In [2]:
COMMA = ','
TODAY = 'Today'
SPACE = ' '
DASH = ' - '

In [3]:
def parse_weekday(date):
    weekday = date.split(' ')[0]
    time = date.split(' ')[2]
    hour = int(time.split(':')[0])
    minute = int(time.split(':')[1])

    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    target_day = days.index(weekday)

    now = datetime.now()

    days_to_next_target_day = (target_day - now.weekday() + 7) % 7
    next_target_day = now + timedelta(days=days_to_next_target_day)

    next_target_day_at_specific_time = next_target_day.replace(hour=hour, minute=minute, second=0, microsecond=0)

    return next_target_day_at_specific_time


def parse_date(date): 
    if COMMA in date: 
        date_object = datetime.strptime(date, "%a, %b %d, %I:%M %p").replace(year=datetime.now().year)
    else: 
        if TODAY in date: 
            hour = int(date.split(' ')[2].split(':')[0])
            minute = int(date.split(' ')[2].split(':')[1])
            date_object = datetime.now().replace(hour=hour, minute=minute, second=0, microsecond=0)
        else:
            date_object = parse_weekday(date)
            
    return date_object

def parse_address(address):
    location = address.find('p', class_='location-info__address-text')
    event_location = location.text
    # remove button and location elements from div with address
    address.find('div', class_='map-button-toggle').decompose()
    location.decompose()
    event_address = address.text.strip()
    return event_address, event_location

def parse_event_date(event_date):
    locale.setlocale(locale.LC_TIME, 'en_US.UTF-8')
    if ' · ' in event_date:
        # format Saturday, August 3 · 6 - 11:59pm WEST
        if COMMA in event_date.split(' · ')[0]:
            event_month = event_date.split(' · ')[0].split(SPACE)[1]
            event_day = event_date.split(' · ')[0].split(SPACE)[2]
            event_month_end = event_date.split(' · ')[0].split(SPACE)[1]
            event_day_end = event_date.split(' · ')[0].split(SPACE)[2]
            event_start_time = event_date.split(' · ')[1].split(' - ')[0]
            event_end_time = event_date.split(' · ')[1].split(' - ')[1].split(SPACE)[0]
        # format August 20 · 7pm - August 21 · 2am WEST
        else:
            event_month = event_date.split(' · ')[0].split(SPACE)[0]
            event_day = event_date.split(' · ')[0].split(SPACE)[1]
            event_month_end = event_date.split(' · ')[1].split(DASH)[1].split(SPACE)[0]
            event_day_end = event_date.split(' · ')[1].split(DASH)[1].split(SPACE)[1]
            event_start_time = event_date.split(' · ')[1].split(' - ')[0]
            event_end_time = event_date.split(' · ')[2].split(SPACE)[0]

        time_postfix = None

        # print(event_month, event_day, event_start_time)
        # print(event_month_end, event_day_end, event_end_time)


        if 'am' in event_start_time or 'pm' in event_start_time:
            time_postfix = event_start_time[-2:]
            event_start_time = event_start_time[:-2]
        
        if 'am' in event_end_time or 'pm' in event_end_time:
            time_postfix = event_end_time[-2:]
            event_end_time = event_end_time[:-2]
        
        dt_string = event_month + SPACE + event_day + SPACE + str(datetime.now().year) + SPACE + event_start_time + SPACE + (time_postfix.upper() if time_postfix else "PM")
        dt_string_end = event_month_end + SPACE + event_day_end + SPACE + str(datetime.now().year) + SPACE + event_end_time + SPACE + (time_postfix.upper() if time_postfix else "PM")
        if ":" in event_start_time:
            date_object = datetime.strptime(dt_string, '%B %d %Y %I:%M %p')
        else:
            date_object = datetime.strptime(dt_string, '%B %d %Y %I %p')
        
        if ":" in event_end_time:
            date_object_end = datetime.strptime(dt_string_end, '%B %d %Y %I:%M %p')
        else:
            date_object_end = datetime.strptime(dt_string_end, '%B %d %Y %I %p')
    else:
        try:
            print(event_date.split(' - ')[0], event_date.split(' - ')[1])
            date_object = datetime.strptime(event_date.split(' - ')[0], "%a, %d %b %Y %H:%M")
            print(len(event_date.split(' - ')[1]) < 12)
            if len(event_date.split(' - ')[1]) < 12: 
                time = event_date.split(' - ')[1].split(SPACE)[0] + ':00'
                print(date_object)
                date_object = date_object.replace(hour=int(time.split(':')[0]), minute=int(time.split(':')[1]), second=0, microsecond=0)
                print(date_object)
                date_object_end = date_object
            else:
                date_object_end = datetime.strptime(event_date.split(' - ')[1][:-5], "%a, %d %b %Y %H:%M") if 'WEST' in event_date else datetime.strptime(event_date.split(' - ')[1][:-4], "%a, %d %b %Y %H:%M") if 'WET' in event_date or "EDT" in event_date else datetime.strptime(event_date.split(' - ')[1], "%a, %d %b %Y %H:%M")
        except:
            locale.setlocale(locale.LC_TIME, 'pt_PT.UTF-8')
            date_format = '%a, %d de %b de %Y %H:%M'
            if DASH in event_date:
                date_object = datetime.strptime(event_date.split(' - ')[0], date_format)
                date_object_end = datetime.strptime(event_date.split(' - ')[1][:-5], date_format) if 'WEST' in event_date else datetime.strptime(event_date.split(' - ')[1], date_format)
            else:
                date_object = datetime.strptime(event_date, date_format)
                date_object_end = date_object
            

    # event_end_time = event_date.split(' · ')[1].split(' - ')[1].split(SPACE)[0]
    print(event_date)
    print(date_object, date_object_end)
    return date_object, date_object_end

# print(parse_event_date('Saturday, July 27 · 3 - 11:30pm WEST'))
# print(parse_event_date('August 1 · 7pm - August 2 · 1am WEST'))
# print(parse_event_date('Fri, 29 Nov 2024 21:00 - Mon, 2 Dec 2024 23:30 WET'))
# print(parse_event_date('Thursday, July 18 · 6:30 - 9:30pm EDT'))
# print(parse_event_date('sáb, 10 de ago de 2024 22:00'))
# print(parse_event_date('sáb, 10 de ago de 2024 22:00 - dom, 11 de ago de 2024 03:00 WEST'))
# print(parse_date('Sat, Jul 27, 3:00 PM'))
# print(parse_date('Thursday at 4:00 PM'))

# print(parse_event_date('August 1 · 7pm - August 2 · 1am WEST'))
# print(parse_event_date('sáb, 10 de ago de 2024 22:00 - dom, 11 de ago de 2024 03:00 WEST'))
# print(parse_event_date('Saturday, August 3 · 6 - 11:59pm WEST'))
# print(parse_event_date('August 20 · 7pm - August 21 · 2am WEST'))
# print(parse_event_date('Fri, 29 Nov 2024 21:00 - Mon, 2 Dec 2024 23:30 WET'))
# print(parse_event_date('Thursday, August 15 · 6:30 - 9:30pm EDT'))
# print(parse_event_date('Tue, 6 Aug 2024 12:30 - 13:30 WEST'))


In [4]:
# todo: think about what to do with pagination
# todo: think about do we need EventCardUrgencySignal__label - going fast, sold out, almost full etc
EVENTBRITE_ID = 2 
EVENTBRITE_URL = 'https://www.eventbrite.com/d/portugal--ilha-da-madeira--85687345/all-events/'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
response = requests.get(EVENTBRITE_URL, headers=headers)
time.sleep(5)

soup = BeautifulSoup(response.text, 'html.parser')
events = soup.find_all('div', class_='discover-search-desktop-card discover-search-desktop-card--hiddeable')

events


[<div class="discover-search-desktop-card discover-search-desktop-card--hiddeable"><section class="DiscoverHorizontalEventCard-module__cardWrapper___2_FKN"><div class="Container_root__4i85v NestedActionContainer_root__1jtfr event-card event-card__horizontal horizontal-event-card__action-visibility" style="--ContainerBgColor:#fff;--ContainerBorderRadius:16px;--ContainerElevationFocusWithin:0px 2px 8px rgba(30, 10, 60, 0.06), 0px 4px 12px rgba(30, 10, 60, 0.08);--ContainerElevationHover:0px 2px 8px rgba(30, 10, 60, 0.06), 0px 4px 12px rgba(30, 10, 60, 0.08);--ContainerPadding:16px"><div data-testid="event-card-tracking-layer" style="position:absolute;top:0px;left:0px;height:100%;pointer-events:none;width:100%"></div><section class="horizontal-event-card__column" style="--HorizontalCardColumnMarginRight:24px"><a aria-label="View ´MADEIRADiG 2024" class="event-card-link" data-event-category="music" data-event-id="943637855227" data-event-location="Estreito da Calheta, Autonome Region Madei

In [5]:
fields = ["title", "start_date", "end_date", "location", 'address', 'description', 'short_description', 'source_link', 'source_id', 'image_link', 'organizer', 'price', 'last_inserted_at', 'last_updated_at']

data = {field: [] for field in fields}
event_links = []

for event in events: 
    event_card = event.find('a', class_='event-card-link')
    event_link = event_card.get('href')

    event_links.append(event_link)

In [6]:
import json

def get_price(soup):
    # Find all script tags and then find the one containing '__SERVER_DATA__'
    script_tags = soup.find_all('script')
    script_tag = None
    for tag in script_tags:
        if '__SERVER_DATA__' in tag.text:
            script_tag = tag
            break
    if script_tag:
        # Extract the JSON string
        json_str = script_tag.string.split('=', 1)[1].strip()
        # Load the JSON string into a Python dictionary
        data = json.loads(json_str[:-1])
        is_free = data['event_listing_response']['tickets']['ticketClasses'][0]['characteristics']['isFree']
        if is_free:
            minimum_ticket_price_rounded = 'Free'
        else:
            # Access the minimumTicketPriceRounded value
            minimum_ticket_price_rounded = data['event_listing_response']['tickets']['ticketClasses'][0]['cost']['display']
    else:
        minimum_ticket_price_rounded = None
    
    return minimum_ticket_price_rounded

def get_category(soup):
    # Find all script tags and then find the one containing '__SERVER_DATA__'
    script_tags = soup.find_all('script')
    script_tag = None
    for tag in script_tags:
        if '__SERVER_DATA__' in tag.text:
            script_tag = tag
            break
    if script_tag:
        # Extract the JSON string
        json_str = script_tag.string.split('=', 1)[1].strip()
        # Load the JSON string into a Python dictionary
        data = json.loads(json_str[:-1])
        category = data['event']['category']
    else:
        category = 'Other'
    return category


In [7]:
# fields = ["title", "start_date", "end_date", "location", 'address', 'description', 'short_description', 'source_link', 'source_id', 'image_link', 'organizer', 'price', 'last_inserted_at', 'last_updated_at']
fields = ["title", "start_date", "end_date", "categories", "location", 'address', 'description', 'short_description', 'source_link', 'source_id', 'image_link', 'organizer', 'price', 'last_inserted_at', 'last_updated_at']

data = {field: [] for field in fields}

for link in event_links:
    print(link)
    res = requests.get(link, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')

    event_title = soup.find('h1', class_='event-title').text
    event_price = soup.find('div', class_='conversion-bar__panel-info').text if soup.find('div', class_='conversion-bar__panel-info') else get_price(soup)

    address = soup.find('div', class_='location-info__address')
    event_address, event_location = parse_address(address)

    date = soup.find('span', class_='date-info__full-datetime').text
    print(date)
    event_start_date, event_end_date = parse_event_date(date)

    event_description = str(soup.find('div', class_='event-description__content'))
    event_short_description = None

    event_image_link = soup.find('div', class_='event-hero').find('picture').find('img').get('src')

    event_organizer = soup.find('a', class_='descriptive-organizer-info-mobile__name-link').text if soup.find('a', class_='descriptive-organizer-info-mobile__name-link') else None
    event_source_link = link
    event_source_id = EVENTBRITE_ID
    
    event_last_inserted_at = datetime.now()
    event_last_updated_at = datetime.now()

    event_categories = get_category(soup)

    data['title'].append(event_title)
    data['address'].append(event_address)
    data['location'].append(event_location)
    data['price'].append(event_price)
    data['start_date'].append(event_start_date)
    data['organizer'].append(event_organizer)

    data['end_date'].append(event_end_date)
    data['description'].append(event_description)
    data['short_description'].append(event_short_description)
    data['categories'].append(event_categories)
    data['image_link'].append(event_image_link)

    data['source_link'].append(event_source_link)
    data['source_id'].append(event_source_id)
    data['last_inserted_at'].append(event_last_inserted_at)
    data['last_updated_at'].append(event_last_updated_at)

    # print(event_title, event_price, event_address, event_location, event_start_date, event_image_link, event_organizer, event_source_link, event_source_id, event_last_inserted_at, event_last_updated_at)
    print(event_description)


https://www.eventbrite.co.uk/e/madeiradig-2024-tickets-943637855227?aff=ebdssbdestsearch
Fri, 29 Nov 2024 21:00 - Mon, 2 Dec 2024 23:30 WET
Fri, 29 Nov 2024 21:00 Mon, 2 Dec 2024 23:30 WET
False
Fri, 29 Nov 2024 21:00 - Mon, 2 Dec 2024 23:30 WET
2024-11-29 21:00:00 2024-12-02 23:30:00
<div class="has-user-generated-content event-description__content event-description__content--expanded" id="event-description"><div class="eds-l-mar-vert-6 eds-l-sm-mar-vert-4 eds-text-bm structured-content-rich-text"><div class="eds-text--left"><p><strong><strong>MADEIRADiG is back for 2024. Save the date: Thursday, 28th November — Tuesday, 3rd December. See you for our annual hideaway retreat.</strong></strong></p><p><strong><strong>While the rest of Europe is cold and dreary, experience some of the most formidable avant-garde musicians in the world and stay in a gorgeous Design Hotel — situated on a clifftop in the middle of the Atlantic Ocean. This intimate festival on the mountainous volcanic island 

In [8]:
for key, value in data.items():
    print(f"Length of {key}: {len(value)}")

Length of title: 9
Length of start_date: 9
Length of end_date: 9
Length of categories: 9
Length of location: 9
Length of address: 9
Length of description: 9
Length of short_description: 9
Length of source_link: 9
Length of source_id: 9
Length of image_link: 9
Length of organizer: 9
Length of price: 9
Length of last_inserted_at: 9
Length of last_updated_at: 9


In [9]:
df = pd.DataFrame(data)
df.index = pd.RangeIndex(start=200, stop=200 + len(df))
df = df.reset_index().rename(columns={'index': 'id'})

df

Unnamed: 0,id,title,start_date,end_date,categories,location,address,description,short_description,source_link,source_id,image_link,organizer,price,last_inserted_at,last_updated_at
0,200,´MADEIRADiG 2024,2024-11-29 21:00:00,2024-12-02 23:30:00,Music,MUDAS - Museu de Arte Contemporânea da Madeira,Estrada Simão Gonçalves Câmara 37 Ilha da Made...,"<div class=""has-user-generated-content event-d...",,https://www.eventbrite.co.uk/e/madeiradig-2024...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,MADEIRADiG - Festival for Digital Music & Art,€20,2024-08-03 18:57:05.092863,2024-08-03 18:57:05.092865
1,201,ILLUZION DANSU - Jin Garden (Funchal),2024-08-20 07:00:00,2024-08-21 02:00:00,Arts,Jin Garden,3 Avenida Sá Carneiro #2º 9000-017 Funchal Por...,"<div class=""has-user-generated-content event-d...",,https://www.eventbrite.pt/e/illuzion-dansu-jin...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,ILLUZION TECHNO,From €11.89,2024-08-03 18:57:06.887708,2024-08-03 18:57:06.887709
2,202,Summer Night Dance - Летний вечер & Танцы,2024-08-10 22:00:00,2024-08-11 03:00:00,Music,NOMADE Restaurante,108 Rua Imperatriz Dona Amelia 9000-018 Funcha...,"<div class=""has-user-generated-content event-d...",,https://www.eventbrite.de/e/bilhetes-summer-ni...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,Code coach,10.00 EUR,2024-08-03 18:57:08.397974,2024-08-03 18:57:08.397976
3,203,NÃO CONSIGO SER CORRUPTO - FÓRUM MACHICO,2024-09-07 21:00:00,2024-09-07 23:00:00,Arts,Auditorium Forum Machico,Praia de Machico 9200-108 Machico Portugal,"<div class=""has-user-generated-content event-d...",,https://www.eventbrite.pt/e/nao-consigo-ser-co...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,4Litro,10.00 EUR,2024-08-03 18:57:09.915214,2024-08-03 18:57:09.915215
4,204,Concertos L - Hatis Noit,2024-08-14 22:00:00,2024-08-14 23:30:00,Arts,Estalagem da Ponta do Sol,"Quinta da Rocinha, 06 9360 -529 Ponta do Sol P...","<div class=""has-user-generated-content event-d...",,https://www.eventbrite.com/e/concertos-l-hatis...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,Estalagem da Ponta do Sol,Free,2024-08-03 18:57:11.426558,2024-08-03 18:57:11.426559
5,205,Concertos L - André Santos convida Teresinha L...,2024-08-21 22:00:00,2024-08-21 23:30:00,Music,Estalagem da Ponta do Sol,"Quinta da Rocinha, 06 9360 -529 Ponta do Sol P...","<div class=""has-user-generated-content event-d...",,https://www.eventbrite.com/e/concertos-l-andre...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,Estalagem da Ponta do Sol,Free,2024-08-03 18:57:13.281737,2024-08-03 18:57:13.281741
6,206,Community Meditation & Inspiration Sessions,2024-08-06 13:30:00,2024-08-06 13:30:00,Health,Sangha Cowork Funchal,98 Rua da Casa Branca 9004-535 Funchal Portugal,"<div class=""has-user-generated-content event-d...",,https://www.eventbrite.pt/e/community-meditati...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,Sangha Cowork,Free,2024-08-03 18:57:15.023942,2024-08-03 18:57:15.023950
7,207,Free trial day,2024-08-05 21:30:00,2024-08-05 18:30:00,Community,Sangha Cowork Funchal,98 Rua da Casa Branca 9004-535 Funchal Portugal,"<div class=""has-user-generated-content event-d...",,https://www.eventbrite.pt/e/free-trial-day-tic...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,Sangha Cowork,Free,2024-08-03 18:57:17.369816,2024-08-03 18:57:17.369817
8,208,Beach REIA (Monthly Real Estate Meeting) Only ...,2024-08-15 18:30:00,2024-08-15 21:30:00,Business,Hooter's John's Pass,"192 Johns Pass, Boardwalk Pl W, Madeira Beach,...","<div class=""has-user-generated-content event-d...",,https://www.eventbrite.com/e/beach-reia-monthl...,2,https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc....,"Derick O, Mark L, and Rex G",Free,2024-08-03 18:57:19.015084,2024-08-03 18:57:19.015085


In [10]:
from sqlalchemy import create_engine, MetaData, Table, select
alchemyEngine = create_engine('postgresql://default:qml3K7QroCMw@ep-round-field-a2kcdmjb-pooler.eu-central-1.aws.neon.tech:5432/verceldb?sslmode=require', pool_recycle=3600)
dbConnection = alchemyEngine.connect()


In [66]:
# df.to_sql('event', dbConnection1, if_exists='append', index=False)

In [67]:
# dbConnection1.close()


In [11]:
# get categories table from db

metadata = MetaData()

table = Table('category', metadata, autoload_with=alchemyEngine)
result = dbConnection.execute(select(table))

categories_df = pd.DataFrame(result, columns=table.columns.keys())
categories_df




Unnamed: 0,id,title,last_inserted_at,last_updated_at
0,1,Culture,2023-11-29 11:22:30.660,2023-11-29 11:22:30.629
1,2,Music,2023-11-29 12:08:27.435,2023-11-29 12:08:27.435
2,3,Christmas,2023-11-29 12:14:53.610,2023-11-29 12:14:53.610
3,4,Other,2023-11-29 12:24:58.757,2023-11-29 12:24:58.757
4,5,Arraiais,2023-11-29 12:37:03.950,2023-11-29 12:37:03.950
5,6,Gastronomy,2023-11-29 12:37:03.952,2023-11-29 12:37:03.952
6,7,Official Poster,2023-11-29 12:37:06.189,2023-11-29 12:37:06.189
7,8,Religious,2023-11-29 12:37:13.743,2023-11-29 12:37:13.743
8,9,Sports,2023-11-29 12:37:19.705,2023-11-29 12:37:19.705
9,10,Parade,2023-11-29 12:37:30.476,2023-11-29 12:37:30.476


In [12]:
event_to_category_df = pd.DataFrame(columns=['event_id', 'category_id'])
data = []
for index, row in df.iterrows():
    categories = row['categories'].split(', ')
    id = row['id']
    for category in categories:
        category_id = categories_df[categories_df['title'] == category]['id'].values
        if len(category_id) > 0:
            data.append({'event_id': id, 'category_id': int(category_id[0])})

event_to_category_df = pd.DataFrame(data)
event_to_category_df['category_id'] = event_to_category_df['category_id'].astype('Int64')

event_to_category_df

Unnamed: 0,event_id,category_id
0,200,2
1,202,2
2,205,2
3,206,16
4,207,14
5,208,17


In [13]:
event_to_category_df.to_sql('event_to_category', dbConnection, if_exists='append', index=False)


IntegrityError: (psycopg2.errors.UniqueViolation) duplicate key value violates unique constraint "event_id_category_id_unique"
DETAIL:  Key (event_id, category_id)=(200, 2) already exists.

[SQL: INSERT INTO event_to_category (event_id, category_id) VALUES (%(event_id__0)s, %(category_id__0)s), (%(event_id__1)s, %(category_id__1)s), (%(event_id__2)s, %(category_id__2)s), (%(event_id__3)s, %(category_id__3)s), (%(event_id__4)s, %(category_id__4)s), (%(event_id__5)s, %(category_id__5)s)]
[parameters: {'event_id__0': 200, 'category_id__0': 2, 'event_id__1': 202, 'category_id__1': 2, 'event_id__2': 205, 'category_id__2': 2, 'event_id__3': 206, 'category_id__3': 16, 'event_id__4': 207, 'category_id__4': 14, 'event_id__5': 208, 'category_id__5': 17}]
(Background on this error at: https://sqlalche.me/e/20/gkpj)

In [14]:
dbConnection.close()

In [71]:
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.chrome.service import Service

# from time import sleep
# service = Service(executable_path='/Users/valeryiachyzhova/Desktop/chromedriver-mac-x64/chromedriver')
# options = webdriver.ChromeOptions()
# driver = webdriver.Chrome(service=service, options=options)

# driver.get('https://www.eventbrite.com/e/kaue-tickets-940285929527?aff=ebdssbdestsearch')
# sleep(100)

# page_source = driver.page_source
# soup = BeautifulSoup(page_source, 'html.parser')

# print(soup)

# driver.quit()

In [72]:
# get categories table from db

# metadata = MetaData()

# table = Table('event_to_category', metadata, autoload_with=alchemyEngine)
# result = dbConnection.execute(select(table))

# categories_df = pd.DataFrame(result, columns=table.columns.keys())
# categories_df

ResourceClosedError: This Connection is closed