In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from sqlalchemy import create_engine, MetaData, Table, select



In [20]:
# fields = ["title", "start_date", "end_date", "location",  'address', 'description', 'short_description', 'source_link', 'source_id', 'image_link', 'organizer', 'price', 'last_inserted_at', 'last_updated_at']
fields = ["title", "start_date", "end_date", "location", 'categories',  'address', 'description', 'short_description', 'source_link', 'source_id', 'image_link', 'organizer', 'price', 'last_inserted_at', 'last_updated_at']

empty_fields = ['organizer', 'price', 'short_description']

EVENTS_MADEIRA_URL = 'https://eventsmadeira.com/en/event-listing/'
EVENTS_MADEIRA_ID = 1
EMPTY_FIELD = None


In [21]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
response = requests.get(EVENTS_MADEIRA_URL, headers=headers)

soup = BeautifulSoup(response.text, 'html.parser')
divs = soup.find_all('div', class_='gt-event-style-3')


In [26]:
# get text from li elems and remove duplicities (mainly for gt-inner ul which could contain duplicities)
def get_data_text(result):
    list_items_text = [li.text for li in result.find_all('li')]
    if list_items_text and len(list_items_text) > 1:
        # remove duplicities
        return ', '.join(list(set(list_items_text)))
    return result.text

# fill data dict with date and time in datetime format
def fill_datetime(info_container, data):
    datetime_labels = [('start_date', 'gt-start-date'), ('end_date', 'gt-end-date')]
    for date_label, class_name in datetime_labels:
        datetime_scrapped = info_container.find('li', {'class': class_name})

        if datetime_scrapped is not None:
            datetime_ = datetime_scrapped.find('div', {'class': 'gt-inner'}).text
            datetime_obj = datetime.strptime(datetime_, '%d/%m/%Y %H:%M') if len(datetime_.split(' ')) > 0 else datetime.strptime(datetime_, '%d/%m/%Y')

            data[date_label].append(datetime_obj)

        else:
            data[date_label].append(EMPTY_FIELD)

# fill data dict with text from p tags from event detail page
def fill_description(page_content, data):
    description = page_content.find('div', {'class': 'gt-content'})
    data['description'].append(' '.join([p.text for p in description]).replace('\n', '') if description else EMPTY_FIELD)

# fill data dict with data from event detail page
def fill_event_data(url, data):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    # get parts of page
    title_bar = soup.find('div', {'class': 'gt-page-title-bar'})
    page_content = soup.find('div', {'class': 'gt-page-content'})
    info_container = soup.find('div', {'class': 'gt-content-detail-box'})

    if not info_container or not page_content:
        return None
    else: 
        fill_datetime(info_container, data)
        fill_description(page_content, data)

        title = title_bar.find('h1').text if title_bar else EMPTY_FIELD
        address = info_container.find('li', {'class': 'gt-address'}).find('div', {'class': 'gt-inner'}).text if info_container.find('li', {'class': 'gt-address'}) else EMPTY_FIELD
        location = get_data_text(info_container.find('li', {'class': 'gt-locations'}).find('div', {'class': 'gt-inner'})) if info_container.find('li', {'class': 'gt-locations'}) else EMPTY_FIELD
        categories = get_data_text(info_container.find('li', {'class': 'gt-categories'}).find('div', {'class': 'gt-inner'})) if info_container.find('li', {'class': 'gt-categories'}) else EMPTY_FIELD
        imageLink = page_content.find('div', {'class': 'gt-image'}).find('img')['data-src'] if page_content.find('div', {'class': 'gt-image'}) else EMPTY_FIELD

        data['title'].append(title)
        data['address'].append(address)
        data['location'].append(location)
        data['categories'].append(categories)
        data['image_link'].append(imageLink)

        data['source_link'].append(url)
        data['source_id'].append(EVENTS_MADEIRA_ID)
        data['last_inserted_at'].append(datetime.now())
        data['last_updated_at'].append(datetime.now())

        
        for field in empty_fields:
            data[field].append(EMPTY_FIELD)


In [27]:
# initialize dict with empty lists
data = {field: [] for field in fields}

for div in divs:
    url = div.find('div', class_='gt-title').find('a')['href']
    result = fill_event_data(url, data)


In [28]:
for key, value in data.items():
    print(f"Length of {key}: {len(value)}")


Length of title: 82
Length of start_date: 82
Length of end_date: 82
Length of location: 82
Length of categories: 82
Length of address: 82
Length of description: 82
Length of short_description: 82
Length of source_link: 82
Length of source_id: 82
Length of image_link: 82
Length of organizer: 82
Length of price: 82
Length of last_inserted_at: 82
Length of last_updated_at: 82


In [29]:
df = pd.DataFrame(data)
df = df.reset_index().rename(columns={'index': 'id'})

# dbConnection.close()

df


Unnamed: 0,id,title,start_date,end_date,location,categories,address,description,short_description,source_link,source_id,image_link,organizer,price,last_inserted_at,last_updated_at
0,0,Waterline exhibition,2023-06-20 10:00:00,2028-06-20 18:00:00,Funchal,Culture,Museu de História Natural,The Natural History Museum of Funchal hosts t...,,https://eventsmadeira.com/en/event/waterline-e...,1,https://eventsmadeira.com/oordypsa/2023/06/Mus...,,,2024-06-30 13:27:16.719386,2024-06-30 13:27:16.719392
1,1,Exhibit Siza & Oscar beyond the sea,2024-04-12 09:00:00,2024-09-13 17:30:00,Funchal,Culture,Fortaleza de São João Baptista do Pico,"The Fortress of São João Baptista do Pico, in...",,https://eventsmadeira.com/en/event/exhibit-siz...,1,https://eventsmadeira.com/oordypsa/2024/04/FF-...,,,2024-06-30 13:27:16.820302,2024-06-30 13:27:16.820308
2,2,Mandolin Orchestra Weekly Concerts,2024-06-07 21:00:00,2024-06-28 22:00:00,Funchal,"Music, Culture",Assembleia Legislativa Regional,Recognized as the oldest and youngest Mandoli...,,https://eventsmadeira.com/en/event/mandolin-or...,1,https://eventsmadeira.com/oordypsa/2022/09/289...,,,2024-06-30 13:27:16.998434,2024-06-30 13:27:16.998440
3,3,Teatro do Avesso presents – Comedy “Happy Hour”,2024-06-27 21:00:00,2024-06-30 20:00:00,"Calheta, Ponta do Sol",Theatre,,The new production by Associação Avesso has a...,,https://eventsmadeira.com/en/event/teatro-do-a...,1,https://eventsmadeira.com/oordypsa/2022/10/Com...,,,2024-06-30 13:27:17.133793,2024-06-30 13:27:17.133799
4,4,Saint Peter’s Festivities,2024-06-27 22:00:00,2024-07-01 23:59:00,Ribeira Brava,Arraiais,,Ribeira Brava’s headline event celebrates Sai...,,https://eventsmadeira.com/en/event/saint-peter...,1,https://eventsmadeira.com/oordypsa/2021/06/sao...,,,2024-06-30 13:27:17.244623,2024-06-30 13:27:17.244631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,77,Madeira Classic Car Revival,2025-05-23 18:00:00,2025-05-25 19:00:00,Funchal,Antiques,,"This event, which takes place at Praça do Pov...",,https://eventsmadeira.com/en/event/madeira-cla...,1,https://eventsmadeira.com/oordypsa/2022/05/Mad...,,,2024-06-30 13:27:25.315618,2024-06-30 13:27:25.315622
78,78,Pyromusical Shows,2025-06-06 23:00:00,2025-06-28 23:20:00,"Funchal, Porto Santo, Machico","Other, Music",Machico | Porto Santo | Funchal,"The music and fireworks combine harmoniously,...",,https://eventsmadeira.com/en/event/pyromusical...,1,https://eventsmadeira.com/oordypsa/2020/12/fes...,,,2024-06-30 13:27:25.436695,2024-06-30 13:27:25.436700
79,79,Summer Sunsets,2025-06-06 21:30:00,2025-06-29 00:00:00,Funchal,"Official Poster, Music, Culture, Gastronomy",Praça do Povo,"At this time, the weekends are more lively an...",,https://eventsmadeira.com/en/event/summer-suns...,1,https://eventsmadeira.com/oordypsa/2023/06/Sun...,,,2024-06-30 13:27:25.539224,2024-06-30 13:27:25.539228
80,80,Atlantic Festival,2025-06-06 19:00:00,2025-06-29 23:30:00,Funchal,"Other, Official Poster",,With 20 years of existence this is the offici...,,https://eventsmadeira.com/en/event/atlantic-fe...,1,https://eventsmadeira.com/oordypsa/2020/12/Sin...,,,2024-06-30 13:27:25.652826,2024-06-30 13:27:25.652831


In [18]:
alchemyEngine = create_engine('postgresql://default:qml3K7QroCMw@ep-round-field-a2kcdmjb-pooler.eu-central-1.aws.neon.tech:5432/verceldb?sslmode=require', pool_recycle=3600)
dbConnection = alchemyEngine.connect()


# df.to_sql('event', dbConnection, if_exists='append', index=False)


82

In [30]:
# get categories table from db

metadata = MetaData()

table = Table('category', metadata, autoload_with=alchemyEngine)
result = dbConnection.execute(select(table))

categories_df = pd.DataFrame(result, columns=table.columns.keys())
categories_df


Unnamed: 0,id,title,last_inserted_at,last_updated_at
0,1,Culture,2023-11-29 11:22:30.660,2023-11-29 11:22:30.629
1,2,Music,2023-11-29 12:08:27.435,2023-11-29 12:08:27.435
2,3,Christmas,2023-11-29 12:14:53.610,2023-11-29 12:14:53.610
3,4,Other,2023-11-29 12:24:58.757,2023-11-29 12:24:58.757
4,5,Arraiais,2023-11-29 12:37:03.950,2023-11-29 12:37:03.950
5,6,Gastronomy,2023-11-29 12:37:03.952,2023-11-29 12:37:03.952
6,7,Official Poster,2023-11-29 12:37:06.189,2023-11-29 12:37:06.189
7,8,Religious,2023-11-29 12:37:13.743,2023-11-29 12:37:13.743
8,9,Sports,2023-11-29 12:37:19.705,2023-11-29 12:37:19.705
9,10,Parade,2023-11-29 12:37:30.476,2023-11-29 12:37:30.476


In [31]:
import numpy as np

event_to_category_df = pd.DataFrame(columns=['event_id', 'category_id'])
data = []
for index, row in df.iterrows():
    categories = row['categories'].split(', ')
    id = row['id']
    for category in categories:
        category_id = categories_df[categories_df['title'] == category]['id'].values
        if len(category_id) > 0:
            data.append({'event_id': id, 'category_id': int(category_id[0])})

event_to_category_df = pd.DataFrame(data)
event_to_category_df['category_id'] = event_to_category_df['category_id'].astype('Int64')

event_to_category_df

Unnamed: 0,event_id,category_id
0,0,1
1,1,1
2,2,2
3,2,1
4,4,5
...,...,...
113,79,6
114,80,4
115,80,7
116,81,11


In [32]:
dbConnection.close()

In [35]:
alchemyEngine = create_engine('postgresql://default:qml3K7QroCMw@ep-round-field-a2kcdmjb-pooler.eu-central-1.aws.neon.tech:5432/verceldb?sslmode=require', pool_recycle=3600)
dbConnection = alchemyEngine.connect()

event_to_category_df.to_sql('event_to_category', dbConnection, if_exists='append', index=False)


118