In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from sqlalchemy import create_engine, MetaData, Table, select



In [2]:
# fields = ["title", "start_date", "end_date", "location",  'address', 'description', 'short_description', 'source_link', 'source_id', 'image_link', 'organizer', 'price', 'last_inserted_at', 'last_updated_at']
fields = ["title", "start_date", "end_date", "location", 'categories',  'address', 'description', 'short_description', 'source_link', 'source_id', 'image_link', 'organizer', 'price', 'last_inserted_at', 'last_updated_at']

empty_fields = ['organizer', 'price', 'short_description']

EVENTS_MADEIRA_URL = 'https://eventsmadeira.com/en/event-listing/'
EVENTS_MADEIRA_ID = 1
EMPTY_FIELD = None


In [3]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
response = requests.get(EVENTS_MADEIRA_URL, headers=headers)

soup = BeautifulSoup(response.text, 'html.parser')
divs = soup.find('div', class_='gt-event-listing').find_all('div', class_='gt-event-style-3')


In [4]:
# get text from li elems and remove duplicities (mainly for gt-inner ul which could contain duplicities)
def get_data_text(result):
    list_items_text = [li.text for li in result.find_all('li')]
    if list_items_text and len(list_items_text) > 1:
        # remove duplicities
        return ', '.join(list(set(list_items_text)))
    return result.text

# fill data dict with date and time in datetime format
def fill_datetime(info_container, data):
    datetime_labels = [('start_date', 'gt-start-date'), ('end_date', 'gt-end-date')]
    for date_label, class_name in datetime_labels:
        datetime_scrapped = info_container.find('li', {'class': class_name})

        if datetime_scrapped is not None:
            datetime_ = datetime_scrapped.find('div', {'class': 'gt-inner'}).text
            datetime_obj = datetime.strptime(datetime_, '%d/%m/%Y %H:%M') if len(datetime_.split(' ')) > 0 else datetime.strptime(datetime_, '%d/%m/%Y')

            data[date_label].append(datetime_obj)

        else:
            data[date_label].append(EMPTY_FIELD)

# fill data dict with text from p tags from event detail page
def fill_description(page_content, data):
    description = page_content.find('div', {'class': 'gt-content'})
    data['description'].append(' '.join([p.text for p in description]).replace('\n', '') if description else EMPTY_FIELD)

# fill data dict with data from event detail page
def fill_event_data(url, data):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    # get parts of page
    title_bar = soup.find('div', {'class': 'gt-page-title-bar'})
    page_content = soup.find('div', {'class': 'gt-page-content'})
    info_container = soup.find('div', {'class': 'gt-content-detail-box'})

    if not info_container or not page_content:
        return None
    else: 
        fill_datetime(info_container, data)
        fill_description(page_content, data)

        title = title_bar.find('h1').text if title_bar else EMPTY_FIELD
        address = info_container.find('li', {'class': 'gt-address'}).find('div', {'class': 'gt-inner'}).text if info_container.find('li', {'class': 'gt-address'}) else EMPTY_FIELD
        location = get_data_text(info_container.find('li', {'class': 'gt-locations'}).find('div', {'class': 'gt-inner'})) if info_container.find('li', {'class': 'gt-locations'}) else EMPTY_FIELD
        categories = get_data_text(info_container.find('li', {'class': 'gt-categories'}).find('div', {'class': 'gt-inner'})) if info_container.find('li', {'class': 'gt-categories'}) else EMPTY_FIELD
        imageLink = page_content.find('div', {'class': 'gt-image'}).find('img')['data-src'] if page_content.find('div', {'class': 'gt-image'}) else EMPTY_FIELD

        data['title'].append(title)
        data['address'].append(address)
        data['location'].append(location)
        data['categories'].append(categories)
        data['image_link'].append(imageLink)

        data['source_link'].append(url)
        data['source_id'].append(EVENTS_MADEIRA_ID)
        data['last_inserted_at'].append(datetime.now())
        data['last_updated_at'].append(datetime.now())

        
        for field in empty_fields:
            data[field].append(EMPTY_FIELD)


In [5]:
# initialize dict with empty lists
data = {field: [] for field in fields}

for div in divs:
    url = div.find('div', class_='gt-title').find('a')['href']
    result = fill_event_data(url, data)


In [6]:
for key, value in data.items():
    print(f"Length of {key}: {len(value)}")


Length of title: 62
Length of start_date: 62
Length of end_date: 62
Length of location: 62
Length of categories: 62
Length of address: 62
Length of description: 62
Length of short_description: 62
Length of source_link: 62
Length of source_id: 62
Length of image_link: 62
Length of organizer: 62
Length of price: 62
Length of last_inserted_at: 62
Length of last_updated_at: 62


In [7]:
df = pd.DataFrame(data)
df = df.reset_index().rename(columns={'index': 'id'})

# dbConnection.close()

df


Unnamed: 0,id,title,start_date,end_date,location,categories,address,description,short_description,source_link,source_id,image_link,organizer,price,last_inserted_at,last_updated_at
0,0,Waterline exhibition,2023-06-20 10:00:00,2028-06-20 18:00:00,Funchal,Culture,Museu de História Natural,The Natural History Museum of Funchal hosts t...,,https://eventsmadeira.com/en/event/waterline-e...,1,https://eventsmadeira.com/oordypsa/2023/06/Mus...,,,2024-08-03 19:15:37.563972,2024-08-03 19:15:37.563977
1,1,Exhibit Siza & Oscar beyond the sea,2024-04-12 09:00:00,2024-09-13 17:30:00,Funchal,Culture,Fortaleza de São João Baptista do Pico,"The Fortress of São João Baptista do Pico, in...",,https://eventsmadeira.com/en/event/exhibit-siz...,1,https://eventsmadeira.com/oordypsa/2024/04/FF-...,,,2024-08-03 19:15:37.648644,2024-08-03 19:15:37.648650
2,2,Machico Gastronomic Week,2024-07-26 09:00:00,2024-08-04 00:00:00,Machico,"Music, Arraiais, Gastronomy",Machico,"Already in its 37th edition, the Machico Gast...",,https://eventsmadeira.com/en/event/machico-gas...,1,https://eventsmadeira.com/oordypsa/2023/05/Sem...,,,2024-08-03 19:15:41.155339,2024-08-03 19:15:41.155345
3,3,L Concerts,2024-07-27 22:00:00,2024-10-31 00:00:00,Ponta do Sol,Music,,"Estalagem da Ponta do Sol, in collaboration w...",,https://eventsmadeira.com/en/event/l-concerts/,1,https://eventsmadeira.com/oordypsa/2024/07/Est...,,,2024-08-03 19:15:41.264449,2024-08-03 19:15:41.264455
4,4,Madeira Wine Rally,2024-08-01 08:00:00,2024-08-03 23:30:00,"São Vicente, Calheta, Santana, Ribeira Brava, ...",Sports,,The Madeira Wine Rally attracts thousands of ...,,https://eventsmadeira.com/en/event/madeira-win...,1,https://eventsmadeira.com/oordypsa/2021/02/RVM...,,,2024-08-03 19:15:41.369092,2024-08-03 19:15:41.369098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,57,Pyromusical Shows,2025-06-06 23:00:00,2025-06-28 23:20:00,"Machico, Porto Santo, Funchal","Other, Music",Machico | Porto Santo | Funchal,"The music and fireworks combine harmoniously,...",,https://eventsmadeira.com/en/event/pyromusical...,1,https://eventsmadeira.com/oordypsa/2020/12/fes...,,,2024-08-03 19:15:49.676596,2024-08-03 19:15:49.676602
58,58,Summer Sunsets,2025-06-06 21:30:00,2025-06-29 00:00:00,Funchal,"Official Poster, Music, Culture, Gastronomy",Praça do Povo,"At this time, the weekends are more lively an...",,https://eventsmadeira.com/en/event/summer-suns...,1,https://eventsmadeira.com/oordypsa/2023/06/Sun...,,,2024-08-03 19:15:49.780119,2024-08-03 19:15:49.780138
59,59,Atlantic Festival,2025-06-06 19:00:00,2025-06-29 23:30:00,Funchal,"Other, Official Poster",,With 20 years of existence this is the offici...,,https://eventsmadeira.com/en/event/atlantic-fe...,1,https://eventsmadeira.com/oordypsa/2020/12/Sin...,,,2024-08-03 19:15:49.882593,2024-08-03 19:15:49.882599
60,60,Sixteenth Century Market,2025-06-06 21:00:00,2025-06-08 21:00:00,Machico,"Culture, Antiques",,"Machico hosts between June 6th to 8th, the 16...",,https://eventsmadeira.com/en/event/sixteenth-c...,1,https://eventsmadeira.com/oordypsa/2022/05/Mac...,,,2024-08-03 19:15:49.980411,2024-08-03 19:15:49.980417


In [8]:
# alchemyEngine = create_engine('postgresql://default:qml3K7QroCMw@ep-round-field-a2kcdmjb-pooler.eu-central-1.aws.neon.tech:5432/verceldb?sslmode=require', pool_recycle=3600)
# dbConnection6 = alchemyEngine.connect()


# df.to_sql('event', dbConnection, if_exists='append', index=False)


In [9]:
# get categories table from db

metadata = MetaData()

table = Table('category', metadata, autoload_with=alchemyEngine)
result = dbConnection6.execute(select(table))

categories_df = pd.DataFrame(result, columns=table.columns.keys())
categories_df


NameError: name 'alchemyEngine' is not defined

In [None]:
import numpy as np

event_to_category_df = pd.DataFrame(columns=['event_id', 'category_id'])
data = []
for index, row in df.iterrows():
    categories = row['categories'].split(', ')
    id = row['id']
    for category in categories:
        category_id = categories_df[categories_df['title'] == category]['id'].values
        if len(category_id) > 0:
            data.append({'event_id': id, 'category_id': int(category_id[0])})

event_to_category_df = pd.DataFrame(data)
event_to_category_df['category_id'] = event_to_category_df['category_id'].astype('Int64')

event_to_category_df

Unnamed: 0,event_id,category_id
0,0,1
1,1,1
2,2,6
3,2,5
4,2,2
...,...,...
105,70,4
106,71,11
107,71,1
108,72,11


In [None]:
# dbConnection9.close()

In [None]:
# from sqlalchemy import create_engine, MetaData, Table, select

# alchemyEngine = create_engine('postgresql://default:qml3K7QroCMw@ep-round-field-a2kcdmjb-pooler.eu-central-1.aws.neon.tech:5432/verceldb?sslmode=require', pool_recycle=3600)
# dbConnection9 = alchemyEngine.connect()

# dbConnection.close()
 
# event_to_category_df.to_sql('event_to_category', dbConnection9, if_exists='append', index=False)
# 

110