In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

import time

from concurrent.futures import ThreadPoolExecutor

In [11]:
# Root URL of the site being scraped. Links collected from this URL are
# labelled "sao-paulo" further down in this notebook.
BASE_URL = "https://www.blocosderua.com/"  # Default São Paulo


def get_cities_urls():
    """Collect the per-city URLs offered by the site's city selector.

    Downloads ``BASE_URL`` and reads the ``value`` attribute of every
    ``<option>`` inside the ``<select class="dms-select">`` element.

    Returns:
        list[str]: The non-empty option values, in document order. An empty
        list is returned (after printing a message) when the request fails,
        the status code is not 200, or the selector is missing, so callers
        can always treat the result as a list.
    """
    try:
        response = requests.get(BASE_URL, timeout=10)
    except requests.RequestException as e:
        print(f"Error: {e}")
        return []

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    select = soup.find("select", class_="dms-select")
    if select is None:
        print("Error: city selector not found")
        return []

    # .get() skips <option> tags that have no value attribute instead of
    # raising KeyError; empty values are filtered out as before.
    return [
        option["value"]
        for option in select.find_all("option")
        if option.get("value")
    ]


# Tolerate a failed lookup (None or empty result) so that the base URL is
# still scraped instead of crashing on `.append`.
cities_urls = list(get_cities_urls() or [])

# The base URL is scraped as well; skip it if the selector already listed it
# so the same pages are not requested twice.
if BASE_URL not in cities_urls:
    cities_urls.append(BASE_URL)

In [22]:
def get_city_events_links(city_url):
    """Collect the event page links listed for one city.

    Resolves the city's full listing URL with ``get_city_full_page_url`` and
    then requests ``?paged=1``, ``?paged=2``, ... until a page fails to load,
    contains no event links, or only repeats links that were already seen.

    Args:
        city_url: Landing page URL of the city.

    Returns:
        list[str]: ``href`` values of the ``<a class="card">`` elements whose
        link contains ``"programacao/"``, in the order they were found. Empty
        when the full listing URL cannot be resolved.
    """
    city_events_links = []

    full_page_url = get_city_full_page_url(city_url)
    if not full_page_url:
        return city_events_links

    seen_links = set()
    page = 1
    while True:
        target_page = f"{full_page_url}?paged={page}&data=&bairro="
        print("URL:", target_page)

        try:
            response = requests.get(target_page, timeout=10)
        except requests.RequestException as e:
            # Keep what was collected so far instead of losing it to a
            # network error on a later page.
            print("Erro:", e)
            break

        if response.status_code != 200:
            print("Erro Status:", response.status_code)
            break

        soup = BeautifulSoup(response.text, "html.parser")
        # .get() tolerates <a class="card"> elements without an href.
        event_links = [
            card["href"]
            for card in soup.find_all("a", class_="card")
            if "programacao/" in card.get("href", "")
        ]

        # Stop on an empty page, and also when a page brings nothing new:
        # if the site answered out-of-range page numbers with an already
        # served page, the loop would otherwise never terminate.
        if not event_links or seen_links.issuperset(event_links):
            break

        city_events_links.extend(event_links)
        seen_links.update(event_links)
        page += 1
        time.sleep(0.5)  # pause between consecutive page requests

    return city_events_links


def get_city_full_page_url(city_url):
    """Find the URL of a city's full event listing page.

    Downloads ``city_url`` and returns the ``href`` of the first
    ``<a class="btn">`` element whose link contains ``city_url``.

    Args:
        city_url: Landing page URL of the city.

    Returns:
        str | None: The matching link, or ``None`` when the request fails,
        the status code is not 200, or no button link matches.
    """
    try:
        response = requests.get(city_url, timeout=10)
    except requests.RequestException as e:
        print("Erro:", e)
        return None

    if response.status_code != 200:
        print("Erro Status:", response.status_code)
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # next(..., None) instead of `[...][0]`: indexing an empty list raised
    # IndexError before the "no match" branch could ever return None.
    # .get() tolerates button anchors that carry no href.
    full_page_url = next(
        (
            link["href"]
            for link in soup.find_all("a", class_="btn")
            if city_url in link.get("href", "")
        ),
        None,
    )

    return full_page_url or None

In [None]:
# Every city's links are accumulated in one CSV file, which is rewritten
# after each city is scraped.
path = "../data/event_links.csv"
os.makedirs(os.path.dirname(path), exist_ok=True)

for url in cities_urls:
    event_links = get_city_events_links(url)

    # The last path segment is the city slug. rstrip("/") keeps this correct
    # for URLs with or without a trailing slash.
    city = url.rstrip("/").split("/")[-1]
    if "www" in city:
        # Only the host name is left, i.e. this is the base URL (São Paulo).
        city = "sao-paulo"

    df_city = pd.DataFrame(
        {"city": [city] * len(event_links), "event_link": event_links}
    )

    # Merge with what is already on disk and keep one row per event link.
    frames = [pd.read_csv(path), df_city] if os.path.exists(path) else [df_city]
    df_links = pd.concat(frames, ignore_index=True).drop_duplicates(
        subset="event_link"
    )

    # Rewrites the whole file, header included, with the merged rows.
    df_links.to_csv(path, mode="w", header=True, index=False)

In [42]:
def fetch_event_page(city, url, output_dir):
    """Download one event page and cache its HTML on disk.

    The page is stored as ``{output_dir}/{city}/{slug}.html``, where ``slug``
    is the last path segment of ``url``. Nothing is downloaded when that file
    already exists. Request failures are printed, not raised.

    Args:
        city: City slug, used as the sub-directory name.
        url: Event page URL.
        output_dir: Root directory of the HTML cache.

    Returns:
        None
    """
    city_dir = os.path.join(output_dir, city)
    os.makedirs(city_dir, exist_ok=True)

    # rstrip("/") so the slug is found with or without a trailing slash;
    # without it a slash-less URL would map every event to the same file name.
    filename = url.rstrip("/").split("/")[-1] + ".html"
    file_path = os.path.join(city_dir, filename)

    if os.path.exists(file_path):
        print("Skipping:", city, filename)
        return

    # Perform the request
    print("Fetch Data:", city, filename)
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # raises if the request failed
    except requests.RequestException as e:
        print(f"❌ Erro ao baixar {url}: {e}")
        return

    # Write under a temporary name and rename once complete: an interrupted
    # write must not leave a truncated .html file, because the existence
    # check above would then skip it on every later run.
    tmp_path = file_path + ".part"
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.write(response.text)
    os.replace(tmp_path, file_path)


def fetch_and_save_event_pages(
    csv_path="../data/event_links.csv",
    output_dir="../data/events_data_raw",
    num_workers=3,
):
    """Download every event page listed in a links CSV using a thread pool.

    Each row's ``city`` and ``event_link`` values are passed, together with
    ``output_dir``, to ``fetch_event_page``.

    Args:
        csv_path: CSV file with ``city`` and ``event_link`` columns.
        output_dir: Directory handed to ``fetch_event_page`` for every row.
        num_workers: Maximum number of worker threads.

    Returns:
        None. A message is printed and nothing else happens when
        ``csv_path`` does not exist.
    """
    if not os.path.exists(csv_path):
        print(f"Arquivo {csv_path} não encontrado.")
        return

    links = pd.read_csv(csv_path)

    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        # Queue one download task per CSV row.
        pending = [
            pool.submit(
                fetch_event_page, record["city"], record["event_link"], output_dir
            )
            for _, record in links.iterrows()
        ]

        # Wait for all tasks; result() re-raises an exception from the worker.
        for task in pending:
            task.result()

In [None]:
# Download all event pages using the default CSV path, output directory and
# worker count.
fetch_and_save_event_pages()

In [75]:
raw_events_dir = "../data/events_data_raw"


def _first_link(tag):
    """Return the href of the first <a> inside ``tag``, or "" if there is none."""
    anchor = tag.find("a")
    return anchor.get("href", "") if anchor else ""


# One dict per successfully parsed event page; turned into a DataFrame later.
files_data = []
for city_dir in os.listdir(raw_events_dir):
    city_path = os.path.join(raw_events_dir, city_dir)
    if not os.path.isdir(city_path):
        # A stray file next to the city folders would make listdir() fail.
        continue

    for filename in os.listdir(city_path):
        if not filename.endswith(".html"):
            continue

        with open(os.path.join(city_path, filename), "r", encoding="utf-8") as f:
            # Explicit parser, the same one the scraping cells use, so the
            # result does not depend on which parsers happen to be installed.
            soup = BeautifulSoup(f, "html.parser")

        # NOTE: a list passed to class_ matches a tag carrying ANY of the
        # listed classes, not all of them.
        title = soup.find("h1", class_=["text-secondary", "h2", "text-center"])
        event_details = soup.find(
            "h2",
            class_=["card-text", "text-white", "h6", "text-center", "text-default"],
        )
        description = event_details.find_next("p") if event_details else None
        info_tags = soup.find_all("h6")

        # One page with an unexpected layout should not abort the whole run.
        if title is None or description is None or len(info_tags) < 2:
            print("Skipping malformed page:", city_dir, filename)
            continue

        # The first two <h6> elements are read as ticket info and address.
        ticket, address = info_tags[:2]

        files_data.append(
            {
                "city": city_dir,
                "event_name": title.text,
                "event_subtitle": event_details.text,
                "event_text": description.text,
                "ticket": ticket.text.strip(),
                "ticket link": _first_link(ticket),
                "address": address.text.strip(),
                "address gmaps": _first_link(address),
                "event page link": "https://www.blocosderua.com/"
                + city_dir
                + "/programacao/"
                + filename[: -len(".html")],
            }
        )


In [None]:
df = pd.DataFrame(files_data)

# Subtitles look like "09/02/2025 - Domingo - 14:00 Tijuca".
# n=2 caps the split at three parts: without it, a neighbourhood name that
# itself contains " - " produces extra columns and the three-column
# assignment below raises.
subtitle_parts = df["event_subtitle"].str.strip().str.split(" - ", n=2, expand=True)
df[["date", "date_day", "time"]] = subtitle_parts

# The third part is "<time> <neighbourhood>"; keep only the time.
df["time"] = df["time"].str.split(" ").str[0]

# NOTE(review): the next cell reads ../data/event_data.csv, but the save is
# disabled here — confirm whether it should be re-enabled.
# df.to_csv("../data/event_data.csv", index=False)
df

In [None]:
# Load the event table from ../data/event_data.csv (rebinds `df`) and display it.
df = pd.read_csv("../data/event_data.csv")
df

In [3]:
# Load the event table from ../data/event_data_with_coords.csv (rebinds `df`)
# and display it; the output shown below includes latitude/longitude columns.
df = pd.read_csv("../data/event_data_with_coords.csv")
df

Unnamed: 0,city,event_name,event_subtitle,event_text,ticket,ticket_link,address,address_gmaps,event_page_link,date,date_day,time,latitude,longitude
0,rio-de-janeiro,Banda Cultural do Jiló – Pré-Carnaval,09/02/2025 - Domingo - 14:00 Tijuca,A Banda Cultural do Jiló faz seu Carnaval pela...,Grátis,,"R. Pinto de Figueiredo, 26a",https://www.google.com/maps/dir/?api=1&destina...,https://www.blocosderua.com/rio-de-janeiro/pro...,09/02/2025,Domingo,14:00,-22.925482,-43.235090
1,rio-de-janeiro,Bloco dos Ferroviários Aposentados,28/02/2025 - Sexta - 16:00 Rocha Miranda,O Bloco está previsto para desfilar em Rocha M...,Grátis,,Rua do Parque Madureira,https://www.google.com/maps/dir/?api=1&destina...,https://www.blocosderua.com/rio-de-janeiro/pro...,28/02/2025,Sexta,16:00,-22.857081,-43.350508
2,rio-de-janeiro,Banda Haddock,22/02/2025 - Sábado - 16:00 Tijuca,A Banda Haddock anima o Carnaval com um trio e...,Grátis,,"R. Haddock Lobo, 359",https://www.google.com/maps/dir/?api=1&destina...,https://www.blocosderua.com/rio-de-janeiro/pro...,22/02/2025,Sábado,16:00,-22.921913,-43.217437
3,rio-de-janeiro,Bloco da Anitta,08/03/2025 - Sábado - 07:00 Centro,"Ao som dos hits mais quentes do funk, axé e po...",Grátis,,"R. Primeiro de Março, 1",https://www.google.com/maps/dir/?api=1&destina...,https://www.blocosderua.com/rio-de-janeiro/pro...,08/03/2025,Sábado,07:00,-22.877441,-43.503104
4,rio-de-janeiro,"Ensaio da Vila Isabel, Portela e Grande Rio",08/02/2025 - Sábado - 20:00 Santo Cristo,"As escolas de samba Vila Isabel, Portela e Gra...","A partir de R$ 90,00",https://www.sympla.com.br/evento/ensaio-tecnic...,"Rua Marquês de Sapucaí, 11",https://www.google.com/maps/dir/?api=1&destina...,https://www.blocosderua.com/rio-de-janeiro/pro...,08/02/2025,Sábado,20:00,-22.909820,-43.197165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,sao-paulo,Festa Xainirô com Wanessa Camargo,16/02/2025 - Domingo - 18:00 Liberdade,Wanessa Camargo vai celebrar 25 anos de carrei...,"A partir de R$130,00",https://www.sympla.com.br/evento/festa-xainiro...,"Praça Carlos Gomes, 82",https://www.google.com/maps/dir/?api=1&destina...,https://www.blocosderua.com/sao-paulo/programa...,16/02/2025,Domingo,18:00,-23.553380,-46.635949
1420,sao-paulo,Salgadinho Na Vila do Samba,14/02/2025 - Sexta - 21:00 Casa Verde,Salgadinho na Vila do Samba promete uma noite ...,"A partir de R$30,00",https://www.sympla.com.br/evento/salgadinho-na...,"Rua João Rudge, 340",https://www.google.com/maps/dir/?api=1&destina...,https://www.blocosderua.com/sao-paulo/programa...,14/02/2025,Sexta,21:00,-23.513296,-46.655893
1421,sao-paulo,Resenha do Edgar – Ressaca de Carnaval,08/03/2025 - Sábado - 20:00 Santa Efigênia,Prepara o coração que a Resenha do Edgar vai f...,"A partir de R$10,00",https://www.sympla.com.br/evento/resenha-do-ed...,"Avenida Mem de Sá, 1205",https://www.google.com/maps/dir/?api=1&destina...,https://www.blocosderua.com/sao-paulo/programa...,08/03/2025,Sábado,20:00,-23.464668,-46.560091
1422,sao-paulo,Luziânia,23/02/2025 - Domingo - 13:00 Jardim Nordeste,O Bloco Luziânia é o encontro da tradição com ...,Grátis,,R. Luziânia,https://www.google.com/maps/dir/?api=1&destina...,https://www.blocosderua.com/sao-paulo/programa...,23/02/2025,Domingo,13:00,-23.533101,-46.484042
