# Instalando Bibliotecas

In [None]:
!pip install tqdm -q

# Importando Dependências

In [None]:
import os
import requests
from bs4 import BeautifulSoup
import re
import time
from google.colab import drive
from tqdm.notebook import tqdm

# Carregando Google Drive

In [None]:
# Mount Google Drive at /content/drive (a forced remount is requested) so that
# downloaded files can be written under the Drive folder.
print("Montando o Google Drive...")
try:
    drive.mount('/content/drive', force_remount=True)
    print("Google Drive montado com sucesso!")
except Exception as e:
    print(f"Erro ao montar o Google Drive: {e}")
    # If mounting fails, the script cannot continue.
    exit()


# Ferramenta de Download de Artigos da Biblioteca SOL

Definição do Caminho Onde Serão Salvos os Artigos

In [None]:
# Root directory, inside the Drive mounted at /content/drive, under which one
# sub-folder per event acronym is created for the downloaded articles.
BASE_PATH = "/content/drive/MyDrive/"

Módulo Principal

In [None]:
def sanitize_filename(filename):
    """Turn an arbitrary title into a string usable as a file name.

    Surrounding whitespace is stripped, every character that is not a word
    character, whitespace, hyphen or dot is dropped, and each run of hyphens
    and/or whitespace is then collapsed into a single underscore.
    """
    trimmed = filename.strip()
    # First pass: keep only word characters, whitespace, '-' and '.'.
    allowed_only = re.sub(r'[^\w\s\-\.]', '', trimmed)
    # Second pass: separators (hyphens / whitespace runs) become one '_'.
    return re.sub(r'[-\s]+', '_', allowed_only)

def extract_event_acronym(url):
    """Return the upper-cased event acronym found in an event URL.

    The acronym is the path segment that immediately follows ``index.php/``.
    It may contain word characters and hyphens and must be terminated by
    ``/``, ``?``, ``#`` or the end of the string, so URLs carrying a query
    string or fragment are handled as well.

    Args:
        url: Address of the event page, e.g. ``.../index.php/kdmile``.

    Returns:
        The acronym in upper case, or ``"EVENTO_DESCONHECIDO"`` when the URL
        does not contain a recognisable ``index.php/<acronym>`` part.
    """
    # The dot is escaped so only a literal "index.php" matches, not e.g.
    # "indexXphp".
    match = re.search(r'index\.php/([\w-]+)(?:[/?#]|$)', url)
    return match.group(1).upper() if match else "EVENTO_DESCONHECIDO"

def create_output_folder(event_acronym):
    """Ensure the event's destination directory exists and return its path.

    The directory is ``BASE_PATH`` joined with ``event_acronym``; it is
    created (including missing parents) when absent and left untouched when
    it already exists.
    """
    destination = os.path.join(BASE_PATH, event_acronym)
    os.makedirs(destination, exist_ok=True)
    return destination

def fetch_article_links(url, timeout=30):
    """Fetch the event page and return its article summary elements.

    Args:
        url: Address of the event's main page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The result of ``find_all`` for every ``<div>`` with class
        ``obj_article_summary`` in the page (empty when there are none).

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts
            or a non-success HTTP status.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.find_all('div', class_='obj_article_summary')

def download_article(article_div, output_folder, index, timeout=60):
    """Download the PDF of a single article summary into ``output_folder``.

    The article is skipped silently when its summary has no
    ``a.obj_galley_link.pdf`` link with an ``href``, or when that link does
    not contain ``/view/`` (the download address is derived by replacing
    ``/view/`` with ``/download/``).

    Args:
        article_div: Parsed element of one article summary.
        output_folder: Directory in which the PDF is written.
        index: Position of the article in the listing; used to build a
            fallback file name when no usable title is available.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the whole batch.

    Returns:
        None. Download or write failures are reported with ``print`` and do
        not interrupt the caller's loop.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    fallback_name = f"artigo_sem_titulo_{index}"
    title_tag = article_div.find('div', class_='title')
    title = title_tag.get_text(strip=True) if title_tag else fallback_name
    link_tag = article_div.select_one('a.obj_galley_link.pdf')

    if not (link_tag and link_tag.has_attr('href')):
        return

    reader_url = link_tag['href']
    if '/view/' not in reader_url:
        return

    download_url = reader_url.replace('/view/', '/download/')
    # A title that is empty after sanitisation would otherwise produce the
    # file ".pdf", shared by every such article; fall back to the indexed name.
    filename = (sanitize_filename(title) or fallback_name) + ".pdf"
    filepath = os.path.join(output_folder, filename)

    try:
        response = requests.get(download_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        with open(filepath, 'wb') as f:
            f.write(response.content)
    except (requests.exceptions.RequestException, OSError) as error:
        # Best effort: report the failure instead of hiding it, and let the
        # remaining articles be processed.
        print(f"Falha ao baixar '{title}': {error}")
        return

    # Short pause after each successful download to avoid hammering the server.
    time.sleep(0.5)

def download_sbc_articles(url):
    """Download every article listed on an SBC event page.

    Derives the event acronym from ``url``, prepares the matching output
    folder, fetches the article summaries and downloads them one by one
    behind a progress bar. Progress and errors are reported with ``print``;
    a request error while accessing the page is caught and reported.
    """
    acronym = extract_event_acronym(url)
    print(f"Evento: {acronym}")

    destination = create_output_folder(acronym)
    print(f"Pasta destino: {destination}")

    try:
        summaries = fetch_article_links(url)
        if summaries:
            print(f"Total de artigos encontrados: {len(summaries)}\nIniciando download...")
            progress = tqdm(summaries, desc="Baixando artigos")
            for position, summary in enumerate(progress):
                download_article(summary, destination, position)

            print("\nDownload concluído.")
        else:
            # Nothing to download for this page.
            print("Nenhum artigo encontrado.")
            return
    except requests.exceptions.RequestException as error:
        print(f"Erro ao acessar a página: {error}")


Iniciar Execução

In [None]:

# Entry point: runs the full download flow for the configured event page.
if __name__ == "__main__":
    # Replace with the URL of the desired event
    proceedings_url = "https://sol.sbc.org.br/index.php/kdmile"
    download_sbc_articles(proceedings_url)
