# Imports e config

In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.actions.wheel_input import ScrollOrigin
from dataclasses import dataclass, fields, asdict

import time

In [35]:
# Start a Chrome browser session driven by Selenium; the Scraping class below reads this global.
driver = webdriver.Chrome()
# Implicit wait: element lookups keep polling for up to 4 seconds before failing.
driver.implicitly_wait(4)
# Explicit-wait helper with a 10-second timeout (not used by the cells visible here).
# NOTE(review): Selenium's documentation advises against mixing implicit and explicit waits — confirm both are needed.
wait = WebDriverWait(driver, 10)

# Config site

In [36]:
# Servant-remuneration page of transparencia.sc.gov.br; the form selectors used by Scraping are looked up on this page.
URL_BASE = 'https://www.transparencia.sc.gov.br/remuneracao-servidores'
# Navigate the shared browser session to the page (side effect on the global driver).
driver.get(URL_BASE)

In [None]:
class Scraping():
    """Helper that drives the search form: choose year and month, then run the search.

    All methods operate on the module-level Selenium ``driver``; the page that
    contains ``#select-mes``, ``#select-ano`` and ``.botao-buscar-servidor``
    must already be loaded in it.
    """

    def __init__(self, ano):
        """Remember the year to scrape.

        Args:
            ano: Year whose string form is later matched against the visible
                text of an option in ``#select-ano`` (e.g. ``2025``).
        """
        # Initialised to 0; no method visible in this class updates it.
        self.quantidade_paginas = 0
        self.ano = ano

    @staticmethod
    def quantidade_mes():
        """Return how many child elements the ``#select-mes`` dropdown has.

        Declared as a static method because it reads no instance state; this
        lets it be called both as ``Scraping.quantidade_mes()`` and on an
        instance (the original signature had no ``self`` and raised
        ``TypeError`` when called on an instance).

        Returns:
            int: Number of children of the month ``<select>`` element.
        """
        mes = driver.find_element(By.CSS_SELECTOR, '#select-mes').get_property('children')
        return len(mes)

    def selecionar_mes(self, index):
        """Select a month by position and return its label.

        Args:
            index: Zero-based position of the option inside ``#select-mes``.

        Returns:
            str: Text of the child element at ``index`` after the selection.
        """
        mes = Select(driver.find_element(By.CSS_SELECTOR, '#select-mes'))
        mes.select_by_index(index)
        # The element is looked up again after the selection before reading the label.
        mes_atual = driver.find_element(By.CSS_SELECTOR, '#select-mes').get_property('children')[index].text
        return mes_atual

    def selecionar_ano(self):
        """Select, in ``#select-ano``, the option whose visible text equals ``self.ano``."""
        ano = Select(driver.find_element(By.CSS_SELECTOR, '#select-ano'))
        print(f'Ano selecionado: {self.ano}')
        ano.select_by_visible_text(f'{self.ano}')

    def botao_buscar(self):
        """Scroll the search button into view and click it."""
        lupa = driver.find_element(By.CSS_SELECTOR, '.botao-buscar-servidor')
        # Scroll first so the button is inside the viewport when clicked.
        ActionChains(driver).scroll_to_element(lupa).perform()
        lupa.click()
    
    
    
    


In [19]:
# Manual check: build a helper for 2025 and select the month option at index 1.
scrapy = Scraping(2025)
# As the last expression of the cell, the returned month label is displayed as the cell output.
scrapy.selecionar_mes(1)

'Fevereiro'

In [33]:
# Second helper instance for 2025, used to try the search button.
teste = Scraping(2025)

In [34]:
# Scroll to the search button and click it in the live browser session.
teste.botao_buscar()

# API SITE

O site contém uma API para os dados; podemos coletá-los diretamente por meio dela.

In [41]:
import requests
import pandas as pd
import time
import random
import os
import json
from datetime import datetime
from fake_useragent import UserAgent

# Path configuration
BASE_DIR = os.path.join("data", "base")  # folder that receives the monthly CSV files and the log
LOG_FILE = os.path.join(BASE_DIR, "scraping_log_2025.txt")  # checkpoint log, appended to by run_scraper
BASE_URL = "https://api-portal-transparencia.apps.sm.okd4.ciasc.sc.gov.br/api/remuneracao-servidores/analise-detalhada"  # endpoint queried page by page
PAGES_PER_CHECKPOINT = 50  # number of pages buffered in memory between CSV flushes

# Make sure the output folder exists
os.makedirs(BASE_DIR, exist_ok=True)

def get_status_progresso():
    """Return the (month, page) of the newest checkpoint recorded in ``LOG_FILE``.

    The log is scanned from its last line backwards and the first line that
    matches ``FIM: Mes <mm> | Pagina <n>`` wins. A stray or truncated trailing
    line therefore no longer resets the progress to the very beginning, which
    would make the next run append already-saved pages to the CSV again.

    Returns:
        tuple[str, int]: Month as written in the log (e.g. ``"03"``) and the
        last page saved for it. ``("01", 0)`` when the log file is missing,
        empty, or contains no valid checkpoint line.
    """
    import re

    if not os.path.exists(LOG_FILE):
        return "01", 0

    padrao = re.compile(r"FIM:\s*Mes\s*(\d+)\s*\|\s*Pagina\s*(\d+)")
    with open(LOG_FILE, "r") as f:
        lines = f.readlines()

    # Newest entries are at the end of the append-only log.
    for line in reversed(lines):
        achado = padrao.search(line)
        if achado:
            return achado.group(1), int(achado.group(2))
    return "01", 0

def save_to_csv(data_list, mes):
    """Append a batch of records to the CSV file of the given month.

    Args:
        data_list: List of dict records; an empty list is ignored.
        mes: Month identifier used in the file name (e.g. ``"01"``).

    The header row is written only when the target file does not exist yet or
    is empty. Empty batches are skipped: writing an empty frame would create a
    file without column names, after which every later append would omit the
    header as well.
    """
    if not data_list:
        return

    filename = os.path.join(BASE_DIR, f"transparencia_sc_2025_{mes}.csv")
    df = pd.DataFrame(data_list)
    # A zero-byte file still needs its header.
    is_new = not os.path.exists(filename) or os.path.getsize(filename) == 0
    df.to_csv(filename, mode='a', index=False, header=is_new, encoding='utf-8-sig')

def run_scraper():
    """Download every remaining page of 2025, month by month, with resumable checkpoints.

    Progress is read with ``get_status_progresso``; scraping resumes on the page
    after the last logged one. Each page is requested from ``BASE_URL``, its
    records are tagged with ``ref_mes``/``ref_ano`` and buffered. The buffer is
    flushed through ``save_to_csv`` every ``PAGES_PER_CHECKPOINT`` pages and on
    the last page of a month, and a ``FIM: Mes <mm> | Pagina <n>`` line is then
    appended to ``LOG_FILE``.

    On exit (normal, error or Ctrl+C) any buffered rows are flushed AND a
    checkpoint line for the last buffered page is written. Previously only the
    rows were written, so the next run re-downloaded those pages and appended
    them to the CSV a second time.

    A failed request is retried after a 30-second pause, with no retry limit.
    """
    ua = UserAgent()
    session = requests.Session()

    ultimo_mes_log, ultima_pag_log = get_status_progresso()
    meses_2025 = [str(m).zfill(2) for m in range(1, 13)]
    meses_restantes = meses_2025[meses_2025.index(ultimo_mes_log):]

    print(f"{'='*70}")
    print(f"üöÄ SCRAPER EM EXECU√á√ÉO - DESTINO: /data/base/")
    print(f"üìç Status: Iniciando em M√™s {ultimo_mes_log} | P√°gina {ultima_pag_log}")
    print(f"{'='*70}")

    # Bound before the try block so the finally clause never sees an unbound name.
    mes = ultimo_mes_log
    temp_storage = []
    last_buffered_page = None  # last page whose rows are sitting in temp_storage

    try:
        for mes in meses_restantes:
            # Only the month being resumed starts past page 1.
            current_page = (ultima_pag_log + 1) if mes == ultimo_mes_log else 1
            last_page_limit = None
            temp_storage = []
            last_buffered_page = None

            while True:
                headers = {
                    "User-Agent": ua.random,
                    "Referer": "https://www.transparencia.sc.gov.br/",
                    "Accept": "application/json"
                }

                params = {
                    "ano": 2025,
                    "mes": mes,
                    "page": current_page,
                    "filtro": json.dumps({"TIPOCONSULTA": "SERVIDORES", "nome": ""}),
                    "sort": json.dumps({"field": "ServidorNomeSort", "dir": "asc"})
                }

                try:
                    t_start = time.time()
                    response = session.get(BASE_URL, params=params, headers=headers, timeout=30)
                    response.raise_for_status()
                    json_data = response.json()

                    # The page count is taken from the first successful response of the month.
                    if last_page_limit is None:
                        last_page_limit = json_data.get('last_page', 1)
                        print(f"\n[INFO] M√™s {mes}/2025: {last_page_limit} p√°ginas encontradas.")

                    batch = json_data.get('data', [])

                    # Tag every record with its reference month and year.
                    for item in batch:
                        item['ref_mes'] = mes
                        item['ref_ano'] = 2025

                    temp_storage.extend(batch)
                    last_buffered_page = current_page

                    # Progress line, rewritten in place on the terminal.
                    print(f"üì° [M√™s {mes}] P√°g: {current_page}/{last_page_limit} | "
                          f"Itens: {len(batch)} | Time: {time.time() - t_start:.2f}s", end='\r')

                    # Flush every PAGES_PER_CHECKPOINT pages or on the last page of the month.
                    if current_page % PAGES_PER_CHECKPOINT == 0 or current_page == last_page_limit:
                        save_to_csv(temp_storage, mes)

                        with open(LOG_FILE, "a") as f:
                            f.write(f"FIM: Mes {mes} | Pagina {current_page}\n")

                        print(f"\nüíæ CHECKPOINT: M√™s {mes}, P√°g {current_page} salva em /data/base/")
                        temp_storage = []
                        last_buffered_page = None

                    if current_page >= last_page_limit:
                        print(f"\n‚úÖ M√™s {mes} finalizado!")
                        break

                    current_page += 1
                    time.sleep(random.uniform(0, 0.5))

                except Exception as e:
                    print(f"\nüö® Erro na P√°g {current_page}: {str(e)[:100]}")
                    time.sleep(30)
                    continue

            # Following months always start from page 1.
            ultima_pag_log = 0

    except KeyboardInterrupt:
        print(f"\n\nüõë Interrompido pelo usu√°rio.")
    finally:
        if temp_storage:
            save_to_csv(temp_storage, mes)
            # Record how far the flushed rows go, so a later run resumes after
            # them instead of appending the same pages again.
            if last_buffered_page is not None:
                with open(LOG_FILE, "a") as f:
                    f.write(f"FIM: Mes {mes} | Pagina {last_buffered_page}\n")
        print(f"\nüèÅ Fim da execu√ß√£o. Logs em: {LOG_FILE}")

# Entry point: run the scraper when this code is the main module.
# In a notebook kernel __name__ is "__main__" as well, so executing the cell starts scraping immediately.
if __name__ == "__main__":
    run_scraper()

üöÄ SCRAPER EM EXECU√á√ÉO - DESTINO: /data/base/
üìç Status: Iniciando em M√™s 01 | P√°gina 450

[INFO] M√™s 01/2025: 7158 p√°ginas encontradas.
üì° [M√™s 01] P√°g: 500/7158 | Itens: 20 | Time: 0.30s
üíæ CHECKPOINT: M√™s 01, P√°g 500 salva em /data/base/
üì° [M√™s 01] P√°g: 550/7158 | Itens: 20 | Time: 0.43s
üíæ CHECKPOINT: M√™s 01, P√°g 550 salva em /data/base/
üì° [M√™s 01] P√°g: 600/7158 | Itens: 20 | Time: 0.32s
üíæ CHECKPOINT: M√™s 01, P√°g 600 salva em /data/base/
üì° [M√™s 01] P√°g: 650/7158 | Itens: 20 | Time: 0.29s
üíæ CHECKPOINT: M√™s 01, P√°g 650 salva em /data/base/
üì° [M√™s 01] P√°g: 700/7158 | Itens: 20 | Time: 0.28s
üíæ CHECKPOINT: M√™s 01, P√°g 700 salva em /data/base/
üì° [M√™s 01] P√°g: 750/7158 | Itens: 20 | Time: 0.39s
üíæ CHECKPOINT: M√™s 01, P√°g 750 salva em /data/base/
üì° [M√™s 01] P√°g: 791/7158 | Itens: 20 | Time: 0.24s

üõë Interrompido pelo usu√°rio.

üèÅ Fim da execu√ß√£o. Logs em: data\base\scraping_log_2025.txt


In [None]:
import requests
import pandas as pd
import time
import random
import os
import json
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
from fake_useragent import UserAgent

# Path configuration
# NOTE: this rebinds the BASE_DIR and BASE_URL globals set by the earlier scraper cell; functions
# from that cell that read BASE_DIR at call time (e.g. save_to_csv) write to this folder afterwards.
BASE_DIR = os.path.join("data", "details")
os.makedirs(BASE_DIR, exist_ok=True)  # create the output folder if it is missing
BASE_URL = "https://api-portal-transparencia.apps.sm.okd4.ciasc.sc.gov.br/api/remuneracao-servidores/analise-detalhada"

def get_last_page_from_log(mes):
    """Return the last page checkpointed for ``mes``, or 0 when nothing is recorded.

    The value is read from ``log_mes_<mes>.txt`` inside ``BASE_DIR``. A missing
    file or a file holding only whitespace yields 0.
    """
    checkpoint_file = os.path.join(BASE_DIR, f"log_mes_{mes}.txt")

    # No checkpoint file yet: nothing was saved for this month.
    if not os.path.exists(checkpoint_file):
        return 0

    with open(checkpoint_file, "r") as handle:
        conteudo = handle.read().strip()

    return int(conteudo) if conteudo else 0

def salvar_checkpoint_mes(mes, pagina):
    """Overwrite the checkpoint file of ``mes`` with the last processed page number.

    The file is ``log_mes_<mes>.txt`` inside ``BASE_DIR`` and holds nothing but
    the string form of ``pagina``.
    """
    checkpoint_file = os.path.join(BASE_DIR, f"log_mes_{mes}.txt")
    with open(checkpoint_file, "w") as handle:
        # print() stringifies the value; end="" keeps the file free of a trailing newline.
        print(pagina, end="", file=handle)

def coletar_mes(mes):
    """Download all remaining pages of one month of 2025 and append them to its CSV.

    Called once per month by run_multithreaded, each call on a pool thread.
    Resumes from the page after the one stored by salvar_checkpoint_mes,
    buffers records in memory and flushes them to
    ``transparencia_sc_2025_<mes>.csv`` in BASE_DIR every 50 pages and on the
    last page, updating the month's checkpoint file after each flush.

    Args:
        mes: Two-digit month string (e.g. "01") sent to the API and used in file names.

    Returns:
        None. Rows still buffered when the function stops early are not written.
    """
    ua = UserAgent()
    session = requests.Session()
    
    ultima_pag = get_last_page_from_log(mes)
    current_page = ultima_pag + 1
    last_page_limit = None  # filled in from the first successful response
    temp_storage = []  # records fetched since the last flush
    
    # Initial status line
    status = "RETOMANDO" if ultima_pag > 0 else "INICIANDO"
    print(f"üì° [{status}] M√™s {mes}/2025 | Come√ßando da P√°g: {current_page}")

    while True:
        # A fresh random User-Agent is picked for every request.
        headers = {
            "User-Agent": ua.random,
            "Referer": "https://www.transparencia.sc.gov.br/",
            "Accept": "application/json"
        }
        
        params = {
            "ano": 2025,
            "mes": mes,
            "page": current_page,
            "filtro": json.dumps({"TIPOCONSULTA": "SERVIDORES", "nome": ""}),
            "sort": json.dumps({"field": "ServidorNomeSort", "dir": "asc"})
        }

        try:
            t_start = time.time()
            response = session.get(BASE_URL, params=params, headers=headers, timeout=30)
            response.raise_for_status()
            json_data = response.json()
            t_end = time.time()
            
            if last_page_limit is None:
                last_page_limit = json_data.get('last_page', 1)
                # Stop right away if the checkpoint is already past the last page
                if current_page > last_page_limit:
                    print(f"‚úÖ [M√™s {mes}] J√° estava 100% conclu√≠do anteriormente.")
                    break

            batch = json_data.get('data', [])
            
            # Tag every record with its reference month and year
            for item in batch:
                item['ref_mes'] = mes
                item['ref_ano'] = 2025
            
            temp_storage.extend(batch)

            # Per-page progress line
            prog = (current_page / last_page_limit) * 100
            print(f"‚ñ∂Ô∏è [M√™s {mes}] P√°g: {current_page}/{last_page_limit} ({prog:.1f}%) | Lat√™ncia: {t_end - t_start:.2f}s")

            # Checkpoint every 50 pages and on the last page
            if current_page % 50 == 0 or current_page == last_page_limit:
                filename = os.path.join(BASE_DIR, f"transparencia_sc_2025_{mes}.csv")
                df = pd.DataFrame(temp_storage)
                # Header is written only when the file does not exist yet.
                df.to_csv(filename, mode='a', index=False, header=not os.path.exists(filename), encoding='utf-8-sig')
                
                salvar_checkpoint_mes(mes, current_page)
                
                print(f"üíæ [M√™s {mes}] CHECKPOINT: {len(temp_storage)} itens salvos. (Hora: {datetime.now().strftime('%H:%M:%S')})")
                temp_storage = [] 

            if current_page >= last_page_limit:
                print(f"üèÜ [M√™s {mes}] FINALIZADO COM SUCESSO!")
                break
            
            current_page += 1
            # Random 1-2.5 second pause between requests
            time.sleep(random.uniform(1, 2.5)) 

        except Exception as e:
            # NOTE(review): any exception retries the same page forever, including one raised after
            # the batch was already appended to temp_storage — confirm this cannot duplicate rows.
            print(f"üö® [M√™s {mes}] ERRO na P√°g {current_page}: {str(e)[:100]}")
            time.sleep(20) # Pause to give the server a break
            continue

def run_multithreaded():
    """Run ``coletar_mes`` for the twelve months of the year on a pool of 4 threads.

    Each month is submitted as its own task and the function blocks until all
    of them have finished. An exception that escapes a worker is reported on
    stdout with the month it belongs to. Previously the iterator returned by
    ``executor.map`` was never consumed, so such failures vanished silently.
    """
    from concurrent.futures import as_completed

    meses = [str(m).zfill(2) for m in range(1, 13)]

    print(f"{'='*70}")
    print(f"üî• INICIANDO EXTRA√á√ÉO PARALELA (4 THREADS) - AGRESSIVE MODE")
    print(f"üìÅ Destino: {BASE_DIR}")
    print(f"{'='*70}")

    # Four workers: chosen by the original author as a speed/stability trade-off.
    with ThreadPoolExecutor(max_workers=4) as executor:
        futuros = {executor.submit(coletar_mes, mes): mes for mes in meses}
        for futuro in as_completed(futuros):
            # exception() returns None on success; failures are reported, not re-raised,
            # so one broken month does not hide the outcome of the others.
            erro = futuro.exception()
            if erro is not None:
                print(f"[Mes {futuros[futuro]}] ERRO fatal: {erro!r}")

# Entry point for the parallel extraction; in a notebook kernel __name__ is "__main__", so the cell runs it directly.
if __name__ == "__main__":
    start_total = time.time()  # wall-clock start, used for the total-time report below
    try:
        run_multithreaded()
    except KeyboardInterrupt:
        print(f"\nüõë Interrup√ß√£o manual detectada pelo usu√°rio.")
    finally:
        # Runs on success and on interruption alike, so the summary is always printed.
        print(f"\n{'='*70}")
        print(f"‚ú® TUDO PRONTO! Tempo Total: {time.time() - start_total:.2f}s")
        print(f"{'='*70}")

üî• INICIANDO EXTRA√á√ÉO PARALELA (4 THREADS) - AGRESSIVE MODE
üìÅ Destino: data\details
üì° [RETOMANDO] M√™s 04/2025 | Come√ßando da P√°g: 151
üì° [RETOMANDO] M√™s 03/2025 | Come√ßando da P√°g: 151
üì° [RETOMANDO] M√™s 02/2025 | Come√ßando da P√°g: 151
üì° [RETOMANDO] M√™s 01/2025 | Come√ßando da P√°g: 151
‚ñ∂Ô∏è [M√™s 02] P√°g: 151/7994 (1.9%) | Lat√™ncia: 0.48s
‚ñ∂Ô∏è [M√™s 01] P√°g: 151/7158 (2.1%) | Lat√™ncia: 0.48s
‚ñ∂Ô∏è [M√™s 04] P√°g: 151/9153 (1.6%) | Lat√™ncia: 0.49s
‚ñ∂Ô∏è [M√™s 03] P√°g: 151/8886 (1.7%) | Lat√™ncia: 0.49s
‚ñ∂Ô∏è [M√™s 02] P√°g: 152/7994 (1.9%) | Lat√™ncia: 0.20s
‚ñ∂Ô∏è [M√™s 01] P√°g: 152/7158 (2.1%) | Lat√™ncia: 0.17s
‚ñ∂Ô∏è [M√™s 03] P√°g: 152/8886 (1.7%) | Lat√™ncia: 0.29s
‚ñ∂Ô∏è [M√™s 04] P√°g: 152/9153 (1.7%) | Lat√™ncia: 0.19s
‚ñ∂Ô∏è [M√™s 01] P√°g: 153/7158 (2.1%) | Lat√™ncia: 0.15s
‚ñ∂Ô∏è [M√™s 02] P√°g: 153/7994 (1.9%) | Lat√™ncia: 0.17s
‚ñ∂Ô∏è [M√™s 04] P√°g: 153/9153 (1.7%) | Lat√™ncia: 0.18s
‚ñ∂Ô∏è [M√™s 03] P√°g: 153/8886 (1.7%) | Lat√™n