In [1]:
# b3_iframe_full_extractor.py
import requests
import pandas as pd
import re
import time
import csv
from io import StringIO
from bs4 import BeautifulSoup

# Selenium imports
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    StaleElementReferenceException,
    ElementClickInterceptedException,
    TimeoutException,
)

In [2]:
# Whole-string pattern for a bare ticker code: 2-6 letters followed by 1-2
# digits. re.I makes the letter class case-insensitive.
TICKER_REGEX = re.compile(r'^[A-Z]{2,6}\d{1,2}$', re.I)
# Lowercase words (English and Portuguese) that identify a table header or
# column name as the one holding ticker codes; compared against lowercased text.
HEADER_KEYWORDS = {"code", "stock", "ativo", "código", "codigo", "symbol", "ticker"}

In [3]:
def normalize_token(tok: str):
    """
    Convert a raw cell value into a ticker with the '.SA' suffix.

    Only the first whitespace-separated token of `tok` is considered. It is
    upper-cased, stripped of every character outside A-Z / 0-9, and accepted
    only if the result matches TICKER_REGEX.

    Args:
        tok: Raw text taken from a table cell or CSV field.

    Returns:
        The normalized ticker followed by '.SA', or None when `tok` is not a
        string, is empty/whitespace-only, or does not look like a ticker.
    """
    if not isinstance(tok, str):
        return None
    # str.split() with no arguments already ignores leading/trailing whitespace.
    parts = tok.upper().split()
    if not parts:
        # Empty or whitespace-only input has no first token; indexing [0]
        # here would raise IndexError.
        return None
    s = re.sub(r'[^A-Z0-9]', '', parts[0])
    if TICKER_REGEX.match(s):
        return s + '.SA'
    return None

In [4]:

def try_download_csv_from_iframe(iframe_url, headers=None, timeout=10):
    """
    Try to obtain the ticker list from the iframe URL using plain HTTP requests.

    The response body is first checked for being a CSV whose header line starts
    with "Code," (case-insensitive). Otherwise the body is parsed as HTML and
    every <a href> that ends in ".csv" or contains "composition" is downloaded
    and parsed as CSV, stopping at the first link that parses.

    Args:
        iframe_url: URL of the iframe page (or CSV endpoint) to fetch.
        headers: Optional request headers; a generic browser User-Agent is used
            when omitted.
        timeout: Timeout in seconds passed to every HTTP request.

    Returns:
        An order-preserving, de-duplicated list of normalized tickers from the
        first source that parses (possibly empty), or None when the initial
        request fails or no usable CSV is found.
    """
    # Local import so this definition does not depend on the imports cell changing.
    from urllib.parse import urljoin

    headers = headers or {"User-Agent": "Mozilla/5.0"}
    try:
        r = requests.get(iframe_url, headers=headers, timeout=timeout)
        r.raise_for_status()
    except Exception as e:
        print("requests ao iframe falhou:", e)
        return None

    def tickers_from(df, col):
        # Normalize each value of the column, drop rejects, keep first-seen order.
        found = (normalize_token(x) for x in df[col].astype(str))
        return list(dict.fromkeys(t for t in found if t))

    text = r.text
    # A UTF-8 BOM is not whitespace, so strip it explicitly together with
    # leading blanks before looking at the header line.
    body = text.lstrip("\ufeff \t\r\n\f\v")
    # An empty body has no first line; guard instead of indexing [0].
    first_line = body.splitlines()[0] if body else ""
    if first_line.lower().startswith("code,"):
        print("Encontrado CSV diretamente no iframe (conteúdo).")
        df = pd.read_csv(StringIO(body))
        col = next((c for c in df.columns if str(c).strip().lower() == "code"), None)
        if col is not None:
            return tickers_from(df, col)

    # Header names (lowercased, trimmed) accepted as the ticker-code column.
    code_headers = {"code", "codigo", "código", "codigo ativo", "código ativo"}

    # Otherwise treat the body as HTML and follow links that look like CSV downloads.
    soup = BeautifulSoup(text, "lxml")
    for a in soup.find_all("a", href=True):
        href_low = a["href"].lower()
        if not (href_low.endswith(".csv") or "composition" in href_low):
            continue
        # Resolve protocol-relative, root-relative and document-relative links
        # against the iframe URL; absolute URLs pass through unchanged.
        href = urljoin(iframe_url, a["href"])
        try:
            rr = requests.get(href, headers=headers, timeout=timeout)
            rr.raise_for_status()
            txt = rr.text.lstrip("\ufeff")
            try:
                df = pd.read_csv(StringIO(txt))
            except Exception:
                # Default comma parsing failed; retry letting pandas sniff the delimiter.
                df = pd.read_csv(StringIO(txt), sep=None, engine="python")
            # Prefer a recognised header; otherwise fall back to the first column.
            col = next(
                (c for c in df.columns if str(c).strip().lower() in code_headers),
                df.columns[0],
            )
            tickers = tickers_from(df, col)
            print(f"Download CSV via link bem-sucedido: {href}")
            return tickers
        except Exception as e:
            # Best effort: report the failure and keep trying the remaining links.
            print("Falha ao baixar/parsar link csv:", href, e)
            continue
    return None

In [5]:
# ---------------- Selenium robust pagination fallback ----------------
def selenium_extract_all_from_iframe(iframe_url, headless=True, wait_seconds=10, debug=False):
    """
    Open `iframe_url` in Chrome via Selenium and collect tickers across all pages.

    Steps performed:
      1. Start Chrome (driver obtained through ChromeDriverManager), load the URL
         and wait up to `wait_seconds` for a <table> element.
      2. Scroll to the bottom, then in the first <select> that has options with
         a number in their text, choose the option with the largest number.
      3. Extract tickers from the current page source, then either click each
         numbered pagination link found or, if none were found, click a
         "next"-style link repeatedly until a click adds no new ticker.
      4. If anything was collected, write it to "tickers_ibrx100_full.csv" in
         the current working directory (one "Ticker" column).

    Args:
        iframe_url: Page to load in the browser.
        headless: When true, Chrome is started with "--headless=new".
        wait_seconds: Timeout for each explicit wait on a <table> element.
        debug: When true, progress messages prefixed "[debug]" are printed.

    Returns:
        List of unique tickers in first-seen order, or None when nothing was
        collected. The driver is always quit before returning or raising.
    """
    options = Options()
    if headless:
        options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1920,1200")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(iframe_url)
        # wait for the table (or the content); on timeout just pause briefly and go on
        try:
            WebDriverWait(driver, wait_seconds).until(EC.presence_of_element_located((By.TAG_NAME, "table")))
        except TimeoutException:
            time.sleep(1.5)

        # Scroll to the bottom to expose selects/pagination
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.8)

        # 1) try to select the largest page size using the option's text (not its value)
        selects = driver.find_elements(By.TAG_NAME, "select")
        if debug: print(f"[debug] selects encontrados: {len(selects)}")
        # NOTE(review): made_change is assigned below but never read in this function.
        made_change = False
        for sel in selects:
            try:
                opts = sel.find_elements(By.TAG_NAME, "option")
                numeric = []
                for o in opts:
                    txt = (o.text or "").strip()
                    # extract the first number from the text (e.g. "20", "20 per page", "Show 100")
                    m = re.search(r'(\d+)', txt)
                    if m:
                        n = int(m.group(1))
                        numeric.append((n, o))
                if numeric:
                    numeric_sorted = sorted(numeric, key=lambda x: x[0], reverse=True)
                    max_n, option_elem = numeric_sorted[0]
                    # try a normal click first; on any failure mark the option selected via JS
                    # and dispatch a 'change' event on it
                    try:
                        option_elem.click()
                    except Exception:
                        driver.execute_script("arguments[0].selected = true; arguments[0].dispatchEvent(new Event('change'))", option_elem)
                    if debug: print(f"[debug] ajustei page-size para {max_n}")
                    made_change = True
                    time.sleep(1.2)
                    # only the first select with numeric options is adjusted
                    break
            except StaleElementReferenceException:
                continue

        # helper that extracts tickers from the table(s) in the current page source
        def extract_current():
            html = driver.page_source
            soup = BeautifulSoup(html, "lxml")
            tables = soup.find_all("table")
            for table in tables:
                # identify the table by its header cells (or its first row when there are no <th>)
                ths = [th.get_text(" ", strip=True).lower() for th in table.find_all("th")]
                if not ths:
                    first_tr = table.find("tr")
                    if first_tr:
                        ths = [td.get_text(" ", strip=True).lower() for td in first_tr.find_all(["td","th"])]
                joined = " ".join(ths)
                if any(k in joined for k in HEADER_KEYWORDS):
                    # parse this table with pandas
                    try:
                        df = pd.read_html(StringIO(str(table)))[0]
                        # pick the first column whose name contains a header keyword
                        code_col = None
                        for c in df.columns:
                            if any(k in str(c).lower() for k in HEADER_KEYWORDS):
                                code_col = c; break
                        if code_col is None:
                            # heuristic: column with the most values accepted by normalize_token
                            best_col = None; best_count = 0
                            for c in df.columns:
                                # NOTE(review): fillna after astype(str) presumably has nothing left to fill — confirm
                                vals = df[c].astype(str).fillna('').tolist()
                                cnt = sum(1 for v in vals if normalize_token(v))
                                if cnt > best_count:
                                    best_count = cnt; best_col = c
                            code_col = best_col
                        if code_col is None:
                            continue
                        vals = [normalize_token(x) for x in df[code_col].astype(str)]
                        return [v for v in vals if v]
                    except Exception:
                        # parsing problems with this table: move on to the next one
                        continue
            # no table matched by header: apply the same column heuristic to the first table only
            if tables:
                try:
                    df = pd.read_html(StringIO(str(tables[0])))[0]
                    best_col = None; best_count = 0
                    for c in df.columns:
                        vals = df[c].astype(str).fillna('').tolist()
                        cnt = sum(1 for v in vals if normalize_token(v))
                        if cnt > best_count:
                            best_count = cnt; best_col = c
                    if best_col is not None:
                        vals = [normalize_token(x) for x in df[best_col].astype(str)]
                        return [v for v in vals if v]
                except Exception:
                    pass
            return []

        # collected keeps first-seen order; seen gives fast membership checks
        collected = []
        seen = set()

        # extract the initial page
        page0 = extract_current()
        if debug: print(f"[debug] page0 count={len(page0)}")
        for t in page0:
            if t not in seen:
                seen.add(t); collected.append(t)

        # 2) try to locate numbered pagination
        # first look inside list items of elements with a "pagination" class
        page_numbers = set()
        try:
            els = driver.find_elements(By.CSS_SELECTOR, "ul.pagination li, div.pagination li, nav.pagination li")
            for li in els:
                try:
                    a = li.find_element(By.TAG_NAME, "a")
                    txt = (a.text or "").strip()
                    if txt.isdigit():
                        page_numbers.add(int(txt))
                except Exception:
                    continue
        except Exception:
            pass

        # then treat every <a> on the page whose text is all digits as a page number
        try:
            anchors = driver.find_elements(By.TAG_NAME, "a")
            for a in anchors:
                try:
                    txt = (a.text or "").strip()
                    if txt.isdigit():
                        page_numbers.add(int(txt))
                except Exception:
                    continue
        except Exception:
            pass

        page_numbers = sorted(page_numbers)
        if debug: print(f"[debug] detected page numbers: {page_numbers}")

        # if page numbers were found, visit them in ascending order
        if page_numbers:
            for p in page_numbers:
                if p == 1:
                    # page 1 is assumed to be the page already extracted above
                    continue
                # re-query anchors for every page, since clicking may re-render them
                anchors = driver.find_elements(By.TAG_NAME, "a")
                clicked = False
                for a in anchors:
                    try:
                        txt = (a.text or "").strip()
                        if txt == str(p):
                            try:
                                a.click()
                            except (ElementClickInterceptedException, StaleElementReferenceException):
                                driver.execute_script("arguments[0].click();", a)
                            # wait for a table to be present again
                            try:
                                WebDriverWait(driver, wait_seconds).until(EC.presence_of_element_located((By.TAG_NAME, "table")))
                            except Exception:
                                time.sleep(0.8)
                            time.sleep(0.5)
                            new = extract_current()
                            if debug: print(f"[debug] page {p} got {len(new)}")
                            for t in new:
                                if t not in seen:
                                    seen.add(t); collected.append(t)
                            clicked = True
                            break
                    except StaleElementReferenceException:
                        continue
                if not clicked and debug:
                    print(f"[debug] could not click page {p}")

        else:
            # no numbered buttons: click a "next"-style link repeatedly
            if debug: print("[debug] no numeric pages; trying next loop")
            while True:
                found_next = None
                anchors = driver.find_elements(By.TAG_NAME, "a")
                for a in anchors:
                    try:
                        txt = (a.text or "").strip().lower()
                        if txt in (">", "»", "next", "próximo", "proximo"):
                            found_next = a
                            break
                    except Exception:
                        continue
                if not found_next:
                    break
                # NOTE(review): the JS fallback below is not guarded; an exception there
                # leaves the function through the finally block without returning `collected`.
                try:
                    found_next.click()
                except (ElementClickInterceptedException, StaleElementReferenceException):
                    driver.execute_script("arguments[0].click();", found_next)
                try:
                    WebDriverWait(driver, wait_seconds).until(EC.presence_of_element_located((By.TAG_NAME, "table")))
                except Exception:
                    time.sleep(0.8)
                time.sleep(0.4)
                new = extract_current()
                added = 0
                for t in new:
                    if t not in seen:
                        seen.add(t); collected.append(t); added += 1
                if debug: print(f"[debug] next loop added {added}")
                # a click that yields no new ticker ends the loop
                if added == 0:
                    break

        # save to CSV and return
        if collected:
            with open("tickers_ibrx100_full.csv", "w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(["Ticker"])
                for t in collected:
                    writer.writerow([t])
            if debug: print(f"[debug] salvou {len(collected)} tickers")
            return collected
        else:
            return None

    finally:
        # always shut the browser down, including when an exception propagates
        driver.quit()

In [6]:
# -------------- Orquestração --------------
def get_all_ibrx100_tickers(iframe_src="https://sistemaswebb3-listados.b3.com.br/indexPage/day/IBXX?language=en-us", try_index_codes=None, use_selenium_fallback=True, debug=False):
    """
    Collect index tickers, trying plain HTTP first and Selenium second.

    Candidate URLs are `iframe_src` itself followed by one variant per entry of
    `try_index_codes`, built by replacing the literal "IBXX" in `iframe_src`.
    Every candidate is first passed to try_download_csv_from_iframe; if none
    yields tickers and `use_selenium_fallback` is true, every candidate is then
    passed to selenium_extract_all_from_iframe.

    Args:
        iframe_src: Base URL; may contain the placeholder "IBXX".
        try_index_codes: Optional index codes (e.g. ['IBRX100']) substituted for
            "IBXX". A single string is treated as one code.
        use_selenium_fallback: Whether to try the browser-based extraction.
        debug: Print progress/errors; also runs Chrome non-headless in the
            Selenium step (headless is passed as `not debug`).

    Returns:
        The first non-empty ticker list obtained, or [] when every attempt fails.
    """
    headers = {"User-Agent": "Mozilla/5.0"}

    if isinstance(try_index_codes, str):
        # A bare string would otherwise be iterated character by character.
        try_index_codes = [try_index_codes]
    candidates = [iframe_src]
    candidates.extend(iframe_src.replace("IBXX", code) for code in (try_index_codes or ()))
    # Drop repeated URLs (e.g. when iframe_src has no "IBXX" placeholder) so the
    # same page is not fetched or opened in a browser more than once.
    urls_to_try = list(dict.fromkeys(candidates))

    # 1) direct download / parsing with requests
    for url in urls_to_try:
        if debug: print("[debug] tentando URL:", url)
        try:
            tickers = try_download_csv_from_iframe(url, headers=headers)
        except Exception as e:
            # Best effort: a failure on one URL must not stop the others.
            if debug: print("requests parse falhou:", e)
            continue
        if tickers:
            print("✅ Extraído via download/parsing direto. Tickers:", len(tickers))
            return tickers

    # 2) fallback: Selenium
    if use_selenium_fallback:
        for url in urls_to_try:
            if debug: print("[debug] Selenium tentando:", url)
            try:
                res = selenium_extract_all_from_iframe(url, headless=not debug, wait_seconds=12, debug=debug)
            except Exception as e:
                if debug: print("Selenium tentativa falhou:", e)
                continue
            if res:
                print("✅ Extraído via Selenium. Tickers:", len(res))
                return res

    print("⚠️ Não foi possível extrair tickers por nenhum método.")
    return []

# USO
if __name__ == "__main__":
    # Pass try_index_codes to also try URLs where "IBXX" is replaced by each code (e.g. IBRX100)
    tickers = get_all_ibrx100_tickers(
        iframe_src="https://sistemaswebb3-listados.b3.com.br/indexPage/day/IBXX?language=en-us",
        try_index_codes=["IBRX100","IBRX-100"],
        use_selenium_fallback=True,
        debug=True  # set to False for headless Chrome and fewer logs
    )
    print("TOTAL TICKERS:", len(tickers))
    print(tickers)

[debug] tentando URL: https://sistemaswebb3-listados.b3.com.br/indexPage/day/IBXX?language=en-us
[debug] tentando URL: https://sistemaswebb3-listados.b3.com.br/indexPage/day/IBRX100?language=en-us
[debug] tentando URL: https://sistemaswebb3-listados.b3.com.br/indexPage/day/IBRX-100?language=en-us
[debug] Selenium tentando: https://sistemaswebb3-listados.b3.com.br/indexPage/day/IBXX?language=en-us
[debug] selects encontrados: 2
[debug] ajustei page-size para 120
[debug] page0 count=97
[debug] detected page numbers: []
[debug] no numeric pages; trying next loop
[debug] salvou 97 tickers
✅ Extraído via Selenium. Tickers: 97
TOTAL TICKERS: 97
['ALOS3.SA', 'ABEV3.SA', 'ANIM3.SA', 'ASAI3.SA', 'AURE3.SA', 'AZZA3.SA', 'BBSE3.SA', 'BBDC3.SA', 'BBDC4.SA', 'BRAP4.SA', 'BBAS3.SA', 'BRKM5.SA', 'BRAV3.SA', 'BPAC11.SA', 'CXSE3.SA', 'CEAB3.SA', 'CMIG4.SA', 'COGN3.SA', 'CSMG3.SA', 'CPLE3.SA', 'CPLE6.SA', 'CSAN3.SA', 'CPFE3.SA', 'CMIN3.SA', 'CURY3.SA', 'CVCB3.SA', 'CYRE3.SA', 'DIRR3.SA', 'ECOR3.SA', 'EL