In [1]:
import re
import time
import pandas as pd
from bs4 import BeautifulSoup, NavigableString
from urllib.parse import urljoin
from playwright.async_api import async_playwright

# Ticker symbols to scrape; each one is substituted into DETAIL_URL.
TICKERS = [
    "ABJC","BICB","BICC","BNBC","BOAB","BOABF","BOAC","BOAM","BOAN","BOAS",
    "CABC","CBIBF","CFAC","CIEC","ECOC","ETIT","FTSC","LNBB","NEIC","NSBC",
    "NTLC","ONTBF","ORAC","ORGT","PALC","PRSC","SAFC","SCRC","SDCC","SDSC",
    "SEMC","SGBC","SHEC","SIBC","SICC","SIVC","SLBC","SMBC","SNTS","SOGC",
    "SPHC","STAC","STBC","SVOC","TTLC","TTLS","UNLC","UNXC"
]

# Site root, also used to turn relative links (logo, files) into absolute URLs.
BASE = "https://www.richbourse.com"
# Company detail page; "{ticker}" is filled in with str.format.
DETAIL_URL = BASE + "/common/apprendre/details-societe/{ticker}"
# User-agent string handed to the Playwright browser context.
UA = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_6) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/126.0 Safari/537.36")

# Tag names treated as section headings when locating and delimiting sections.
HEADER_TAGS = ("h1","h2","h3","h4","h5","h6","strong","b")

def norm(s: str) -> str:
    """Return *s* with typographic noise removed.

    Non-breaking spaces become regular spaces, the typographic apostrophe
    (U+2019) becomes a straight quote, and surrounding whitespace is
    stripped. Falsy input (``None`` or ``""``) yields an empty string.
    """
    text = s if s else ""
    text = text.replace("\xa0", " ")
    text = text.replace("\u2019", "'")
    return text.strip()

def clean_join(lines):
    """Join text fragments into a single space-separated string.

    Each fragment is normalised with ``norm``; fragments that end up empty
    are dropped, and any run of two or more whitespace characters left in
    the joined result is collapsed to one space.
    """
    cleaned = (norm(fragment) for fragment in lines)
    merged = " ".join(piece for piece in cleaned if piece)
    return re.sub(r"\s{2,}", " ", merged)

def target_content(soup: BeautifulSoup) -> BeautifulSoup:
    """Narrow the parsed page down to the block holding the company details.

    The anchor is the first ``<h1>``/``<h2>`` whose text does not contain
    "Détails"; its nearest ``<section>``, ``<div>`` or ``<main>`` ancestor is
    returned. Without such a heading, the first ``<div>``/``<section>``/
    ``<article>`` whose text contains "Société :" is used instead. The whole
    soup is returned when neither lookup succeeds.
    """
    def is_company_heading(tag):
        return getattr(tag, "name", "") in ("h1", "h2") and "Détails" not in tag.get_text()

    def is_labelled_container(tag):
        return (
            getattr(tag, "name", "") in ("div", "section", "article")
            and "Société :" in tag.get_text()
        )

    heading = soup.find(is_company_heading)
    if heading:
        # Closest enclosing structural container, walking outwards.
        wrappers = (
            ancestor
            for ancestor in heading.parents
            if getattr(ancestor, "name", "") in ("section", "div", "main")
        )
        return next(wrappers, soup)

    # Fallback: any block that mentions "Société :" somewhere in its text.
    return soup.find(is_labelled_container) or soup

def extract_pairs(content: BeautifulSoup) -> dict:
    """Extract the labelled "key : value" facts of a company page.

    Two layouts are recognised:

    1. ``<p><strong>Key :</strong> Value</p>``
    2. plain text ``Key : Value`` inside a ``<p>``, ``<div>`` or ``<li>``

    Layout 2 is only used for fields that layout 1 left empty.

    Args:
        content: Parsed fragment to search.

    Returns:
        A dict keyed by output column name ("Societe", "Pays", ...); fields
        that were not found map to an empty string.
    """
    wanted = {
        "Société": "Societe",
        "Secteur d'activité": "Secteur d'activité",
        "Pays": "Pays",
        "Introduction à la BRVM": "Introduction à la BRVM",
        "Nombre de titres": "Nombre de titres",
        "Flottant": "Flottant",
        "Site Web": "Site Web",
        "Téléphone": "Téléphone",
    }
    out = {v: "" for v in wanted.values()}

    # 1) <strong>key</strong> value
    for p in content.find_all("p"):
        st = p.find("strong")
        if st is None:
            continue
        label = norm(st.get_text())
        # French typography puts a space before the colon ("Société :"), so
        # both characters must be trimmed to recover the bare key.
        key = label.rstrip(" :")
        if key not in wanted:
            continue
        val = norm(p.get_text(" ", strip=True))
        if val.startswith(label):
            # The colon may sit outside <strong>, leaving " : value" here.
            val = norm(val[len(label):].lstrip(" :"))
        out[wanted[key]] = val

    # 2) plain text "key : value"
    # A value ends where the next known label starts (or at end of line), so a
    # node holding several pairs does not spill the following fields into it.
    labels = "|".join(re.escape(k) for k in wanted)
    patterns = {
        k: re.compile(
            rf"\b{re.escape(k)}\s*:\s*(.*?)(?=\s*\b(?:{labels})\s*:|$)",
            re.MULTILINE,
        )
        for k in wanted
    }
    best = {}
    for n in content.find_all(["p", "div", "li"]):
        txt = norm(n.get_text(" ", strip=True))
        for k, pattern in patterns.items():
            mapped = wanted[k]
            if out[mapped]:
                continue
            m = pattern.search(txt)
            if not m:
                continue
            candidate = norm(m.group(1))
            # Outer containers come first in document order and carry extra
            # trailing text; the shortest candidate is the innermost node.
            if candidate and (mapped not in best or len(candidate) < len(best[mapped])):
                best[mapped] = candidate
    out.update(best)

    return out

def section_text(content: BeautifulSoup, titles, stops):
    """Return the text that follows a section heading.

    The first tag whose name is in ``HEADER_TAGS`` and whose normalised text
    contains one of *titles* opens the section. Its following siblings are
    read until a heading tag containing one of *stops* is reached. Bare text
    nodes are ignored; list items and block elements are collected and
    joined with ``clean_join``. Returns "" when no heading matches.
    """
    wanted_titles = tuple(norm(title) for title in titles)
    stop_titles = tuple(norm(stop) for stop in stops)

    def opens_section(tag):
        if getattr(tag, "name", "") not in HEADER_TAGS:
            return False
        heading = norm(tag.get_text())
        return any(title in heading for title in wanted_titles)

    def closes_section(tag_name, tag):
        if tag_name not in HEADER_TAGS:
            return False
        heading = norm(tag.get_text())
        return any(stop in heading for stop in stop_titles)

    header = content.find(opens_section)
    if not header:
        return ""

    collected = []
    for sibling in header.next_siblings:
        if isinstance(sibling, NavigableString):
            continue
        tag_name = getattr(sibling, "name", "")
        if closes_section(tag_name, sibling):
            break
        if tag_name in ("ul", "ol"):
            # Lists contribute one fragment per item.
            collected += [li.get_text(" ", strip=True) for li in sibling.find_all("li")]
        elif tag_name in ("p", "div", "section", "article"):
            collected.append(sibling.get_text(" ", strip=True))
    return clean_join(collected)

def extract_actionnaires(content: BeautifulSoup) -> str:
    """Collect shareholder entries written as ``Name (xx,yy%)``.

    Names may themselves contain parentheses (e.g. ``Public (BRVM)``). The
    text is normalised before matching so that non-breaking spaces and
    typographic apostrophes inside a name do not cut it short, and a single
    space before the percent sign (``12,5 %``) is accepted.

    Args:
        content: Parsed fragment whose whole text is scanned.

    Returns:
        The distinct entries in order of first appearance, joined with
        "; ", or "" when nothing matches.
    """
    # norm() only touches NBSP / U+2019 and the outer whitespace, so the
    # newlines separating text nodes are preserved and matches stay per line.
    text = norm(content.get_text("\n", strip=True))
    items = re.findall(r"[A-Za-zÀ-ÿ0-9\-\.'& ()]+?\(\d+[,\.]?\d* ?%\)", text)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return "; ".join(dict.fromkeys(norm(item) for item in items))

def extract_logo(content: BeautifulSoup) -> str:
    """Return the absolute URL of the company logo, or "" when none is found.

    An ``<img>`` whose ``src`` contains ``/ressources/uploads/profil-societe/``
    (case-insensitive) wins. Otherwise the first ``<img>`` with a ``src`` is
    used, skipping known tracker / site-logo sources and images declared one
    pixel wide or high.
    """
    preferred = content.find("img", src=re.compile(r"/ressources/uploads/profil-societe/", re.I))
    if preferred:
        return urljoin(BASE, preferred["src"])

    blocked = ["cleardot","google","analytics","rb_logo_long"]
    for candidate in content.find_all("img", src=True):
        raw_src = candidate["src"]
        lowered = raw_src.lower()
        if any(marker in lowered for marker in blocked):
            continue
        # Skip 1x1 images: both dimensions must be numeric, one of them "1".
        width = (candidate.get("width") or "").strip()
        height = (candidate.get("height") or "").strip()
        is_pixel = width.isdigit() and height.isdigit() and "1" in (width, height)
        if is_pixel:
            continue
        return urljoin(BASE, raw_src)
    return ""

def parse_fractionnements(content: BeautifulSoup, ticker: str) -> pd.DataFrame:
    """Parse the stock-split table (columns "Date" / "Parité") of a page.

    The first table whose header cells include "Date" and "Parité"
    (case-insensitive, accent optional) is used. Header cells are read from
    ``<thead>`` when present, otherwise from the table's first row; data rows
    are read from ``<tbody>`` when present, otherwise from the table itself,
    so tables written without those wrappers are no longer skipped.

    Args:
        content: Parsed fragment to search for tables.
        ticker: Symbol copied into the "Ticker" column of every row.

    Returns:
        A DataFrame with columns Ticker, Date, Parite, FichierURL; it has no
        rows when no matching table (or no data row) is found.
    """
    columns = ["Ticker", "Date", "Parite", "FichierURL"]
    for tbl in content.find_all("table"):
        head = tbl.find("thead") or tbl.find("tr")
        if head is None:
            continue
        headers = {norm(th.get_text()).casefold() for th in head.find_all("th")}
        if "date" not in headers or not ({"parité", "parite"} & headers):
            continue
        body = tbl.find("tbody") or tbl
        rows = []
        for tr in body.find_all("tr"):
            tds = tr.find_all("td")
            if not tds:
                # Header rows only carry <th> cells.
                continue
            date = norm(tds[0].get_text(" ", strip=True))
            parite = norm(tds[1].get_text(" ", strip=True)) if len(tds) > 1 else ""
            file_url = ""
            if len(tds) > 2:
                # Third cell optionally links to the announcement file.
                a = tds[2].find("a", href=True)
                if a:
                    file_url = urljoin(BASE, a["href"])
            rows.append({"Ticker": ticker, "Date": date, "Parite": parite, "FichierURL": file_url})
        return pd.DataFrame(rows, columns=columns)
    return pd.DataFrame(columns=columns)

async def robust_goto(page, url):
    """Navigate *page* to *url* and wait for the page to settle.

    Navigation is given 60 s. After the DOM is loaded, up to 8 s more are
    spent waiting for the network to go idle; a timeout of that extra wait is
    ignored because it is best-effort only. Any other error propagates to the
    caller.
    """
    # Local import: the file's top-level import only brings in async_playwright.
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    await page.goto(url, timeout=60000)
    await page.wait_for_load_state("domcontentloaded")
    try:
        await page.wait_for_load_state("networkidle", timeout=8000)
    except PlaywrightTimeoutError:
        # Some pages never become idle; the DOM is already loaded at this
        # point. Unlike a bare `except:`, this leaves task cancellation and
        # KeyboardInterrupt untouched.
        pass

async def scrape_one(page, ticker):
    """Scrape the detail page of one ticker.

    Loads the page, restricts parsing to the central block and runs every
    extractor on it.

    Returns:
        A ``(row, frac)`` tuple: ``row`` is the dict of company fields and
        ``frac`` the DataFrame produced by ``parse_fractionnements``.
    """
    url = DETAIL_URL.format(ticker=ticker)
    await robust_goto(page, url)
    html = await page.content()

    # Central area only.
    content = target_content(BeautifulSoup(html, "html.parser"))

    pairs = extract_pairs(content)
    presentation = section_text(
        content,
        titles=("Présentation",),
        stops=("Déterminants", "Derniers fractionnements", "Principaux actionnaires"),
    )
    determinants = section_text(
        content,
        titles=("Déterminants Sectoriel","Déterminants sectoriel"),
        stops=("Derniers fractionnements","Principaux actionnaires","Présentation"),
    )
    actionnaires = extract_actionnaires(content)
    logo = extract_logo(content)
    frac = parse_fractionnements(content, ticker)

    # Fields copied straight from the key/value pairs, in output order.
    pair_fields = (
        "Societe",
        "Secteur d'activité",
        "Pays",
        "Introduction à la BRVM",
        "Nombre de titres",
        "Flottant",
        "Site Web",
        "Téléphone",
    )
    row = {"Ticker": ticker}
    row.update({field: pairs.get(field, "") for field in pair_fields})
    row.update({
        "LogoURL": logo,
        "Presentation": presentation,
        "DeterminantsSectoriels": determinants,
        "ActionnairesPrincipaux": actionnaires,
        "SourceURL": url,
    })
    return row, frac

async def main():
    """Scrape every ticker in ``TICKERS`` and write the results to CSV.

    Company details go to ``societes_details.csv`` (one row per ticker; a
    ticker whose scrape failed gets a row with only "Ticker" and "SourceURL"
    filled). Stock splits, if any were found, go to
    ``societes_fractionnements.csv``.
    """
    # Local import so this function does not depend on an import statement
    # placed after its definition.
    import asyncio

    cols = ["Ticker","Societe","Secteur d'activité","Pays","Introduction à la BRVM",
            "Nombre de titres","Flottant","Site Web","Téléphone","LogoURL",
            "Presentation","DeterminantsSectoriels","ActionnairesPrincipaux","SourceURL"]

    rows, fracs = [], []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False, args=["--disable-blink-features=AutomationControlled"])
        try:
            context = await browser.new_context(user_agent=UA, locale="fr-FR")
            page = await context.new_page()

            for t in TICKERS:
                try:
                    print("📘", t)
                    r, df_frac = await scrape_one(page, t)
                except Exception as e:
                    # One failing page must not abort the whole run: record a
                    # placeholder row and move on to the next ticker.
                    print(f"❌ {t}: {e}")
                    blank = dict.fromkeys(cols, "")
                    blank["Ticker"] = t
                    blank["SourceURL"] = DETAIL_URL.format(ticker=t)
                    rows.append(blank)
                    continue
                rows.append(r)
                if not df_frac.empty:
                    fracs.append(df_frac)
                # Short pause between pages; asyncio.sleep yields to the event
                # loop, whereas time.sleep would block it.
                await asyncio.sleep(0.25)
        finally:
            await browser.close()

    # Company details
    df = pd.DataFrame(rows)
    for c in cols:
        if c not in df.columns:
            df[c] = ""
    df = df[cols]
    df.to_csv("societes_details.csv", index=False)
    print("✅ societes_details.csv écrit")

    # Stock splits
    if fracs:
        dff = pd.concat(fracs, ignore_index=True)
        dff.to_csv("societes_fractionnements.csv", index=False)
        print("✅ societes_fractionnements.csv écrit")
    else:
        print("ℹ️ Aucun fractionnement trouvé")

# Notebook/Jupyter entry point: the coroutine is awaited directly at top level.
# Top-level `await` is accepted by IPython/Jupyter cells but is a syntax error
# in a plain Python script.
import asyncio
await main()


📘 ABJC
📘 BICB
📘 BICC
📘 BNBC
📘 BOAB
📘 BOABF
📘 BOAC
📘 BOAM
📘 BOAN
📘 BOAS
📘 CABC
📘 CBIBF
📘 CFAC
📘 CIEC
📘 ECOC
📘 ETIT
📘 FTSC
📘 LNBB
📘 NEIC
📘 NSBC
📘 NTLC
📘 ONTBF
📘 ORAC
📘 ORGT
📘 PALC
📘 PRSC
📘 SAFC
📘 SCRC
📘 SDCC
📘 SDSC
📘 SEMC
📘 SGBC
📘 SHEC
📘 SIBC
📘 SICC
📘 SIVC
📘 SLBC
📘 SMBC
📘 SNTS
📘 SOGC
📘 SPHC
📘 STAC
📘 STBC
📘 SVOC
📘 TTLC
📘 TTLS
📘 UNLC
📘 UNXC
✅ societes_details.csv écrit
ℹ️ Aucun fractionnement trouvé
