In [1]:
import asyncio
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import Error as PlaywrightError
from playwright.async_api import async_playwright

# Ticker symbols to scrape. Each one is interpolated into the richbourse.com
# "analyse-societe" URLs by the scrape_* functions below.
# NOTE(review): presumably securities listed on the BRVM exchange — confirm.
TICKERS = [
    "ABJC","BICB","BICC","BNBC","BOAB","BOABF","BOAC","BOAM","BOAN","BOAS",
    "CABC","CBIBF","CFAC","CIEC","ECOC","ETIT","FTSC","LNBB","NEIC","NSBC",
    "NTLC","ONTBF","ORAC","ORGT","PALC","PRSC","SAFC","SCRC","SDCC","SDSC",
    "SEMC","SGBC","SHEC","SIBC","SICC","SIVC","SLBC","SMBC","SNTS","SOGC",
    "SPHC","STAC","STBC","SVOC","TTLC","TTLS","UNLC","UNXC"
]

# User-agent string handed to the Playwright browser context in main().
UA = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_6) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/126.0 Safari/537.36")

def clean_num(x: str) -> str:
    """Normalise a scraped numeric cell into a plain dot-decimal string.

    Everything from the first "(" onward is discarded, percent signs,
    spaces and non-breaking spaces are removed, and decimal commas become
    dots. Falsy input and cells holding only a dash placeholder
    ("-", "–", "—") yield the empty string.

    Args:
        x: Raw cell text (may be empty or None).

    Returns:
        The cleaned number as text, or "" when there is no value.
    """
    if not x:
        return ""

    # Keep only the part that precedes an opening parenthesis, if any.
    head = x.partition("(")[0]

    # One pass: drop "%", NBSP and regular spaces; turn "," into ".".
    table = str.maketrans({"%": None, "\xa0": None, " ": None, ",": "."})
    cleaned = head.translate(table).strip()

    return "" if cleaned in {"-", "–", "—"} else cleaned

def norm_text(s: str) -> str:
    """Return *s* with typographic characters normalised and ends trimmed.

    Non-breaking spaces become regular spaces and the curly apostrophe
    (U+2019) becomes a straight one, then surrounding whitespace is stripped.
    """
    substitutions = str.maketrans({"\xa0": " ", "\u2019": "'"})
    return s.translate(substitutions).strip()

def parse_all_tables(html: str) -> list[pd.DataFrame]:
    """Convert every usable ``<table>`` in *html* into a DataFrame.

    Column names come from the ``<th>`` cells of the table's ``<thead>``;
    when there are none, the first body row is used as the header instead.
    Tables without a ``<tbody>``, without any non-empty row, or that end up
    with zero columns are skipped. Cell text is normalised with
    ``norm_text``.

    Args:
        html: Full HTML source of a page.

    Returns:
        One DataFrame per table that could be parsed, in document order.
    """
    frames: list[pd.DataFrame] = []

    for table in BeautifulSoup(html, "html.parser").find_all("table"):
        head = table.find("thead")
        columns = []
        if head:
            columns = [norm_text(cell.get_text(strip=True)) for cell in head.find_all("th")]

        body = table.find("tbody")
        if not body:
            continue

        # Collect the text of each body row, ignoring rows with no cells.
        records = []
        for line in body.find_all("tr"):
            cells = [norm_text(cell.get_text(" ", strip=True)) for cell in line.find_all(["td", "th"])]
            if cells:
                records.append(cells)
        if not records:
            continue

        # No explicit header: promote the first row to column names.
        if not columns:
            columns = [norm_text(value) for value in records[0]]
            records = records[1:]

        # Trim header and rows to a common width so over-long rows cannot
        # break DataFrame construction.
        width = min(len(columns), max(map(len, records), default=0))
        if width == 0:
            continue

        try:
            frame = pd.DataFrame([record[:width] for record in records], columns=columns[:width])
        except Exception:
            continue
        frames.append(frame)

    return frames

async def robust_goto(page, url: str):
    """Navigate *page* to *url* and wait for it to settle.

    After the first load, if the page's URL contains
    ``/investisseur/profile`` the navigation is attempted one more time
    (presumably the site bounced the request to the profile page — confirm).
    Finally the function waits up to 10 s for network idle on a best-effort
    basis: a Playwright failure during that wait is ignored.

    Args:
        page: Playwright page used for navigation.
        url: Address to open.

    Raises:
        playwright.async_api.Error: If a navigation or the DOM-content-loaded
            wait fails (including timeouts).
    """
    await page.goto(url, timeout=60000)
    await page.wait_for_load_state("domcontentloaded")
    if "/investisseur/profile" in page.url:
        await page.goto(url, timeout=60000)
        await page.wait_for_load_state("domcontentloaded")
    try:
        await page.wait_for_load_state("networkidle", timeout=10000)
    except PlaywrightError:
        # Best effort only. Catching Playwright's own error type (which
        # includes its TimeoutError) instead of a bare ``except`` lets task
        # cancellation and KeyboardInterrupt propagate.
        pass

# -------- VALUATION (historical table only) --------
async def scrape_valorisation_histo(page, ticker: str) -> pd.DataFrame:
    """Scrape the historical valuation table for *ticker*.

    Loads the ticker's valuation page, parses all tables and picks the
    first one that has an "Année" column together with at least one of the
    PER/PBR columns. The ratio columns are cleaned with ``clean_num`` and a
    "Ticker" column is appended.

    Args:
        page: Playwright page used for navigation.
        ticker: Symbol inserted into the page URL.

    Returns:
        The selected table restricted to the known columns plus "Ticker",
        or an empty DataFrame when no matching table exists.
    """
    wanted = ["Année", "PER Action", "PER Secteur", "PBR Action", "PBR Secteur"]
    ratio_names = set(wanted[1:])

    await robust_goto(
        page,
        f"https://www.richbourse.com/investisseur/analyse-societe/valorisation/{ticker}",
    )
    tables = parse_all_tables(await page.content())

    # First table carrying the year column and at least one ratio column.
    match = next(
        (
            tbl for tbl in tables
            if "Année" in tbl.columns and not ratio_names.isdisjoint(tbl.columns)
        ),
        None,
    )
    if match is None:
        return pd.DataFrame()

    present = [name for name in wanted if name in match.columns]
    result = match[present].copy()
    for name in present:
        if name == "Année":
            continue
        result[name] = result[name].map(clean_num)
    result["Ticker"] = ticker
    return result

# -------- PERFORMANCE RATIOS (click the "ROE, Gearing, etc." tab) --------
async def scrape_performances(page, ticker: str) -> pd.DataFrame:
    """Scrape the performance-ratio table for *ticker*.

    Loads the ticker's performance-ratios page, tries to click the
    "ROE, Gearing, etc." tab, then parses all tables and picks the first
    one that has an "Exercice" or "Année" column together with at least one
    of the ratio columns. An "Exercice" column is renamed to "Année" when no
    "Année" column exists, the ratio columns are cleaned with ``clean_num``
    and a "Ticker" column is appended.

    Args:
        page: Playwright page used for navigation.
        ticker: Symbol inserted into the page URL.

    Returns:
        The selected table restricted to the known columns plus "Ticker",
        or an empty DataFrame when no matching table exists.
    """
    url = f"https://www.richbourse.com/investisseur/analyse-societe/performances-ratios/{ticker}"
    await robust_goto(page, url)

    # Explicitly click the "ROE, Gearing, etc." tab. This is best effort:
    # the click may fail (e.g. the tab is already active), in which case the
    # page is parsed as-is. Only Playwright errors are ignored so that task
    # cancellation and KeyboardInterrupt still propagate.
    try:
        await page.get_by_text("ROE, Gearing, etc.", exact=True).click(timeout=5000)
    except PlaywrightError:
        pass
    else:
        try:
            await page.wait_for_load_state("networkidle", timeout=4000)
        except PlaywrightError:
            pass

    html = await page.content()
    dfs = parse_all_tables(html)
    if not dfs:
        return pd.DataFrame()

    ratio_cols = {"Gearing", "Marge d'exploitation", "Taux de profitabilité", "ROE"}
    target = None
    for df in dfs:
        # norm_text turns the typographic apostrophe into a straight one so
        # "Marge d'exploitation" matches whichever form the page uses.
        cols = {norm_text(c) for c in df.columns}
        if ("Exercice" in cols or "Année" in cols) and (ratio_cols & cols):
            target = df.copy()
            break
    if target is None:
        return pd.DataFrame()

    if "Année" not in target.columns and "Exercice" in target.columns:
        target = target.rename(columns={"Exercice": "Année"})

    keep = [
        c for c in ["Année", "Gearing", "Marge d'exploitation", "Taux de profitabilité", "ROE"]
        if c in target.columns
    ]
    target = target[keep]
    for c in [k for k in keep if k != "Année"]:
        target[c] = target[c].map(clean_num)
    target["Ticker"] = ticker
    return target

def _placeholder_row(ticker: str, statut: str) -> pd.DataFrame:
    """Return a one-row frame with blank metrics for *ticker* and the given status text."""
    return pd.DataFrame([{
        "Ticker": ticker, "Année": "", "PER Action": "", "PER Secteur": "",
        "PBR Action": "", "PBR Secteur": "", "Gearing": "",
        "Marge d'exploitation": "", "Taux de profitabilité": "", "ROE": "",
        "Statut": statut,
    }])


async def main():
    """Scrape valuation and performance data for every ticker and write a CSV.

    A Chromium browser is launched with the session stored in
    ``cookies.json``. For each entry of ``TICKERS`` the valuation and
    performance tables are scraped and outer-merged on ("Ticker", "Année").
    A ticker with no data, or whose scrape raised, contributes a single
    placeholder row whose "Statut" column explains why. The combined result
    is written to ``valorisation_performances.csv``.
    """
    out = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False, args=["--disable-blink-features=AutomationControlled"])
        try:
            context = await browser.new_context(storage_state="cookies.json", user_agent=UA, locale="fr-FR")
            page = await context.new_page()

            for t in TICKERS:
                try:
                    print(f"📊 {t} ...")
                    df_valo = await scrape_valorisation_histo(page, t)
                    df_perf = await scrape_performances(page, t)

                    if df_valo.empty and df_perf.empty:
                        out.append(_placeholder_row(t, "Aucune donnée"))
                    else:
                        if df_valo.empty:
                            merged = df_perf.copy()
                        elif df_perf.empty:
                            merged = df_valo.copy()
                        else:
                            merged = pd.merge(df_valo, df_perf, on=["Ticker", "Année"], how="outer")
                        merged["Statut"] = ""
                        out.append(merged)

                    # Short pause between tickers. asyncio.sleep yields to
                    # the event loop, whereas time.sleep would block it.
                    await asyncio.sleep(0.35)

                except Exception as e:
                    # Per-ticker boundary: record the failure and carry on
                    # with the remaining tickers.
                    print(f"❌ {t}: {e}")
                    out.append(_placeholder_row(t, f"Erreur: {e}"))
        finally:
            # Close the browser even if context/page creation failed.
            await browser.close()

    final_df = pd.concat(out, ignore_index=True) if out else pd.DataFrame()
    cols = ["Ticker", "Année", "PER Action", "PER Secteur", "PBR Action", "PBR Secteur",
            "Gearing", "Marge d'exploitation", "Taux de profitabilité", "ROE", "Statut"]
    # Guarantee a stable column set and order even when some tables were
    # never found.
    for c in cols:
        if c not in final_df.columns:
            final_df[c] = ""
    final_df = final_df[cols]
    final_df.to_csv("valorisation_performances.csv", index=False)
    print("✅ Fichier généré : valorisation_performances.csv")

# Run in Jupyter: the notebook cell awaits the coroutine directly at top level.
await main()


📊 ABJC ...
📊 BICB ...
📊 BICC ...
📊 BNBC ...
📊 BOAB ...
📊 BOABF ...
📊 BOAC ...
📊 BOAM ...
📊 BOAN ...
📊 BOAS ...
📊 CABC ...
📊 CBIBF ...
📊 CFAC ...
📊 CIEC ...
📊 ECOC ...
📊 ETIT ...
📊 FTSC ...
📊 LNBB ...
📊 NEIC ...
📊 NSBC ...
📊 NTLC ...
📊 ONTBF ...
📊 ORAC ...
📊 ORGT ...
📊 PALC ...
📊 PRSC ...
📊 SAFC ...
📊 SCRC ...
📊 SDCC ...
📊 SDSC ...
📊 SEMC ...
📊 SGBC ...
📊 SHEC ...
📊 SIBC ...
📊 SICC ...
📊 SIVC ...
📊 SLBC ...
📊 SMBC ...
📊 SNTS ...
📊 SOGC ...
📊 SPHC ...
📊 STAC ...
📊 STBC ...
📊 SVOC ...
📊 TTLC ...
📊 TTLS ...
📊 UNLC ...
📊 UNXC ...
✅ Fichier généré : valorisation_performances.csv
