In [5]:
# -*- coding: utf-8 -*-
import asyncio
import random
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

# Ticker symbols whose reports are scraped, in processing order.
TICKERS = ["ABJC", "BICB"]

# URL template of a report page; filled with str.format using the ticker,
# the report type and the period index.
BASE_URL = "https://www.richbourse.com/investisseur/rapport-activite/index/{ticker}/{type_}/{periode}"

# Characters deleted from a cell: ASCII space, no-break space (U+00A0),
# narrow no-break space (U+202F), thin space (U+2009) and the comma.
_CELL_NOISE = str.maketrans("", "", " \xa0\u202f\u2009,")


def clean_cell(val: str) -> str:
    """Normalize the raw text of a table cell.

    Everything from the first "(" onward is discarded (the trailing
    variation in percent), then every space-like separator and every comma
    is removed and the result is stripped of surrounding whitespace.

    Args:
        val: Raw cell text. ``None`` and the empty string are accepted.

    Returns:
        The cleaned text, or "" when *val* is falsy.
    """
    if not val:
        return ""
    # Keep only what precedes the "(…%)" variation suffix.
    head = val.partition("(")[0]
    # NOTE(review): commas are dropped outright; if the site uses "," as a
    # decimal separator this also removes the decimal point — confirm.
    return head.translate(_CELL_NOISE).strip()

def detect_unit_footer(soup: BeautifulSoup) -> str:
    """Return the text of the page element that states the unit of the data.

    The elements selected by ``p, small, div, span, footer`` are examined
    from the last one to the first. The text of the first element that is
    non-empty and matches one of the unit patterns (case-insensitively) is
    returned.

    Args:
        soup: Parsed page to search.

    Returns:
        The matching element's text, or "" when nothing matches.
    """
    unit_patterns = (
        r"donn[ée]es?.*xof",
        r"donn[ée]es?.*francs?\s*cfa",
        r"donn[ée]es?.*milliers.*xof",
        r"en\s+milliers\s+de\s+xof",
    )
    candidates = soup.select("p, small, div, span, footer")
    for node in reversed(candidates):
        text = node.get_text(" ", strip=True)
        if text and any(re.search(pattern, text, flags=re.I) for pattern in unit_patterns):
            return text
    return ""

def period_label(type_: str, periode: str) -> str:
    """Build the short period label for a report.

    Args:
        type_: Report type; only its lowercase prefix is inspected.
        periode: Period index as a string.

    Returns:
        ``"T<periode>"`` for a type starting with "trimes", ``"S1"`` or
        ``"S2"`` for a type starting with "semest" ("S1" only when
        *periode* is ``"1"``), and ``"An"`` for anything else.
    """
    kind = type_.lower()
    if kind.startswith("semest"):
        return "S2" if periode != "1" else "S1"
    if kind.startswith("trimes"):
        return f"T{periode}"
    return "An"

async def scrape_report(page, ticker: str, type_: str, periode: str) -> pd.DataFrame:
    """Scrape one report table and return it as a DataFrame.

    The page at ``BASE_URL`` (formatted with *ticker*, *type_* and
    *periode*) is loaded, its first ``table.table`` element is parsed, and
    every cell is passed through ``clean_cell``. The columns "Ticker",
    "Type", "Période_détail" and "en XOF" are added and placed, together
    with "Période", in front of the table's own columns.

    Args:
        page: Page object used to navigate and to read the rendered HTML.
        ticker: Ticker symbol inserted in the URL and stored in "Ticker".
        type_: Report type inserted in the URL; normalized to "Annuel",
            "Semestriel" or "Trimestriel" for the "Type" column.
        periode: Period index inserted in the URL.

    Returns:
        One row per table row, or an empty DataFrame when the table never
        appears, is missing from the HTML, or has no data rows.

    Raises:
        Errors raised by ``page.goto`` are not caught and propagate.
    """
    # Imported here so the narrow exception type sits next to its only use.
    from playwright.async_api import Error as PlaywrightError

    url = BASE_URL.format(ticker=ticker, type_=type_, periode=periode)
    await page.goto(url, timeout=60000)

    try:
        await page.wait_for_selector("table.table", timeout=8000)
    except PlaywrightError:
        # Timeout or another Playwright failure: treat as "no report".
        # Cancellation and KeyboardInterrupt are deliberately not swallowed.
        return pd.DataFrame()

    soup_page = BeautifulSoup(await page.content(), "html.parser")
    table = soup_page.select_one("table.table")
    if table is None:
        return pd.DataFrame()

    thead = table.find("thead")
    body = table.find("tbody")

    # Headers come from <thead>, or failing that from the first body row.
    header_row = None
    if thead is not None:
        headers = [th.get_text(strip=True) for th in thead.find_all("th")]
    else:
        header_row = body.find("tr") if body is not None else None
        headers = (
            [td.get_text(strip=True) for td in header_row.find_all("td")]
            if header_row is not None
            else []
        )

    if "Période" not in headers:
        headers = ["Période"] + headers

    rows = []
    if body is not None:
        for tr in body.find_all("tr"):
            if tr is header_row:
                # This row already supplied the headers; it is not data.
                continue
            tds = [td.get_text(" ", strip=True) for td in tr.find_all("td")]
            if not tds:
                continue
            if len(tds) == len(headers) - 1 and headers[0] == "Période":
                # The period is not rendered as a column: pad it in.
                tds = [""] + tds
            # zip stops at the shorter of headers / cells.
            rows.append(dict(zip(headers, map(clean_cell, tds))))

    df = pd.DataFrame(rows)
    if df.empty:
        return df

    # Unit note found elsewhere on the page (may be "").
    unit_phrase = detect_unit_footer(soup_page)

    lowered = type_.lower()
    if lowered.startswith("annuel"):
        type_norm = "Annuel"
    elif lowered.startswith("semest"):
        type_norm = "Semestriel"
    else:
        type_norm = "Trimestriel"

    df["Ticker"] = ticker
    df["Type"] = type_norm
    df["Période_détail"] = period_label(type_norm, periode)
    df["en XOF"] = unit_phrase
    if "Période" not in df.columns:
        # No row was long enough to reach the "Période" header; keep the
        # column so the selection below cannot raise KeyError.
        df["Période"] = ""

    front = ["Ticker", "Type", "Période_détail", "Période", "en XOF"]
    others = [c for c in df.columns if c not in front]
    return df[front + others]

async def main(output_csv: str = "rapport_brvm_complet.csv", headless: bool = True) -> pd.DataFrame:
    """Scrape every report of every ticker in ``TICKERS`` and write one CSV.

    For each ticker the annual report, both half-year reports and the four
    quarterly reports are requested through ``scrape_report``. A ticker
    that yields no data contributes a single placeholder row whose "Type"
    is "Aucune donnée".

    Args:
        output_csv: Path of the CSV file to write.
        headless: Passed to the Chromium launcher.

    Returns:
        The combined DataFrame. It is empty, and no file is written, when
        nothing at all was collected.
    """
    # (type, period) pairs requested for each ticker, in output order.
    reports = [("Annuel", "1"), ("Semestriel", "1"), ("Semestriel", "2")]
    reports += [("Trimestriel", str(quarter)) for quarter in range(1, 5)]

    all_parts = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            try:
                context = await browser.new_context(storage_state="cookies.json")
            except Exception:
                # Best effort: a missing or unusable cookies.json falls back
                # to a fresh context. Cancellation still propagates.
                context = await browser.new_context()
            page = await context.new_page()

            for ticker in TICKERS:
                print(f"📊 {ticker} ...")
                frames = []
                for type_, periode in reports:
                    frame = await scrape_report(page, ticker, type_, periode)
                    if isinstance(frame, pd.DataFrame) and not frame.empty:
                        frames.append(frame)

                if frames:
                    all_parts.append(pd.concat(frames, ignore_index=True))
                else:
                    # pd.concat rejects an empty list, so the placeholder
                    # row is built without calling it.
                    all_parts.append(pd.DataFrame([{
                        "Ticker": ticker, "Type": "Aucune donnée",
                        "Période_détail": "", "Période": "", "en XOF": ""
                    }]))

                # Randomized pause between tickers, without blocking the
                # event loop.
                await asyncio.sleep(0.5 + random.random() * 0.7)
        finally:
            await browser.close()

    df_final = pd.concat(all_parts, ignore_index=True) if all_parts else pd.DataFrame()
    if df_final.empty:
        print("⚠️ Aucun rapport récupéré")
        return df_final

    first_cols = ["Ticker", "Type", "Période_détail", "Période", "en XOF"]
    df_final = df_final[first_cols + [c for c in df_final.columns if c not in first_cols]]
    df_final.to_csv(output_csv, index=False)
    print(f"✅ Fichier {output_csv} généré ({len(df_final)} lignes)")
    return df_final

# Example run from a Jupyter cell (relies on the notebook's top-level `await` support):
df = await main(headless=False)
# df.head()


📊 ABJC ...
📊 BICB ...
✅ Fichier rapport_brvm_complet.csv généré (35 lignes)
