In [2]:
# -*- coding: utf-8 -*-
import re
import time
import random
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

# --------- Parameters ---------
# Ticker symbols to scrape; each one is substituted into the URL templates below.
TICKERS = [
    "ABJC","BICB","BICC","BNBC","BOAB","BOABF","BOAC","BOAM","BOAN","BOAS",
    "CABC","CBIBF","CFAC","CIEC","ECOC","ETIT","FTSC","LNBB","NEIC","NSBC",
    "NTLC","ONTBF","ORAC","ORGT","PALC","PRSC","SAFC","SCRC","SDCC","SDSC",
    "SEMC","SGBC","SHEC","SIBC","SICC","SIVC","SLBC","SMBC","SNTS","SOGC",
    "SPHC","STAC","STBC","SVOC","TTLC","TTLS","UNLC","UNXC"
]

# URL template of the periodic report pages: filled with ticker, report type and period number.
BASE_URL   = "https://www.richbourse.com/investisseur/rapport-activite/index/{ticker}/{type_}/{periode}"
# URL template of the "rapports-autres" (other reports) page of a ticker.
AUTRES_URL = "https://www.richbourse.com/investisseur/analyse-societe/rapports-autres/{ticker}"

# --------- Helpers ---------
def clean_cell(val: str) -> str:
    """Normalise a scraped table cell into a bare numeric string.

    Everything from the first opening parenthesis onward is dropped (per the
    original comment this is the percentage variation shown next to the
    value), then every whitespace character and every comma is removed.

    Args:
        val: Raw cell text. ``None`` and ``""`` are accepted.

    Returns:
        The cleaned text, or ``""`` when ``val`` is falsy.
    """
    if not val:
        return ""
    # Keep only what precedes the first "(".
    head = val.split("(", 1)[0]
    # For str patterns \s is Unicode-aware, so this strips regular spaces,
    # NBSP (U+00A0) and also narrow NBSP (U+202F) / thin spaces that the
    # previous replace-chain left inside the number.
    return re.sub(r"[\s,]+", "", head)

def detect_unit_footer(soup: BeautifulSoup) -> str:
    """Find the page text that states the unit of the published figures.

    The ``p``, ``small``, ``div``, ``span`` and ``footer`` elements are visited
    from the last one selected to the first; the text of the first element
    matching any unit pattern (case-insensitive) is returned.

    Args:
        soup: Parsed page.

    Returns:
        The matching element's text, or ``""`` when nothing matches.
    """
    unit_patterns = (
        r"donn[√©e]es?.*xof",
        r"donn[√©e]es?.*francs?\s*cfa",
        r"donn[√©e]es?.*milliers.*xof",
        r"en\s+milliers\s+de\s+xof",
    )
    candidates = soup.select("p, small, div, span, footer")
    # Walk backwards so that elements near the end of the page are tried first.
    for node in reversed(candidates):
        text = node.get_text(" ", strip=True)
        if text and any(re.search(pattern, text, flags=re.I) for pattern in unit_patterns):
            return text
    return ""

def period_label_from_type(type_: str, periode: str) -> str:
    """Build the short period label for a report type and period number.

    Args:
        type_: Report type; only its lower-cased prefix is inspected
            ("trimes..." or "semest...").
        periode: Period number as a string.

    Returns:
        ``"T<periode>"`` for quarterly types, ``"S1"``/``"S2"`` for half-year
        types (anything other than ``"1"`` yields ``"S2"``), ``"An"`` otherwise.
    """
    kind = type_.lower()
    if kind.startswith("trimes"):
        label = f"T{periode}"
    elif kind.startswith("semest"):
        label = "S2" if periode != "1" else "S1"
    else:
        label = "An"
    return label

def period_label_from_text(txt: str) -> str:
    """Guess the period label (T1..T4, S1/S2 or "An") from a free-form title.

    Quarter markers are looked for first, then half-year markers; when neither
    is found the label falls back to ``"An"``.

    Recognised forms (case-insensitive): ``T1``, ``trimestre 1``,
    ``1er trimestre``, ``3e``/``3eme``/``3ème trimestre`` and the equivalent
    ``S1`` / ``semestre 1`` / ``2ème semestre`` spellings.

    Args:
        txt: Free-form label; ``None`` is treated as an empty string.

    Returns:
        ``"T1"``..``"T4"``, ``"S1"``, ``"S2"`` or ``"An"``.
    """
    s = (txt or "").lower()

    # Optional French ordinal suffix right after the digit:
    # 1er, 1ere/1ère, 2e, 2eme/2ème, 2ieme/2ième, 2nd/2nde (and a lone "r",
    # which the previous `e?r?` pattern also accepted).
    ordinal = r"(?:i?[e\u00e8]me|[e\u00e8]re|er|nde?|e|r)?"

    quarter = re.search(
        r"\bt\s*([1-4])\b"
        r"|\btrimes?tre\s*([1-4])\b"
        rf"|\b([1-4])\s*{ordinal}\s*trimes?tre\b",
        s,
    )
    if quarter:
        # Exactly one alternative matched, hence exactly one non-empty group.
        return "T" + next(g for g in quarter.groups() if g)

    half = re.search(
        # Trailing \b keeps the first digit of a year ("semestre 2024")
        # from being taken for the semester number.
        r"semestre\s*([12])\b"
        rf"|\b([12])\s*{ordinal}\s*semestre\b"
        r"|\bs([12])\b",
        s,
    )
    if half:
        return "S" + next(g for g in half.groups() if g)

    return "An"

def firstcol_contains_year(text: str, year: int) -> bool:
    """Tell whether the decimal form of ``year`` occurs anywhere in ``text``.

    A falsy ``text`` (``None`` or ``""``) is searched as an empty string.
    """
    needle = str(year)
    haystack = text if text else ""
    return needle in haystack

def extract_headers_and_rows(table) -> tuple[list[str], list[list[str]]]:
    """Extract header labels and data rows from a parsed HTML table.

    Headers come from the ``<th>`` cells of ``<thead>`` when present, otherwise
    from the ``<td>``/``<th>`` cells of the first ``<tr>``. Rows are the text of
    the ``<td>`` cells of every ``<tr>`` that has at least one ``<td>``; note
    that a first row made of ``<td>`` cells therefore shows up both as headers
    and as a row.

    Args:
        table: Table element exposing ``find`` / ``find_all`` (BeautifulSoup tag).

    Returns:
        ``(headers, rows)``; ``([], [])`` for a table that has neither a
        ``<thead>`` nor any ``<tr>``.
    """
    thead = table.find("thead")
    if thead:
        header_cells = thead.find_all("th")
    else:
        first_tr = table.find("tr")
        if first_tr is None:
            # Empty table: nothing to extract (previously an AttributeError).
            return [], []
        header_cells = first_tr.find_all(["td", "th"])
    headers = [cell.get_text(" ", strip=True) for cell in header_cells]

    rows = []
    for tr in table.find_all("tr"):
        cells = [td.get_text(" ", strip=True) for td in tr.find_all("td")]
        if cells:
            rows.append(cells)
    return headers, rows

# --------- Scrapers ---------
async def scrape_report(page, ticker: str, type_: str, periode: str) -> pd.DataFrame:
    """Scrape the main table of an annual / half-year / quarterly report page.

    Args:
        page: Playwright page (async API) used to load the URL.
        ticker: Ticker symbol inserted into ``BASE_URL``.
        type_: Report type inserted into the URL; its prefix also decides the
            normalised type ("Annuel", "Semestriel", otherwise "Trimestriel").
        periode: Period number inserted into the URL, as a string.

    Returns:
        One row per ``<tbody>`` row, cells cleaned with ``clean_cell``, with the
        five metadata columns listed in ``front`` placed first. An empty
        DataFrame is returned when no ``table.table`` shows up within 8 s or
        when the table has no usable header/body.
    """
    url = BASE_URL.format(ticker=ticker, type_=type_, periode=periode)
    await page.goto(url, timeout=60000)

    try:
        await page.wait_for_selector("table.table", timeout=8000)
    except Exception:
        # Missing table is treated as "no report for this period".
        # `Exception` (not a bare except) lets task cancellation and
        # KeyboardInterrupt propagate.
        return pd.DataFrame()

    html_page = await page.content()
    soup_page = BeautifulSoup(html_page, "html.parser")
    table = soup_page.select_one("table.table")
    if table is None:
        return pd.DataFrame()

    body = table.find("tbody")

    # Headers: <thead> when available, otherwise the first body row.
    thead = table.find("thead")
    if thead:
        headers = [th.get_text(strip=True) for th in thead.find_all("th")]
    else:
        first_tr = body.find("tr") if body else None
        if first_tr is None:
            # Neither a header section nor a body row: nothing to scrape
            # (previously an AttributeError on None).
            return pd.DataFrame()
        headers = [td.get_text(strip=True) for td in first_tr.find_all("td")]

    if "P√©riode" not in headers:
        headers = ["P√©riode"] + headers

    # Rows
    rows = []
    if body:
        for tr in body.find_all("tr"):
            tds = [td.get_text(" ", strip=True) for td in tr.find_all("td")]
            if not tds:
                continue
            if len(tds) == len(headers) - 1 and headers[0] == "P√©riode":
                # Safety net: pad the row when the period cell is not rendered.
                tds = [""] + tds
            row = {headers[i]: clean_cell(tds[i]) for i in range(min(len(headers), len(tds)))}
            rows.append(row)

    df = pd.DataFrame(rows)
    if df.empty:
        return df

    # Unit phrase found on the page
    unit_phrase = detect_unit_footer(soup_page)

    # Extra metadata columns
    type_lower = type_.lower()
    if type_lower.startswith("annuel"):
        type_norm = "Annuel"
    elif type_lower.startswith("semest"):
        type_norm = "Semestriel"
    else:
        type_norm = "Trimestriel"
    df["Ticker"] = ticker
    df["Type"] = type_norm
    df["P√©riode_d√©tail"] = period_label_from_type(type_norm, periode)
    df["en XOF"] = unit_phrase

    # Column order: metadata first, scraped columns after
    front = ["Ticker", "Type", "P√©riode_d√©tail", "P√©riode", "en XOF"]
    others = [c for c in df.columns if c not in front]
    return df[front + others]

async def scrape_rapports_autres(page, ticker: str) -> pd.DataFrame:
    """Scrape the "rapports-autres" page of a ticker.

    - Keeps only the rows whose first cell contains the current year.
    - Guesses the period detail (S1/S2/T1..T4/An) from the first cell's text.
    - Sets the period column to the current year.
    - Takes the unit phrase from the element following the first table, with
      ``detect_unit_footer`` as a page-wide fallback.

    Args:
        page: Playwright page (async API) used to load the URL.
        ticker: Ticker symbol inserted into ``AUTRES_URL``.

    Returns:
        The collected rows with the five metadata columns listed in ``front``
        placed first, or an empty DataFrame when no table appears within 8 s
        or no row matches the current year.
    """
    url = AUTRES_URL.format(ticker=ticker)
    await page.goto(url, timeout=60000)

    try:
        await page.wait_for_selector("table", timeout=8000)
    except Exception:
        # No table on the page. `Exception` rather than a bare except so that
        # task cancellation and KeyboardInterrupt are not swallowed.
        return pd.DataFrame()

    html = await page.content()
    soup = BeautifulSoup(html, "html.parser")
    tables = soup.select("table")
    if not tables:
        return pd.DataFrame()

    current_year = datetime.now().year

    # Unit: first the element following the first table, else global fallback
    unit_phrase = ""
    first_table = tables[0]
    sib = first_table.find_next_sibling()
    while sib and sib.name in ["table","script","style"]:
        sib = sib.find_next_sibling()
    if sib:
        txt = sib.get_text(" ", strip=True)
        if re.search(r"(donn[√©e]es?|en)\s+.*(xof|francs?\s*cfa|milliers)", txt, flags=re.I):
            unit_phrase = txt
    if not unit_phrase:
        unit_phrase = detect_unit_footer(soup)

    collected = []
    for table in tables:
        headers, rows = extract_headers_and_rows(table)
        if not headers or not rows:
            continue

        if "P√©riode" not in headers:
            headers = ["P√©riode"] + headers

        for r in rows:
            # Pad the row when it is one cell short of the headers
            if len(r) == len(headers) - 1:
                r = [""] + r
            if not r:
                continue
            first_cell_raw = r[0]
            if not firstcol_contains_year(first_cell_raw, current_year):
                continue

            row_dict = {headers[i]: clean_cell(r[i]) for i in range(min(len(headers), len(r)))}
            # Metadata of the final format (overrides any scraped period cell)
            row_dict["Ticker"] = ticker
            row_dict["Type"] = "Autres"
            row_dict["P√©riode_d√©tail"] = period_label_from_text(first_cell_raw)
            row_dict["P√©riode"] = str(current_year)
            row_dict["en XOF"] = unit_phrase
            collected.append(row_dict)

    if not collected:
        return pd.DataFrame()

    df = pd.DataFrame(collected)
    front = ["Ticker", "Type", "P√©riode_d√©tail", "P√©riode", "en XOF"]
    others = [c for c in df.columns if c not in front]
    return df[front + others]

# --------- Orchestrateur ---------
async def main(output_csv: str = "rapport_brvm_complet.csv", headless: bool = True):
    """Scrape every ticker in ``TICKERS`` and write the combined table to CSV.

    For each ticker the annual page, both half-year pages, the four quarterly
    pages and the "rapports-autres" page are scraped in that order. A ticker
    for which every page came back empty contributes a single placeholder row.

    Args:
        output_csv: Path of the CSV file to write.
        headless: Passed to ``chromium.launch``.

    Returns:
        The combined DataFrame (empty when nothing at all was collected, in
        which case no file is written).
    """
    import asyncio  # function-scope import: only needed for the pause below

    all_parts = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            try:
                context = await browser.new_context(storage_state="cookies.json")
            except Exception:
                # Stored session could not be loaded: use a fresh context.
                context = await browser.new_context()
            page = await context.new_page()

            for ticker in TICKERS:
                print(f"üìä {ticker} ...")
                dfs = []
                # Annual
                dfs.append(await scrape_report(page, ticker, "Annuel", "1"))
                # Half-year
                dfs.append(await scrape_report(page, ticker, "Semestriel", "1"))
                dfs.append(await scrape_report(page, ticker, "Semestriel", "2"))
                # Quarterly
                for i in range(1, 4+1):
                    dfs.append(await scrape_report(page, ticker, "Trimestriel", str(i)))
                # "Other" reports (filtered on the current year)
                dfs.append(await scrape_rapports_autres(page, ticker))

                # Filter BEFORE concatenating: pd.concat raises ValueError on
                # an empty list, which made the placeholder branch unreachable.
                non_empty = [d for d in dfs if isinstance(d, pd.DataFrame) and not d.empty]
                if non_empty:
                    all_parts.append(pd.concat(non_empty, ignore_index=True))
                else:
                    all_parts.append(pd.DataFrame([{
                        "Ticker": ticker, "Type": "Aucune donn√©e",
                        "P√©riode_d√©tail": "", "P√©riode": "", "en XOF": ""
                    }]))

                # Randomised pause between tickers; asyncio.sleep yields to the
                # event loop instead of blocking it like time.sleep does.
                await asyncio.sleep(0.5 + random.random()*0.7)
        finally:
            # Close the browser even when a scrape raises.
            await browser.close()

    df_final = pd.concat(all_parts, ignore_index=True) if all_parts else pd.DataFrame()
    if df_final.empty:
        print("‚ö†Ô∏è Aucun rapport r√©cup√©r√©")
        return df_final

    first_cols = ["Ticker", "Type", "P√©riode_d√©tail", "P√©riode", "en XOF"]
    df_final = df_final[first_cols + [c for c in df_final.columns if c not in first_cols]]
    df_final.to_csv(output_csv, index=False)
    print(f"‚úÖ Fichier {output_csv} g√©n√©r√© ({len(df_final)} lignes)")
    return df_final

# --- Run inside Jupyter: the coroutine is awaited directly at cell top level ---
# headless=False is forwarded to the Chromium launch in main().
df = await main(headless=False)
# df.head()


üìä ABJC ...
üìä BICB ...
üìä BICC ...
üìä BNBC ...
üìä BOAB ...
üìä BOABF ...
üìä BOAC ...
üìä BOAM ...
üìä BOAN ...
üìä BOAS ...
üìä CABC ...
üìä CBIBF ...
üìä CFAC ...
üìä CIEC ...
üìä ECOC ...
üìä ETIT ...
üìä FTSC ...
üìä LNBB ...
üìä NEIC ...
üìä NSBC ...
üìä NTLC ...
üìä ONTBF ...
üìä ORAC ...
üìä ORGT ...
üìä PALC ...
üìä PRSC ...
üìä SAFC ...
üìä SCRC ...
üìä SDCC ...
üìä SDSC ...
üìä SEMC ...
üìä SGBC ...
üìä SHEC ...
üìä SIBC ...
üìä SICC ...
üìä SIVC ...
üìä SLBC ...
üìä SMBC ...
üìä SNTS ...
üìä SOGC ...
üìä SPHC ...
üìä STAC ...
üìä STBC ...
üìä SVOC ...
üìä TTLC ...
üìä TTLS ...
üìä UNLC ...
üìä UNXC ...
‚úÖ Fichier rapport_brvm_complet.csv g√©n√©r√© (1585 lignes)


In [3]:
import pandas as pd
import numpy as np



# Load the file written by the scraping cell (default output name of main())
df = pd.read_csv("rapport_brvm_complet.csv")

# --- Normalise amounts to plain XOF (same logic as before, applied vectorised) ---
def get_multiplier(unit: str) -> int:
    """Return the factor that converts an amount expressed in ``unit`` to XOF.

    Args:
        unit: Unit phrase; a missing value (NaN/None) means "no scaling".

    Returns:
        1_000 when the lower-cased phrase contains "milliers", 1_000_000 when
        it contains "millions", 1 otherwise.
    """
    if pd.isna(unit):
        return 1
    phrase = str(unit).lower()
    # Tuple order preserves the precedence of the checks: "milliers" first.
    for keyword, factor in (("milliers", 1_000), ("millions", 1_000_000)):
        if keyword in phrase:
            return factor
    return 1

# Numeric columns to rescale (from the 6th column to the end, selected by position)
num_cols = df.columns[5:]

# Per-row multiplier derived from the unit phrase
factors = df["en XOF"].map(get_multiplier)

# Safe conversion: "-" becomes NaN, non-numeric text is coerced to NaN,
# then each value is multiplied by its row's factor (aligned on the index)
df[num_cols] = df[num_cols].apply(
    lambda s: pd.to_numeric(s.replace("-", np.nan), errors="coerce") * factors
)

# Units are now uniform
df["en XOF"] = "XOF"

# --- Requested operations ---
# 1) Drop the "Type" column if it exists
if "Type" in df.columns:
    df = df.drop(columns=["Type"])

# 2) Period column -> a 4-digit year taken from its last 4 characters (nullable integer)
if "P√©riode" in df.columns:
    annee = (
        df["P√©riode"].astype(str).str[-4:].str.extract(r"(\d{4})")[0].astype("Int64")
    )
    df["P√©riode"] = annee

# --- Deduplication on (ticker, period detail, period), keeping the most complete row ---
subset = ["Ticker", "P√©riode_d√©tail", "P√©riode"]
missing_subset = [c for c in subset if c not in df.columns]
if missing_subset:
    raise KeyError(f"Colonnes manquantes pour la d√©duplication: {missing_subset}")

# Count the "information" present per row (field not blank, not '-', not NaN).
# df itself is untouched: the scoring works on a normalised copy.
tmp = df.replace(
    to_replace=[r"^\s*$", r"^-+$", r"(?i)^nan$"],
    value=pd.NA,
    regex=True
)

# Overall completeness score
score_all = tmp.notna().sum(axis=1)

# Numeric-only score (used as the tie-breaker)
numeric_cols = [c for c in df.columns if c in num_cols]
score_num = tmp[numeric_cols].notna().sum(axis=1) if numeric_cols else pd.Series(0, index=df.index)

# Sort keys used to pick the "best" row of each group
df["__score_all__"] = score_all
df["__score_num__"] = score_num

# Flag rows belonging to a group of at least 2 rows, and count those groups
dupe_groups = df.duplicated(subset=subset, keep=False)
nb_groups = (
    df.loc[dupe_groups]
      .drop_duplicates(subset=subset)
      .shape[0]
)

print(f"üîé Groupes en doublon sur {tuple(subset)} : {nb_groups}")

# For each group keep the row with the highest overall score (numeric score as tie-break):
# sort by both scores descending, then take the first row of every group.
df_sorted = df.sort_values(by=["__score_all__", "__score_num__"], ascending=False)
best_idx = df_sorted.groupby(subset, dropna=False, as_index=False).head(1).index

# Keep the "best" rows plus all rows that were never duplicated.
# NOTE(review): best_idx is taken over every group, so the non-duplicated rows
# appear in both pieces; the drop_duplicates() below removes the exact copies.
df_dedup = pd.concat([
    df.loc[~dupe_groups],     # rows that are unique on the subset
    df.loc[best_idx]          # best row of each group
], ignore_index=True)

# Drop the technical score columns, then remove exact duplicate rows
df_dedup = df_dedup.drop(columns=["__score_all__", "__score_num__"], errors="ignore")
df_dedup = df_dedup.drop_duplicates().reset_index(drop=True)

# CSV export
df_dedup.to_csv("rapport_brvm_complet_uniformise.csv", index=False)
# Excel export
df_dedup.to_excel("rapport_brvm_complet_uniformise.xlsx", index=False)

print("‚úÖ Doublons (Ticker, P√©riode_d√©tail, P√©riode) r√©solus en gardant la ligne la plus compl√®te.")
print("‚úÖ Export r√©alis√© dans rapport_brvm_complet_uniformise.csv")


üîé Groupes en doublon sur ('Ticker', 'P√©riode_d√©tail', 'P√©riode') : 49
‚úÖ Doublons (Ticker, P√©riode_d√©tail, P√©riode) r√©solus en gardant la ligne la plus compl√®te.
‚úÖ Export r√©alis√© dans rapport_brvm_complet_uniformise.csv


In [4]:
import pandas as pd
import numpy as np

# Load the uniformised CSV
df = pd.read_csv("rapport_brvm_complet_uniformise.csv")

# Columns needed downstream
colonnes_utiles = [
    "Ticker", "P√©riode_d√©tail", "P√©riode",
    "Chiffre d'Affaires", "Produit net bancaire",
    "Total Bilan",
    "R√©sultat d'Exploitation", "R√©sultat Brut d'Exploitation",
    "R√©sultat Net"
]
# .copy(): the column assignments below must write to an independent frame,
# not to a selection of the loaded one (avoids SettingWithCopyWarning).
df = df[colonnes_utiles].copy()

# --- Banks listed on the BRVM (ETIT included) ---
banques = [
    "BICB","BICC","BOAB","BOABF","BOAC","BOAM","BOAN","BOAS",
    "CBIBF","ECOC","NSBC","SGBC","SIBC","ETIT"
]

# --- Computed columns ---
# Banks use their banking-specific indicator, every other ticker the general one.
est_banque = df["Ticker"].isin(banques)

df["CA_PNB"] = np.where(
    est_banque,
    df["Produit net bancaire"],
    df["Chiffre d'Affaires"]
)

df["RE_RBE"] = np.where(
    est_banque,
    df["R√©sultat Brut d'Exploitation"],
    df["R√©sultat d'Exploitation"]
)

# Empty equity column, left unfilled here
df["Capitaux propres"] = None

# Final column selection
df_final = df[[
    "Ticker", "P√©riode_d√©tail", "P√©riode",
    "CA_PNB", "Total Bilan", "RE_RBE", "R√©sultat Net", "Capitaux propres"
]]

# Export
df_final.to_csv("rapport_brvm_complet_uniformise2.csv", index=False)

print("‚úÖ Fichier pr√™t : rapport_brvm_complet_uniformise2.csv")


‚úÖ Fichier pr√™t : rapport_brvm_complet_uniformise2.csv


In [5]:
import re
import pandas as pd
import numpy as np
from itertools import product

# Input / output files
SRC = "rapport_brvm_complet_uniformise2.csv"
DST = "rapport_brvm_complet_uniformise3.csv"

# Variables that will be pivoted into per-ticker columns
vars_keep = ["CA_PNB", "Total Bilan", "RE_RBE", "R√©sultat Net", "Capitaux propres"]

# Load
df = pd.read_csv(SRC)

# Sanity check: every key and variable column must be present
cols_needed = ["Ticker", "P√©riode_d√©tail", "P√©riode", *vars_keep]
missing = [name for name in cols_needed if name not in df.columns]
if len(missing) > 0:
    raise ValueError(f"Colonnes manquantes dans {SRC}: {missing}")

# Deduplicate: sort on the key, then keep the first row of each key
dedupe_keys = ["P√©riode_d√©tail", "P√©riode", "Ticker"]
df = df.sort_values(dedupe_keys)
df = df.drop_duplicates(subset=dedupe_keys, keep="first")

# Normalise the period value to a numeric year
def extract_year(x):
    """Return the first run of four digits in ``str(x)`` as an int, else NaN."""
    found = re.findall(r"\d{4}", str(x))
    if not found:
        return np.nan
    return int(found[0])

# Numeric year of each row (NaN when no 4-digit run is found)
df["__Year"] = df["P√©riode"].apply(extract_year)

# Year range: 1998 -> largest detected year (1998 when no year could be detected)
year_min = 1998
year_max = int(np.nanmax(df["__Year"])) if df["__Year"].notna().any() else 1998
years = list(range(year_min, year_max + 1))

# Desired order: T1..T4, S1..S2, then An
ordre_pd = ["T1","T2","T3","T4","S1","S2","An"]
# NOTE(review): this column is not carried into `base`, which keeps only vars_keep.
df["__PeriodeOrd"] = pd.Categorical(df["P√©riode_d√©tail"], categories=ordre_pd, ordered=True)

# Tickers present in the data, sorted
tickers = sorted(df["Ticker"].unique())

# Complete index: every (period detail, year, ticker) combination
full_index = pd.MultiIndex.from_tuples(
    list(product(ordre_pd, years, tickers)),
    names=["P√©riode_d√©tail", "__Year", "Ticker"]
)

# Reindex so that missing combinations appear as NaN rows
base = df.set_index(["P√©riode_d√©tail","__Year","Ticker"])
base = base[vars_keep].reindex(full_index)

# Rebuild the period column (as text) from the year
base = base.reset_index()
base["P√©riode"] = base["__Year"].astype("Int64").astype(str)

# Wide pivot: one "<ticker>_<variable>" column per pair
base = base.set_index(["P√©riode_d√©tail","P√©riode","Ticker"])
pieces = []
for v in vars_keep:
    w = base[v].unstack("Ticker")      # columns = tickers
    w = w.add_suffix(f"_{v}")          # "ABJC_CA_PNB", etc.
    pieces.append(w)

# Side-by-side concatenation of the per-variable tables, keys back as columns
wide = pd.concat(pieces, axis=1).reset_index()

# Sort helper: ascending year, then the custom T -> S -> An order (applied below)
def extract_year_safe(x):
    """Return the first run of four digits in ``str(x)`` as an int, else NaN.

    The "no match" case is tested explicitly instead of relying on a bare
    ``except``, which would also hide unrelated errors and interrupts.
    """
    match = re.search(r"\d{4}", str(x))
    if match is None:
        return np.nan
    return int(match.group(0))

# Temporary sort keys: numeric year and ordered period category
helper_cols = ["__Year", "__PeriodeOrd"]
wide = wide.assign(**{
    "__Year": wide["P√©riode"].apply(extract_year_safe),
    "__PeriodeOrd": pd.Categorical(wide["P√©riode_d√©tail"], categories=ordre_pd, ordered=True),
})
sort_keys = ["__Year", "__PeriodeOrd", "P√©riode_d√©tail", "P√©riode"]
wide = wide.sort_values(sort_keys, na_position="last")

# Column order: the two key columns, then the data columns grouped by ticker
# (variables in vars_keep order inside each ticker), then whatever is left.
fixed_prefix = ["P√©riode_d√©tail", "P√©riode"]
present = set(wide.columns)
candidate = [f"{t}_{v}" for t, v in product(tickers, vars_keep)]
ordered_data_cols = [name for name in candidate if name in present]  # skips absent columns
already_placed = set(fixed_prefix) | set(ordered_data_cols) | set(helper_cols)
other_cols = [name for name in wide.columns if name not in already_placed]
wide = wide[fixed_prefix + ordered_data_cols + other_cols]

# Cleanup of the temporary keys (no-op if they are already gone)
wide = wide.drop(columns=helper_cols, errors="ignore")

# Export
wide.to_csv(DST, index=False)
print(f"‚úÖ Pivot compl√©t√© & tri√© (T‚ÜíS‚ÜíAn) et colonnes group√©es par ticker export√© : {DST}")


‚úÖ Pivot compl√©t√© & tri√© (T‚ÜíS‚ÜíAn) et colonnes group√©es par ticker export√© : rapport_brvm_complet_uniformise3.csv


In [6]:
import pandas as pd

SRC = "rapport_brvm_complet_uniformise3.csv"

AN_DST = "rapport_brvm_Annuel.csv"
TRI_DST = "rapport_brvm_Trimestriel.csv"
SEM_DST = "rapport_brvm_Semestriel.csv"

XLSX_DST = "rapport_brvm_decoupe.xlsx"

# Load the wide pivot table
df = pd.read_csv(SRC)

# The split below relies on this column
assert "P√©riode_d√©tail" in df.columns, "Colonne 'P√©riode_d√©tail' manquante dans le fichier source."

# One independent frame per reporting frequency
detail = df["P√©riode_d√©tail"]
annuel = df.loc[detail.eq("An")].copy()
trimestriel = df.loc[detail.isin(["T1", "T2", "T3", "T4"])].copy()
semestriel = df.loc[detail.isin(["S1", "S2"])].copy()

# (sheet name, frame, CSV path) for each output, in export order
exports = (
    ("Annuel", annuel, AN_DST),
    ("Trimestriel", trimestriel, TRI_DST),
    ("Semestriel", semestriel, SEM_DST),
)

# CSV exports
for sheet, frame, csv_path in exports:
    frame.to_csv(csv_path, index=False)

# Excel export: one workbook with three sheets
with pd.ExcelWriter(XLSX_DST, engine="openpyxl") as writer:
    for sheet, frame, csv_path in exports:
        frame.to_excel(writer, sheet_name=sheet, index=False)

print("‚úÖ Fichiers cr√©√©s :")
for csv_path in (AN_DST, TRI_DST, SEM_DST):
    print(f"- {csv_path}")
print(f"- {XLSX_DST} (avec 3 onglets)")


‚úÖ Fichiers cr√©√©s :
- rapport_brvm_Annuel.csv
- rapport_brvm_Trimestriel.csv
- rapport_brvm_Semestriel.csv
- rapport_brvm_decoupe.xlsx (avec 3 onglets)
