In [1]:
import time, re, sys, os
from typing import List, Dict
import requests
from bs4 import BeautifulSoup
import pandas as pd

# ------------ Parameters ------------
# Symbols to scrape; each one is substituted into BASE to build a page URL.
TICKERS = [
    "ABJC", "BICB", "BICC", "BNBC", "BOAB", "BOABF", "BOAC", "BOAM",
    "BOAN", "BOAS", "CABC", "CBIBF", "CFAC", "CIEC", "ECOC", "ETIT",
    "FTSC", "LNBB", "NEIC", "NSBC", "NTLC", "ONTBF", "ORAC", "ORGT",
    "PALC", "PRSC", "SAFC", "SCRC", "SDCC", "SDSC", "SEMC", "SGBC",
    "SHEC", "SIBC", "SICC", "SIVC", "SLBC", "SMBC", "SNTS", "SOGC",
    "SPHC", "STAC", "STBC", "SVOC", "TTLC", "TTLS", "UNLC", "UNXC",
]

# URL template of a per-ticker dividend page; filled with str.format(ticker=...).
BASE = "https://www.richbourse.com/common/dividende/index/{ticker}"

# HTTP headers sent with every request.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# CSV file that holds the accumulated dividend history.
OUT_CSV = "dividende_histo.csv"

# Column names, in order, of the scraped frame and of the exported CSV.
COLS = [
    "Ticker",
    "Exercice",
    "Symbole",
    "Nom",
    "Dividende",
    "Dividende ajusté",
    "Rendement",
    "Ex-dividende",
    "Date paiement",
    "Statut",
]

# ------------ Utils scraping ------------
def clean_num(txt: str) -> str:
    """Extract the first decimal number from a French-formatted text cell.

    Typographic spaces used as thousands separators and percent signs are
    removed, the decimal comma becomes a dot, and the first
    ``-?digits[.digits]`` match is returned.

    Args:
        txt: Raw cell text. ``None`` and the empty string are accepted.

    Returns:
        The number as a string (e.g. ``"1234.5"``), or ``""`` when the text
        holds no digits.
    """
    if not txt:
        return ""
    # Besides the plain and no-break spaces, also drop the figure, thin and
    # narrow no-break spaces; otherwise "1\u202f234,5" would be cut to "1".
    compact = re.sub(r"[ \u00a0\u2007\u2009\u202f]", "", txt.strip())
    compact = compact.replace("%", "").replace(",", ".")
    m = re.search(r"-?\d+(?:\.\d+)?", compact)
    return m.group(0) if m else ""

def fetch_html(url: str) -> "BeautifulSoup | None":
    """Download *url* and parse it, retrying on failure.

    Up to three GET requests are sent with the module-level ``HEADERS`` and a
    20-second timeout. Consecutive attempts are separated by a pause of
    0.5 s, then 1.0 s.

    Args:
        url: Address of the page to fetch.

    Returns:
        The document parsed with ``html.parser`` for the first HTTP 200
        response, or ``None`` when every attempt raised
        ``requests.RequestException`` or returned another status code.
    """
    # The return annotation is quoted so that defining the function does not
    # evaluate ``X | None`` at import time (unsupported before Python 3.10).
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            r = requests.get(url, headers=HEADERS, timeout=20)
        except requests.RequestException:
            # Best effort: a network error simply counts as a failed attempt.
            r = None
        if r is not None and r.status_code == 200:
            return BeautifulSoup(r.text, "html.parser")
        # Back off before the next try, but do not hold the caller up once
        # the last attempt has already failed.
        if attempt < max_attempts - 1:
            time.sleep(0.5 + attempt * 0.5)
    return None

def _find_column(headers: List[str], label: str) -> int:
    """Return the index of the header equal to (else starting with) *label*, or -1."""
    if label in headers:
        return headers.index(label)
    return next((i for i, h in enumerate(headers) if h.startswith(label)), -1)


def parse_page(soup: BeautifulSoup, ticker: str) -> List[Dict]:
    """Extract the dividend rows of one ticker page.

    The first ``<table>`` whose lower-cased header texts mention every
    required column is used. Each body row with at least as many ``<td>``
    cells as the table has ``<th>`` cells yields one record.

    Args:
        soup: Parsed page, as returned by ``fetch_html``.
        ticker: Symbol the page belongs to; copied into the "Ticker" and
            "Symbole" fields of every record.

    Returns:
        A list of dicts keyed like ``COLS``. "Dividende ajusté" repeats the
        "Dividende" value and "Statut" is always empty. The list is empty
        when no suitable table is found or a required column cannot be
        located.
    """
    required = ["société", "dividende", "rendement", "ex-dividende", "date paiement"]

    target = None
    headers: List[str] = []
    for tb in soup.find_all("table"):
        heads = [th.get_text(strip=True).lower() for th in tb.find_all("th")]
        joined = " ".join(heads)  # built once per table, not once per keyword
        if all(k in joined for k in required):
            target, headers = tb, heads
            break
    if target is None:
        return []

    # Detection above is a substring test, so a header such as
    # "rendement (%)" is accepted; resolve columns tolerantly instead of
    # letting list.index() raise ValueError and abort the whole scrape.
    idx_nom, idx_div, idx_rdt, idx_exd, idx_pay = (
        _find_column(headers, k) for k in required
    )
    if min(idx_nom, idx_div, idx_rdt, idx_exd, idx_pay) < 0:
        return []

    year_re = re.compile(r"\b(20\d{2}|19\d{2})\b")

    tbody = target.find("tbody")
    if tbody is None:
        tbody = target

    rows: List[Dict] = []
    for tr in tbody.find_all("tr"):
        tds = tr.find_all("td")
        # Skips header rows (no <td>) and short or colspan'd rows.
        if len(tds) < len(headers):
            continue
        nom = tds[idx_nom].get_text(strip=True)
        dividende = clean_num(tds[idx_div].get_text())
        rendement = clean_num(tds[idx_rdt].get_text())
        ex_div = tds[idx_exd].get_text(strip=True)
        date_pay = tds[idx_pay].get_text(strip=True)

        # Exercice = first 19xx/20xx year found in the payment date, if any.
        m = year_re.search(date_pay)
        exercice = m.group(1) if m else ""

        rows.append({
            "Ticker": ticker,
            "Exercice": exercice,
            "Symbole": ticker,
            "Nom": nom,
            "Dividende": dividende,
            "Dividende ajusté": dividende,
            "Rendement": rendement,
            "Ex-dividende": ex_div,
            "Date paiement": date_pay,
            "Statut": "",
        })
    return rows

def scrape_all(tickers: List[str]) -> pd.DataFrame:
    """Scrape every ticker page and return the rows as one sorted frame.

    Progress is printed to stdout and load failures to stderr. A page that
    cannot be fetched is skipped. A 0.25 s pause follows each page that was
    fetched successfully.

    Args:
        tickers: Symbols to scrape, in processing order.

    Returns:
        A DataFrame with the ``COLS`` columns, sorted by "Ticker" ascending
        and then by the numeric value of "Exercice" descending.
    """
    total = len(tickers)
    collected: List[Dict] = []
    for position, symbol in enumerate(tickers, start=1):
        page_url = BASE.format(ticker=symbol)
        print(f"[{position}/{total}] {symbol} → {page_url}")
        page = fetch_html(page_url)
        if page:
            collected.extend(parse_page(page, symbol))
            time.sleep(0.25)
        else:
            print(f"⚠️  échec chargement {symbol}", file=sys.stderr)

    frame = pd.DataFrame(collected, columns=COLS)
    # Order for reading: a temporary numeric year column drives the sort and
    # is dropped again before returning.
    year_key = pd.to_numeric(frame["Exercice"], errors="coerce")
    ordered = (
        frame.assign(Exercice_num=year_key)
        .sort_values(by=["Ticker", "Exercice_num"], ascending=[True, False])
        .drop(columns=["Exercice_num"])
    )
    return ordered

# ------------ Full pipeline ------------
# 1) Scrape the current dividend tables.
df_new = scrape_all(TICKERS)

# 2) Load the existing history when the file is present.
if os.path.exists(OUT_CSV):
    df_old = pd.read_csv(OUT_CSV, dtype=str)
    # Make sure every expected column exists and is in the expected order.
    for c in COLS:
        if c not in df_old.columns:
            df_old[c] = ""
    df_old = df_old[COLS]
else:
    df_old = pd.DataFrame(columns=COLS, dtype=str)

# 3) Stack old and new rows, then de-duplicate.
df_all = pd.concat([df_old, df_new], ignore_index=True)

# Normalise every column, not only the text ones: read_csv turns empty cells
# into NaN while freshly scraped rows carry "", and "Dividende" is part of
# the de-duplication key. Left as-is, a row with an empty dividend would be
# appended again on every run.
for c in COLS:
    df_all[c] = df_all[c].fillna("").astype(str).str.strip()

# A row is considered unique by:
# (Ticker, Exercice, Ex-dividende, Date paiement, Dividende)
# keep="last" lets the freshly scraped row win over the stored one.
df_all = df_all.drop_duplicates(
    subset=["Ticker","Exercice","Ex-dividende","Date paiement","Dividende"],
    keep="last"
)

# 4) Final sort: ticker ascending, year descending, then payment date.
# NOTE(review): "Date paiement" is compared as text, which is chronological
# only for ISO-like dates — confirm the site's date format.
df_all["Exercice_num"] = pd.to_numeric(df_all["Exercice"], errors="coerce")
df_all = df_all.sort_values(["Ticker","Exercice_num","Date paiement"], ascending=[True, False, True]).drop(columns=["Exercice_num"])
df_all = df_all[COLS]

# 5) Final export, overwriting the history file.
df_all.to_csv(OUT_CSV, index=False, encoding="utf-8")
print(f"✅ Export mis à jour: {OUT_CSV} ({len(df_all)} lignes)")


[1/48] ABJC → https://www.richbourse.com/common/dividende/index/ABJC
[2/48] BICB → https://www.richbourse.com/common/dividende/index/BICB
[3/48] BICC → https://www.richbourse.com/common/dividende/index/BICC
[4/48] BNBC → https://www.richbourse.com/common/dividende/index/BNBC
[5/48] BOAB → https://www.richbourse.com/common/dividende/index/BOAB
[6/48] BOABF → https://www.richbourse.com/common/dividende/index/BOABF
[7/48] BOAC → https://www.richbourse.com/common/dividende/index/BOAC
[8/48] BOAM → https://www.richbourse.com/common/dividende/index/BOAM
[9/48] BOAN → https://www.richbourse.com/common/dividende/index/BOAN
[10/48] BOAS → https://www.richbourse.com/common/dividende/index/BOAS
[11/48] CABC → https://www.richbourse.com/common/dividende/index/CABC
[12/48] CBIBF → https://www.richbourse.com/common/dividende/index/CBIBF
[13/48] CFAC → https://www.richbourse.com/common/dividende/index/CFAC
[14/48] CIEC → https://www.richbourse.com/common/dividende/index/CIEC
[15/48] ECOC → https://ww