# In [None]:  (Jupyter notebook cell marker left over from the export)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import os
import time
import math
import json
import traceback
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup
import pandas as pd

# -----------------------------
# Parameters
# -----------------------------

# Symbols to scrape; each one is substituted into BASE_URL to build a page URL.
# NOTE(review): presumably the tickers listed on the BRVM exchange - confirm.
TICKERS = [
    "ABJC","BICB","BICC","BNBC","BOAB","BOABF","BOAC","BOAM","BOAN","BOAS",
    "CABC","CBIBF","CFAC","CIEC","ECOC","ETIT","FTSC","LNBB","NEIC","NSBC",
    "NTLC","ONTBF","ORAC","ORGT","PALC","PRSC","SAFC","SCRC","SDCC","SDSC",
    "SEMC","SGBC","SHEC","SIBC","SICC","SIVC","SLBC","SMBC","SNTS","SOGC",
    "SPHC","STAC","STBC","SVOC","TTLC","TTLS","UNLC","UNXC"
]

# URL template of a company detail page; "{ticker}" is filled in per symbol.
BASE_URL = "https://www.richbourse.com/common/apprendre/details-societe/{ticker}"

# Output files
OUT_CSV = "richbourse_societes.csv"
OUT_XLSX = "richbourse_societes.xlsx"

# Directory where downloaded logos are stored
LOGO_DIR = "logos_richbourse"

# Requested output columns, in the order they appear in the exported table
OUTPUT_COLUMNS = [
    "ticker",
    "Société",
    "Secteur d'activité",
    "Pays",
    "Introduction à la BRVM",
    "Nombre de titres",
    "Flottant",
    "Site Web",
    "Présentation",
    "Déterminants Sectoriel",
    "Logo_Path"
]

# -----------------------------
# Session HTTP robuste
# -----------------------------

def make_session() -> requests.Session:
    """
    Build the HTTP session used for all page and logo requests.

    The session sends a desktop-browser User-Agent and a French
    Accept-Language header, and mounts one HTTPAdapter configured with
    max_retries=3 for both the http:// and https:// prefixes.
    """
    browser_agent = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/126.0 Safari/537.36"
    )
    session = requests.Session()
    session.headers["User-Agent"] = browser_agent
    session.headers["Accept-Language"] = "fr-FR,fr;q=0.9"

    # The same adapter instance serves both URL schemes.
    retrying_adapter = requests.adapters.HTTPAdapter(max_retries=3)
    for prefix in ("http://", "https://"):
        session.mount(prefix, retrying_adapter)
    return session

# Module-level session, created once at import time and reused by the
# fetch helpers below (page download and logo download).
SESSION = make_session()

# -----------------------------
# Parsing utilities
# -----------------------------

# Field labels recognised on a page, mapped to the output column they fill.
# The mapping is currently the identity: each label is its own canonical name.
LABEL_KEYS = {
    "Société": "Société",
    "Secteur d'activité": "Secteur d'activité",
    "Pays": "Pays",
    "Introduction à la BRVM": "Introduction à la BRVM",
    "Nombre de titres": "Nombre de titres",
    "Flottant": "Flottant",
    "Site Web": "Site Web",
}

# Lower-cased label -> canonical name, so labels can be matched case-insensitively.
LABELS_LOWER = {k.lower(): v for k, v in LABEL_KEYS.items()}

# Matches text that consists solely of the word "Présentation" (any case).
HEADER_PRESENTATION_RE = re.compile(r"^\s*Présentation\s*$", re.IGNORECASE)
# Matches text starting with "Déterminant Sectoriel" / "Déterminants Sectoriel" (any case).
HEADER_DETERMINANTS_RE = re.compile(r"^\s*Déterminants?\s+Sectoriel", re.IGNORECASE)
# Patterns for the start of other section headers; passed as the stop
# patterns when the text of a section is collected.
HEADER_STOP_WORDS = [
    re.compile(r"^\s*Principaux actionnaires", re.IGNORECASE),
    re.compile(r"^\s*Actionnariat", re.IGNORECASE),
    re.compile(r"^\s*Contacts?", re.IGNORECASE),
]

def normalize_space(s: str) -> str:
    """Collapse every run of whitespace in ``s`` to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()

def extract_fields_block(soup: BeautifulSoup) -> Dict[str, str]:
    """
    Collect the "label : value" fields found in the page.

    Each <p>, <li> and <div> element is flattened to one line of text and cut
    at its first colon. When the part before the colon is one of the known
    labels (compared case-insensitively), the part after it is stored under
    the canonical label. When several elements carry the same label, the
    last one visited overwrites the earlier ones.

    Returns a dict {canonical_label: value}.
    """
    found = {}
    for element in soup.select("p, li, div"):
        flat = normalize_space(" ".join(element.stripped_strings))
        label, colon, rest = flat.partition(":")
        if not colon:
            # No colon at all: this element cannot be a labelled field.
            continue
        canonical = LABELS_LOWER.get(label.strip().lower())
        if canonical is None:
            continue
        # Common clean-up: squeeze any leftover whitespace runs in the value.
        found[canonical] = re.sub(r"\s{2,}", " ", rest.strip())
    return found

def _collect_text_until(next_node, stop_patterns: List[re.Pattern]) -> str:
    """
    Concatenate the text of ``next_node`` and its following siblings until a
    section header is reached.

    Args:
        next_node: first node to inspect; a falsy value (e.g. None) yields "".
        stop_patterns: compiled regexes identifying headers of other
            sections. The "Présentation" and "Déterminants Sectoriel" header
            patterns always act as boundaries in addition to these.

    Returns:
        The collected <p>/<div>/<li> texts joined with newlines, with
        consecutive duplicates removed; "" when nothing was collected.
    """
    # Honour the caller-supplied stop patterns instead of a hard-wired global.
    boundaries = [HEADER_PRESENTATION_RE, HEADER_DETERMINANTS_RE, *stop_patterns]
    parts = []
    node = next_node
    while node:
        if hasattr(node, "get_text"):
            t = normalize_space(node.get_text(separator=" ", strip=True))
            if any(p.search(t) for p in boundaries):
                # A header met after some text was gathered ends the section.
                # One met before any text is stepped over, so the walk does
                # not stop on the very header it started from.
                if parts:
                    break
            elif t and node.name in ("p", "div", "li"):
                # Keep only non-empty, paragraph-like nodes.
                parts.append(t)
        node = node.find_next_sibling()

    # Drop a line when it merely repeats the one just before it.
    uniq = []
    for part in parts:
        if not uniq or part != uniq[-1]:
            uniq.append(part)
    return "\n".join(uniq).strip()

def extract_section_text(soup: BeautifulSoup, header_regex: re.Pattern) -> Optional[str]:
    """
    Return the text that follows the section header matching ``header_regex``.

    The header is the first h1-h6, <strong> or <b> tag whose normalised text
    matches the regex. If there is none, the parent of the first text node
    matching the regex is used instead. The text of the siblings after the
    header is then collected up to the next relevant header.

    Returns None when no header is found or when the section has no text.
    """
    heading_names = ("h1", "h2", "h3", "h4", "h5", "h6", "strong", "b")

    # First choice: a conventional heading / emphasis tag.
    header = next(
        (
            tag
            for tag in soup.find_all(True)
            if tag.name in heading_names
            and header_regex.search(
                normalize_space(tag.get_text(separator=" ", strip=True))
            )
        ),
        None,
    )

    if not header:
        # Fallback: look for the raw text and use the element containing it.
        matching_string = soup.find(string=header_regex)
        if matching_string and hasattr(matching_string, "parent"):
            header = matching_string.parent
        else:
            header = None

    if not header:
        return None

    section = _collect_text_until(
        header.find_next_sibling(), stop_patterns=HEADER_STOP_WORDS
    )
    return section or None

def find_logo_url(
    soup: BeautifulSoup,
    base_url: str = "https://www.richbourse.com/",
) -> Optional[str]:
    """
    Try to locate a plausible company logo among the <img> tags of the page.

    Each image with a usable ``src`` is scored:
      * +2 if its src contains "logo";
      * +1 if its src contains "apprendre" or "societe";
      * +1 if its alt text contains "logo" or "soc";
      * +1 if its declared width/height are at least 80x40.
    The highest score wins; on a tie the earliest image in the page is kept,
    so a page whose images all score 0 yields its first image.

    Args:
        soup: parsed page.
        base_url: base against which non-absolute ``src`` values are resolved.

    Returns:
        The absolute URL of the chosen image, or None if the page has no
        usable <img>.
    """
    # Function-scope import so the block does not depend on a file-level import.
    from urllib.parse import urljoin

    best_score = -1
    best_url = None
    for img in soup.find_all("img"):
        src = (img.get("src") or "").strip()
        # Inline data: URIs cannot be fetched over HTTP, so they are skipped.
        if not src or src.lower().startswith("data:"):
            continue

        # Heuristics are evaluated on the src as written in the page, so the
        # base URL can never contribute a keyword.
        s_low = src.lower()
        alt = (img.get("alt") or "").lower()
        score = 0
        if "logo" in s_low:
            score += 2
        if "apprendre" in s_low or "societe" in s_low:
            score += 1
        if "logo" in alt or "soc" in alt:
            score += 1

        # Declared dimensions, when present and numeric.
        try:
            w = int(img.get("width") or 0)
            h = int(img.get("height") or 0)
        except (TypeError, ValueError):
            # e.g. "100%" or "80px": no dimension bonus.
            w = h = 0
        if w >= 80 and h >= 40:
            score += 1

        if score > best_score:
            best_score = score
            # Resolves "//host/x", "/x" and path-relative "x" alike; absolute
            # URLs come back unchanged.
            best_url = urljoin(base_url, src)
    return best_url

def download_logo(url: str, ticker: str, out_dir: str = LOGO_DIR) -> Optional[str]:
    """
    Download the image at ``url`` and save it as ``<out_dir>/<ticker><ext>``.

    The extension is chosen from the response Content-Type (jpg, svg, webp,
    gif) and defaults to ".png" when none of these is recognised.

    Args:
        url: address of the image.
        ticker: symbol used as the file name (without extension).
        out_dir: destination directory, created if it does not exist.

    Returns:
        The path of the written file, or None when the download or the write
        failed. The download is best-effort: the failure is printed, not raised.
    """
    os.makedirs(out_dir, exist_ok=True)
    # Content-Type marker -> file extension, checked in this order.
    known_types = (
        ("jpeg", ".jpg"),
        ("jpg", ".jpg"),
        ("svg", ".svg"),
        ("webp", ".webp"),
        ("gif", ".gif"),
    )
    try:
        r = SESSION.get(url, timeout=20)
        r.raise_for_status()
        # Header values are compared lower-cased so "image/JPEG" is recognised too.
        ctype = r.headers.get("Content-Type", "").lower()
        ext = next(
            (suffix for marker, suffix in known_types if marker in ctype),
            ".png",
        )
        path = os.path.join(out_dir, f"{ticker}{ext}")
        with open(path, "wb") as f:
            f.write(r.content)
    except Exception as e:
        # Report the reason instead of failing silently; a missing logo must
        # not abort the scrape of the company.
        print(f"[{ticker}] ERREUR logo ({url}): {e}")
        return None
    return path

# -----------------------------
# Scraper principal
# -----------------------------

def scrape_one(ticker: str) -> Dict[str, Optional[str]]:
    """
    Scrape the detail page of one company.

    Returns a dict with one entry per output column. Columns that could not
    be filled stay None. Any exception is printed with its traceback and the
    partially filled row is still returned.
    """
    row = dict.fromkeys(OUTPUT_COLUMNS)
    row["ticker"] = ticker
    page_url = BASE_URL.format(ticker=ticker)

    try:
        response = SESSION.get(page_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")

        # Simple "label : value" fields; labels that are not output columns are ignored.
        known_fields = {
            label: value
            for label, value in extract_fields_block(soup).items()
            if label in row
        }
        row.update(known_fields)

        # Free-text sections: presentation first, then the optional sector drivers.
        section_sources = (
            ("Présentation", HEADER_PRESENTATION_RE),
            ("Déterminants Sectoriel", HEADER_DETERMINANTS_RE),
        )
        for column, header_pattern in section_sources:
            section = extract_section_text(soup, header_pattern)
            if section:
                row[column] = section

        # Logo: recorded only when an image was both found and downloaded.
        logo_url = find_logo_url(soup)
        logo_path = download_logo(logo_url, ticker) if logo_url else None
        if logo_path:
            row["Logo_Path"] = logo_path

    except Exception as e:
        # Log to the console and keep whatever was filled in so far.
        print(f"[{ticker}] ERREUR: {e}")
        traceback.print_exc()

    return row

def scrape_all(tickers: List[str]) -> pd.DataFrame:
    """
    Scrape every ticker in order and return the rows as a DataFrame.

    Progress is printed before each ticker, and the loop sleeps 0.7 s after
    each one to stay polite with the site. The DataFrame columns follow
    OUTPUT_COLUMNS.
    """
    total = len(tickers)
    records = []
    for position, symbol in enumerate(tickers, start=1):
        print(f"[{position}/{total}] {symbol} ...")
        records.append(scrape_one(symbol))
        # Short pause between requests.
        time.sleep(0.7)
    return pd.DataFrame(records, columns=OUTPUT_COLUMNS)

# -----------------------------
# Main
# -----------------------------

def main():
    """
    Scrape all tickers, lightly clean two columns and write the result files.

    "Nombre de titres" is converted to an int and "Flottant" to a float where
    possible; values that cannot be converted are left as they are. The table
    is written to OUT_CSV and, when the Excel export succeeds, to OUT_XLSX.
    A summary of the produced outputs is printed at the end.
    """
    df = scrape_all(TICKERS)

    # Light clean-ups
    def to_int(x):
        """Return the digits of ``x`` as an int; missing or digit-less values pass through."""
        if pd.isna(x):
            return x
        # Dropping every non-digit also removes spaces and non-breaking
        # spaces used as thousands separators.
        digits = re.sub(r"\D", "", str(x))
        return int(digits) if digits else x

    df["Nombre de titres"] = df["Nombre de titres"].map(to_int)

    def to_pct(x):
        """Return the first number in ``x`` as a float (decimal comma accepted); else ``x``."""
        if pd.isna(x):
            return x
        m = re.search(r"(\d+(?:\.\d+)?)\s*%?", str(x).replace(",", "."))
        return float(m.group(1)) if m else x

    df["Flottant"] = df["Flottant"].map(to_pct)

    # Saves
    df.to_csv(OUT_CSV, index=False, encoding="utf-8")
    xlsx_written = True
    try:
        df.to_excel(OUT_XLSX, index=False)
    except Exception as e:
        # The Excel export is optional: report the failure and carry on.
        xlsx_written = False
        print(f"Impossible d'écrire l'XLSX (openpyxl non installé ?) : {e}")

    print("\nTerminé.")
    print(f"CSV : {OUT_CSV}")
    # Only announce the XLSX file when it was actually written.
    if xlsx_written:
        print(f"XLSX : {OUT_XLSX}")
    print(f"Logos : {os.path.abspath(LOGO_DIR)}")

# Run the scraper only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
