In [1]:
import pandas as pd
import numpy as np
from fractions import Fraction
import re

# =======================
# FICHIERS
# =======================
INPUT_PRICES_CSV = "combined_journalier.csv"
OUTPUT_SHARES_DAILY = "recon_nb_actions_journalier.csv"
OUTPUT_EVENTS = "recap_changements_capitaux.csv"
OUTPUT_CURRENT_SHARES = "nb_actions_actuel.csv"
OUTPUT_ENRICHED = "combined_journalier_capi.csv"

# =======================
# PARAMÈTRES
# =======================
EPS_RATIO_VALID = 1e-12

# Détection et lissage des paliers (anti micro-écarts)
EVENT_CHANGE_THRESH = 0.008   # 0.8% pour détecter les "vrais" sauts (avant consolidation)
MIN_SEG_LEN = 3               # longueur minimale d'un segment (jours) avant fusion
MERGE_TOL = 0.004             # 0.4%: si 2 segments voisins ont des médianes trop proches -> fusion
Q_TOL = 0.006                 # 0.6%: tolérance pour arrondir un palier à un ratio p/q simple

# Typage Split vs Augmentation
CAP_TOL = 0.075               # 7.5%: cap ~inchangée vs changement significatif
WINDOW = 3                    # jours médiane avant/après pour la cap
RATIONAL_TOL = 0.005          # 0.5%
MAX_DENOM = 20

# =======================
# UTILS
# =======================
def normalize_cols(cols):
    return [c.strip().lower() for c in cols]

def normalize_ticker(x: str) -> str:
    if pd.isna(x):
        return x
    s = str(x).strip().upper()
    s = s.split()[0]                      # garde la 1ère partie avant espace
    s = re.split(r"[^A-Z0-9]+", s)[0]     # bloc alphanum initial
    return s

def find_col(df, targets):
    for cand in targets:
        if cand in df.columns:
            return cand
    return None

def approx_simple_ratio(x, tol=RATIONAL_TOL, max_den=MAX_DENOM):
    if not np.isfinite(x) or x <= 0:
        return (None, None, None, False)
    frac = Fraction(x).limit_denominator(max_den)
    x_hat = frac.numerator / frac.denominator
    rel_err = abs(x - x_hat) / x
    return (frac.numerator, frac.denominator, x_hat, rel_err <= tol)

# =======================
# TABLE NOMBRE D'ACTIONS ACTUEL
# =======================

df_current = base = pd.read_csv("richbourse_nb_actions_actuel.csv")
df_current["Ticker"] = df_current["Ticker"].apply(normalize_ticker)
df_current.to_csv(OUTPUT_CURRENT_SHARES, index=False)
print(f"✅ Écrit: {OUTPUT_CURRENT_SHARES} ({len(df_current)} tickers)")

# =======================
# CHARGER TABLE D'ORIGINE
# =======================
base = pd.read_csv(INPUT_PRICES_CSV)
base.columns = normalize_cols(base.columns)

col_map_candidates = {
    "ticker": ["ticker","symbole","code"],
    "date": ["date"],
    "cours_ajuste": ["cours ajuste","cours ajusté","close ajusté","px ajusté","prix ajusté","cours ajustee","cours_ajuste","cours_ajusté"],
    "cours_normal": ["cours normal","close","px normal","prix normal","cours_normal","prix"],
    "vol_adj": ["volume ajuste total","volume ajusté total","vol ajusté","volume ajusté","vol_ajuste","vol_ajusté","volume ajuste","volume ajustee"],
    "vol_norm": ["volume normal total","vol normal total","volume","volume normal","vol normal","vol_normal"],
}

col_ticker = find_col(base, col_map_candidates["ticker"])
col_date   = find_col(base, col_map_candidates["date"])
col_ca     = find_col(base, col_map_candidates["cours_ajuste"])
col_cn     = find_col(base, col_map_candidates["cours_normal"])
col_va     = find_col(base, col_map_candidates["vol_adj"])
col_vn     = find_col(base, col_map_candidates["vol_norm"])

required = [col_ticker, col_date, (col_ca or col_va), (col_cn or col_vn)]
if any(c is None for c in required):
    raise ValueError("Colonnes manquantes dans combined_journalier.csv (il faut au moins Ticker, Date, et l'un des couples prix/vol. ajusté/normal).")

df = base[[c for c in [col_ticker,col_date,col_ca,col_cn,col_va,col_vn] if c is not None]].copy()
rename_map = {}
if col_ticker: rename_map[col_ticker] = "Ticker"
if col_date:   rename_map[col_date]   = "Date"
if col_ca:     rename_map[col_ca]     = "Cours_Ajuste"
if col_cn:     rename_map[col_cn]     = "Cours_Normal"
if col_va:     rename_map[col_va]     = "Vol_Ajuste"
if col_vn:     rename_map[col_vn]     = "Vol_Normal"
df = df.rename(columns=rename_map)

df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
df = df.dropna(subset=["Date","Ticker"])
df["Ticker"] = df["Ticker"].apply(normalize_ticker)
df = df.sort_values(["Ticker","Date"])

for c in ["Cours_Ajuste","Cours_Normal","Vol_Ajuste","Vol_Normal"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# =======================
# RATIO JOUR: PRIX PRIORITAIRE, VOLUME EN SECOURS
# =======================
def compute_ratio_row(row):
    ca = row.get("Cours_Ajuste", np.nan)
    cn = row.get("Cours_Normal", np.nan)
    va = row.get("Vol_Ajuste", np.nan)
    vn = row.get("Vol_Normal", np.nan)

    r_price = np.nan
    r_vol = np.nan
    if pd.notna(ca) and pd.notna(cn) and abs(ca) > EPS_RATIO_VALID:
        r_price = cn / ca
    if pd.notna(va) and pd.notna(vn) and abs(va) > EPS_RATIO_VALID:
        r_vol = vn / va

    if pd.notna(r_price) and np.isfinite(r_price) and r_price > 0:
        return r_price
    if pd.notna(r_vol) and np.isfinite(r_vol) and r_vol > 0:
        return r_vol
    return np.nan

df["ratio_raw"] = df.apply(compute_ratio_row, axis=1)

# Agrégation par jour (médiane robuste si multi-lignes)
ratio_daily = (
    df.groupby(["Ticker","Date"], as_index=False)["ratio_raw"]
      .median()
      .rename(columns={"ratio_raw":"ratio"})
)

# =======================
# NORMALISATION SUR LE DERNIER JOUR
# =======================
def normalize_ratio(group):
    group = group.sort_values("Date")
    if group["ratio"].notna().any():
        last_ratio = group["ratio"].dropna().iloc[-1]
        if last_ratio and np.isfinite(last_ratio) and last_ratio > 0:
            group["ratio_norm"] = group["ratio"] / last_ratio
        else:
            group["ratio_norm"] = np.nan
    else:
        group["ratio_norm"] = np.nan
    group["ratio_norm"] = group["ratio_norm"].ffill().bfill()
    return group

ratio_daily = ratio_daily.groupby("Ticker", group_keys=False).apply(normalize_ratio)

# =======================
# SEGMENTATION + SNAPPING DES PALIERS (anti micro-écarts)
# =======================
def segment_and_snap(group):
    g = group.sort_values("Date").reset_index(drop=True).copy()
    r = g["ratio_norm"].values.astype(float)

    if np.all(pd.isna(r)):
        g["ratio_norm_snapped"] = np.nan
        return g

    # indices de rupture initiale
    change = [0]
    for i in range(1, len(r)):
        prev, cur = r[i-1], r[i]
        if pd.isna(prev) or pd.isna(cur) or prev <= 0:
            continue
        if abs(cur/prev - 1.0) > EVENT_CHANGE_THRESH:
            change.append(i)
    if change[-1] != len(r):
        change.append(len(r))

    # construire segments [start, end)
    segments = []
    for s, e in zip(change[:-1], change[1:]):
        segments.append((s, e))

    # fusionner segments trop courts ou niveaux trop proches
    merged = []
    i = 0
    while i < len(segments):
        s, e = segments[i]
        # niveau médian du segment
        level = np.nanmedian(r[s:e])
        # essayer de fusionner avec le suivant si conditions
        j = i + 1
        while j < len(segments):
            s2, e2 = segments[j]
            level2 = np.nanmedian(r[s2:e2])
            rel_diff = abs(level2 - level) / max(abs(level), EPS_RATIO_VALID)
            seg_len = e - s
            seg2_len = e2 - s2
            if (rel_diff <= MERGE_TOL) or (seg2_len < MIN_SEG_LEN) or (seg_len < MIN_SEG_LEN):
                # fusion
                e = e2
                level = np.nanmedian(r[s:e])
                j += 1
            else:
                break
        merged.append((s, e))
        i = j

    # calcul niveaux "snappés" paliers propres
    snapped_levels = []
    for s, e in merged:
        level = float(np.nanmedian(r[s:e]))
        # si le niveau est proche d'un ratio p/q simple, on le "snap" dessus
        p,q,xhat,is_simple = approx_simple_ratio(level, tol=Q_TOL, max_den=MAX_DENOM)
        snapped = xhat if is_simple else level
        snapped_levels.append((s, e, snapped))

    # assigner
    r_snap = np.copy(r)
    for s, e, lev in snapped_levels:
        r_snap[s:e] = lev

    g["ratio_norm_snapped"] = r_snap
    return g

ratio_snapped = ratio_daily.groupby("Ticker", group_keys=False).apply(segment_and_snap)

# =======================
# RECONSTITUTION NB D'ACTIONS (sur paliers snappés)
# =======================
df_all = ratio_snapped.merge(df_current, on="Ticker", how="left")

def compute_shares(row):
    r = row["ratio_norm_snapped"]
    n = row["nb_actions_actuel"]
    if pd.notna(n) and pd.notna(r) and r > 0:
        return n / r
    return np.nan

df_all["nb_actions_estime"] = df_all.apply(compute_shares, axis=1)
df_all["nb_actions_estime"] = df_all["nb_actions_estime"].round().astype("Int64")

# =======================
# PRIX NORMAL EFFECTIF & MARKET CAP (pour typage événements)
# =======================
# Prix/ratio "bruts" par jour (pour proxy CN si manquant)
pr_daily = (
    df.groupby(["Ticker","Date"], as_index=False)[["Cours_Ajuste","Cours_Normal","Vol_Ajuste","Vol_Normal"]]
      .median()
      .merge(ratio_daily[["Ticker","Date","ratio"]], on=["Ticker","Date"], how="left")
)

def prix_normal_effectif_row(row):
    cn = row.get("Cours_Normal", np.nan)
    ca = row.get("Cours_Ajuste", np.nan)
    r  = row.get("ratio", np.nan)
    if pd.notna(cn) and np.isfinite(cn) and cn > 0:
        return cn
    if pd.notna(ca) and np.isfinite(ca) and ca > 0 and pd.notna(r) and np.isfinite(r) and r > 0:
        return ca * r
    return np.nan

cap_base = df_all.merge(pr_daily, on=["Ticker","Date"], how="left")
cap_base["Prix_Normal_Effectif"] = cap_base.apply(prix_normal_effectif_row, axis=1)
cap_base["MarketCap"] = cap_base["Prix_Normal_Effectif"] * cap_base["nb_actions_estime"]

# =======================
# DÉTECTION & CLASSIFICATION DES ÉVÉNEMENTS (sur paliers snappés)
# =======================
def classify_event(is_simple, cap_change):
    if pd.isna(cap_change):
        return ("Split/Regroupement" if is_simple else "Augmentation/Autre",
                0.6 if is_simple else 0.6,
                "Données cap insuffisantes; décision basée sur p:q uniquement")
    if is_simple and abs(cap_change) <= CAP_TOL:
        return ("Split/Regroupement", 0.9, "Ratio simple p:q et capitalisation ~inchangée")
    if (not is_simple) and (abs(cap_change) > CAP_TOL):
        return ("Augmentation/Autre", 0.85, "Ratio non simple et capitalisation change significativement")
    if is_simple:
        return ("Split/Regroupement", 0.7, "Ratio simple p:q mais variation cap ambiguë")
    else:
        return ("Augmentation/Autre", 0.7, "Ratio non simple mais variation cap ambiguë")

events = []
for tik, g in ratio_snapped.groupby("Ticker"):
    g = g.sort_values("Date").reset_index(drop=True).copy()
    if g.empty:
        continue
    # changement quand le palier snappé change (vrai step)
    g["snap_shift"] = g["ratio_norm_snapped"].shift(1)
    change_mask = (g["ratio_norm_snapped"].notna() & g["snap_shift"].notna() &
                   (g["ratio_norm_snapped"] != g["snap_shift"]))
    idxs = g.index[change_mask].tolist()

    # récup cap pour ce ticker
    gb = cap_base[cap_base["Ticker"] == tik].sort_values("Date").reset_index(drop=True)

    for idx in idxs:
        date_evt = g.loc[idx, "Date"]
        r_prev = g.loc[idx-1, "ratio_norm_snapped"]
        r_now  = g.loc[idx, "ratio_norm_snapped"]
        if not (pd.notna(r_prev) and pd.notna(r_now) and r_prev > 0):
            continue
        ratio_event = r_now / r_prev

        p,q,xhat,is_simple = approx_simple_ratio(ratio_event, tol=RATIONAL_TOL, max_den=MAX_DENOM)

        pre = gb[(gb["Date"] < date_evt) & gb["MarketCap"].notna()].tail(WINDOW)
        post = gb[(gb["Date"] > date_evt) & gb["MarketCap"].notna()].head(WINDOW)
        cap_pre = pre["MarketCap"].median() if not pre.empty else np.nan
        cap_post = post["MarketCap"].median() if not post.empty else np.nan
        cap_change = (cap_post - cap_pre) / cap_pre if (pd.notna(cap_pre) and cap_pre != 0 and pd.notna(cap_post)) else np.nan

        decision, conf, reason = classify_event(is_simple, cap_change)

        events.append({
            "Ticker": tik,
            "Date_Changement": date_evt.date(),
            "Ratio_Event": ratio_event,
            "k_pq": (f"{p}:{q}" if (p and q) else ""),
            "cap_pre": cap_pre,
            "cap_post": cap_post,
            "cap_change_pct": cap_change,
            "Decision": decision,
            "Confidence": round(conf, 2),
            "Reason": reason
        })

df_events = pd.DataFrame(events).sort_values(["Ticker","Date_Changement"]) if events else pd.DataFrame(
    columns=["Ticker","Date_Changement","Ratio_Event","k_pq","cap_pre","cap_post","cap_change_pct","Decision","Confidence","Reason"]
)

# =======================
# EXPORTS ANALYTIQUES
# =======================
out_daily = ratio_snapped[["Ticker","Date"]].merge(
    df_all[["Ticker","Date","nb_actions_estime","ratio_norm_snapped"]], on=["Ticker","Date"], how="left"
).sort_values(["Ticker","Date"])
out_daily.to_csv(OUTPUT_SHARES_DAILY, index=False)
df_events.to_csv(OUTPUT_EVENTS, index=False)
print(f"✅ Écrit: {OUTPUT_SHARES_DAILY}")
print(f"✅ Écrit: {OUTPUT_EVENTS}")

# =======================
# ENRICHI: + nb_actions_estime + Capitalisation
# =======================
# On repart du CSV d'origine pour garder les noms des colonnes originaux
enriched = pd.read_csv(INPUT_PRICES_CSV)
# Trouver noms originaux Ticker/Date
ticker_orig = None
date_orig = None
for c in enriched.columns:
    cl = c.strip().lower()
    if cl in col_map_candidates["ticker"] and ticker_orig is None:
        ticker_orig = c
    if cl in col_map_candidates["date"] and date_orig is None:
        date_orig = c
if ticker_orig is None or date_orig is None:
    raise ValueError("Impossible d'identifier les colonnes Ticker/Date dans le fichier source pour l'enrichissement final.")

tmp_merge = enriched[[ticker_orig, date_orig]].copy()
tmp_merge = tmp_merge.rename(columns={ticker_orig:"Ticker_raw", date_orig:"Date"})
tmp_merge["_Ticker_norm"] = tmp_merge["Ticker_raw"].apply(normalize_ticker)
tmp_merge["Date"] = pd.to_datetime(tmp_merge["Date"], errors="coerce")

# joindre nb_actions_estime (snappé) + ratio brut + prix pour proxy
tmp_merge = tmp_merge.merge(out_daily.rename(columns={"Ticker":"_Ticker_norm"}), on=["_Ticker_norm","Date"], how="left")
ratio_raw_daily = pr_daily.rename(columns={"Ticker":"_Ticker_norm"})
tmp_merge = tmp_merge.merge(ratio_raw_daily[["_Ticker_norm","Date","ratio","Cours_Ajuste","Cours_Normal"]],
                            on=["_Ticker_norm","Date"], how="left")

def prix_normal_effectif_final(row):
    cn = row.get("Cours_Normal", np.nan)
    ca = row.get("Cours_Ajuste", np.nan)
    r  = row.get("ratio", np.nan)
    if pd.notna(cn) and np.isfinite(cn) and cn > 0:
        return cn
    if pd.notna(ca) and np.isfinite(ca) and ca > 0 and pd.notna(r) and np.isfinite(r) and r > 0:
        return ca * r
    return np.nan

tmp_merge["Prix_Normal_Effectif"] = tmp_merge.apply(prix_normal_effectif_final, axis=1)
tmp_merge["Capitalisation"] = tmp_merge["Prix_Normal_Effectif"] * tmp_merge["nb_actions_estime"]

# injecter dans la table d'origine
enriched["nb_actions_estime"] = tmp_merge["nb_actions_estime"].values
enriched["Capitalisation"] = tmp_merge["Capitalisation"].values

# tri final: par ticker puis date décroissante
enriched[date_orig] = pd.to_datetime(enriched[date_orig], errors="coerce")
enriched = enriched.sort_values([ticker_orig, date_orig], ascending=[True, False])
# format date lisible (optionnel)
enriched[date_orig] = enriched[date_orig].dt.strftime("%Y-%m-%d")

enriched.to_csv(OUTPUT_ENRICHED, index=False)
print(f"✅ Écrit: {OUTPUT_ENRICHED} (tri par ticker, dates décroissantes)")

# APERÇUS
print("\nAperçu événements (top 10):")
print(df_events.head(50))
print("\nAperçu historique (top 10):")
print(out_daily.head(50))
print("\nAperçu enriched (top 5):")
print(enriched.head(50))


✅ Écrit: nb_actions_actuel.csv (48 tickers)


  df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
  ratio_daily = ratio_daily.groupby("Ticker", group_keys=False).apply(normalize_ratio)
  ratio_snapped = ratio_daily.groupby("Ticker", group_keys=False).apply(segment_and_snap)


✅ Écrit: recon_nb_actions_journalier.csv
✅ Écrit: recap_changements_capitaux.csv


  tmp_merge["Date"] = pd.to_datetime(tmp_merge["Date"], errors="coerce")
  enriched[date_orig] = pd.to_datetime(enriched[date_orig], errors="coerce")


✅ Écrit: combined_journalier_capi.csv (tri par ticker, dates décroissantes)

Aperçu événements (top 10):
   Ticker Date_Changement  Ratio_Event   k_pq       cap_pre      cap_post  \
0    ABJC      2016-09-30     0.050000   1:20  9.084240e+10  7.174640e+10   
1    BICC      2017-10-09     0.100000   1:10  1.386750e+11  1.482500e+11   
2    BNBC      2017-07-27     0.050000   1:20  3.444480e+10  3.841920e+10   
3    BOAB      2017-06-20     0.500000    1:2  2.175035e+11  2.088894e+11   
4    BOAB      2017-10-31     0.100000   1:10  1.419636e+11  1.470338e+11   
5    BOAB      2024-09-03     0.500000    1:2  1.521039e+11  1.829303e+11   
6   BOABF      2017-06-27     0.500000    1:2  2.200000e+11  2.197800e+11   
7   BOABF      2017-10-24     0.100000   1:10  1.645380e+11  1.672000e+11   
8   BOABF      2024-08-28     0.500000    1:2  1.502600e+11  1.760000e+11   
9    BOAC      2017-06-21     0.500000    1:2  1.720000e+11  1.769600e+11   
10   BOAC      2017-10-26     0.100000   1:10  8