In [None]:
pip install pyxlsb

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from pathlib import Path
import pandas as pd
import numpy as np
import re
import unicodedata
import warnings
import time
from datetime import datetime
import sys

# --------- Reduce noise ---------
# Silence pandas' "Could not infer format" UserWarning.
warnings.filterwarnings("ignore", message="Could not infer format.*", category=UserWarning)

# ================== PARAMS ================== #
ROOT_DIR = Path("fic")         # folder scanned recursively for input files
SHOW_SAMPLES = True            # print the first rows of every detected table

# >>> Perf toggles
QUICK_MODE = True              # True = faster (less inference & lighter heuristics)
INFER_TYPES = not QUICK_MODE   # type inference is disabled in quick mode
VERBOSE = True                 # <== print the detail of each step

# Limiters (for large folders/files)
MAX_FILES = None               # e.g. 5 to keep only the first 5 files
MAX_SHEETS = None              # e.g. 3 to keep only the first 3 sheets per file
SCAN_MAX_ROWS = 8000           # max rows read per sheet (None = read everything)
MAX_TABLES_PER_SHEET = 2       # None = no limit (the cap is only checked when QUICK_MODE is True)

# Generic heuristics (vertical = row-wise)
MIN_COLS = 2                   # min non-null cells for a row to count as tabular
MIN_CONSEC_ROWS = 5 if not QUICK_MODE else 3
ROW_EMPTY_TOL = 1              # NOTE(review): not referenced in the visible part of this file
STOP_EMPTY_RUN = 5 if not QUICK_MODE else 3
HEADER_SCAN_DEPTH = 6 if not QUICK_MODE else 3

# >>> NEW: vertical segmentation (for side-by-side tables)
COL_DENSITY_THRESHOLD = 0.12   # min non-null ratio for a column to count as "used"
MIN_COL_RUN = 2                # min number of consecutive columns forming a vertical segment
ALLOW_SMALL_GAPS = 1           # tolerate up to N empty columns inside a segment
MAX_SEGMENTS_PER_SHEET = None  # cap on the number of segments per sheet (None = unlimited)

# Filtering of "generic" columns (placeholder names: col, col_2, ...)
MIN_NON_NULL_RATIO = 0.05
MIN_NON_NULL_ABS   = 2
GENERIC_COL_RE = re.compile(r"^col(_\d+)?$", re.I)

# Inference (when enabled)
DATE_NAME_HINTS = ("date", "dt_", "_dt", "attribution", "retrait", "month", "mois")
# Matches a date separator or an English/French month abbreviation.
DATE_TOKEN_RE = re.compile(
    r"[/\-.]|(?:jan|feb|mar|apr|mai|may|jun|jul|aug|sep|oct|nov|dec|"
    r"janv|févr|fevr|avr|juil|sept|oct|nov|déc|dec)",
    re.I
)
BOOL_TRUE  = {"true","vrai","oui","y","1"}
BOOL_FALSE = {"false","faux","non","n","0"}
# Minimum share of values containing a date token before date parsing is tried,
# depending on whether the column name already hints at a date.
PRE_PARSE_TOKEN_RATIO_IF_NAME   = 0.10
PRE_PARSE_TOKEN_RATIO_NO_NAME   = 0.30

# -------------------- logging utils -------------------- #
def nowstr():
    """Return the current local time formatted as ``HH:MM:SS``."""
    current = datetime.now()
    return current.strftime("%H:%M:%S")
def log(msg):
    """Print *msg* prefixed with the current time; silent unless VERBOSE is set."""
    if not VERBOSE:
        return
    stamp = nowstr()
    print(f"[{stamp}] {msg}", flush=True)

def step_time(prev=None):
    """Return ``(now, elapsed)`` based on ``time.perf_counter()``.

    ``elapsed`` is the number of seconds since *prev* (a value previously
    returned by this function), or ``0.0`` when *prev* is None.
    """
    now = time.perf_counter()
    elapsed = 0.0 if prev is None else now - prev
    return now, elapsed

# -------------------- utils noms -------------------- #
def strip_accents_lower(s: str) -> str:
    """Return *s* as text without combining marks, lower-cased and trimmed.

    None and NA-like values yield an empty string; any other value is
    converted with ``str()`` before being normalised (NFKD).
    """
    if s is None or pd.isna(s):
        return ""
    decomposed = unicodedata.normalize("NFKD", str(s))
    # Drop the combining marks left behind by the decomposition.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars).lower().strip()

def safe_id(s: str) -> str:
    """Turn a file or sheet name into an identifier-friendly token.

    The name is lower-cased and stripped of accents, every run of characters
    other than ``a-z`` / ``0-9`` becomes a single ``_``, and leading/trailing
    underscores are removed. Falls back to ``"sheet"`` when nothing is left.
    """
    normalized = strip_accents_lower(s)
    # One pass: any run of non-alphanumerics (underscores included) -> "_".
    token = re.sub(r"[^a-z0-9]+", "_", normalized).strip("_")
    if token:
        return token
    return "sheet"

def is_generic_colname(name: str) -> bool:
    """Tell whether *name* is a placeholder column name matched by GENERIC_COL_RE."""
    candidate = name if name else ""
    return GENERIC_COL_RE.fullmatch(candidate) is not None

# ---------- Lecture Excel rapide ----------
def safe_read_excel_all_sheets(path: Path):
    """Read the sheets of an Excel workbook as raw, header-less DataFrames.

    The engine is chosen from the file extension: ``openpyxl`` for
    ``.xlsx``/``.xlsm`` and ``pyxlsb`` for ``.xlsb``. At most ``MAX_SHEETS``
    sheets are read (all of them when it is falsy) and each sheet is
    truncated to ``SCAN_MAX_ROWS`` rows (everything when it is None).

    Args:
        path: Location of the workbook.

    Returns:
        A list of ``(sheet_name, DataFrame)`` tuples in workbook order.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    ext = path.suffix.lower()
    if ext in (".xlsx", ".xlsm"):
        engine = "openpyxl"
    elif ext == ".xlsb":
        engine = "pyxlsb"
    else:
        raise ValueError(f"Extension Excel non gérée: {ext}")

    log(f"  -> Ouverture Excel ({engine})")
    t0, _ = step_time()

    out = []
    nrows = SCAN_MAX_ROWS
    # The context manager closes the workbook even when a sheet fails to parse;
    # without it the file handle stays open (and the file locked on Windows).
    with pd.ExcelFile(path, engine=engine) as xls:
        # A falsy MAX_SHEETS means "no limit"; slicing with None keeps everything.
        sheets = xls.sheet_names[:(MAX_SHEETS or None)]
        log(f"  -> Feuilles détectées: {len(sheets)} (limite={MAX_SHEETS or 'Aucune'})")

        for i, sheet in enumerate(sheets, 1):
            ti, _ = step_time()
            log(f"    .. Lecture feuille {i}/{len(sheets)}: '{sheet}' nrows={nrows or 'ALL'}")
            df = xls.parse(sheet_name=sheet, header=None, nrows=nrows)
            _, dt = step_time(ti)
            log(f"       -> taille: {df.shape[0]}x{df.shape[1]} (en {dt:.2f}s)")
            out.append((sheet, df))

    _, dt_total = step_time(t0)
    log(f"  -> Excel chargé en {dt_total:.2f}s")
    return out

# ---------- Lecture CSV rapide ----------
COMMON_SEPS = [",", ";", "\t", "|"]
COMMON_ENCODINGS = ["utf-8-sig", "utf-8", "cp1252", "latin-1"]

def guess_sep(first_line: str):
    """Pick the separator from COMMON_SEPS that occurs most often in *first_line*.

    Ties go to the separator listed first; a comma is returned when none of
    the candidates appears at all.
    """
    best_sep, best_count = ",", 0
    for candidate in COMMON_SEPS:
        occurrences = first_line.count(candidate)
        # Strict comparison keeps the earliest candidate on ties.
        if occurrences > best_count:
            best_sep, best_count = candidate, occurrences
    return best_sep

def read_csv_fast(path: Path):
    """Read a CSV file as a raw, header-less DataFrame.

    The encoding is the first entry of ``COMMON_ENCODINGS`` that strictly
    decodes the beginning of the file; the separator is guessed from the
    first line with ``guess_sep``. At most ``SCAN_MAX_ROWS`` rows are read
    (everything when it is None).

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame (``header=None``).

    Raises:
        OSError: If the file cannot be opened.
    """
    t0, _ = step_time()
    log("  -> Détection encodage/séparateur (rapide)")
    sample_chars = 65536  # how much text is decoded to validate an encoding
    enc_used = None
    head = ""
    tried = []
    for enc in COMMON_ENCODINGS:
        tried.append(enc)
        # The try is per encoding so a decode failure moves on to the next
        # candidate instead of abandoning the whole list.
        try:
            with open(path, "r", encoding=enc, errors="strict") as f:
                head = f.readline()
                # Decode past the header too: it is often plain ASCII while
                # the data rows are not.
                f.read(sample_chars)
        except UnicodeError:
            continue
        enc_used = enc
        break

    if enc_used is None:
        # Last resort, only reachable if no candidate decoded the sample.
        enc_used = "latin-1"
        with open(path, "r", encoding=enc_used, errors="replace") as f:
            head = f.readline()

    sep_used = guess_sep(head)
    log(f"     -> encodage: {enc_used} (essais={tried}); sep='{sep_used}'")

    nrows = SCAN_MAX_ROWS
    log(f"  -> Lecture CSV nrows={nrows or 'ALL'}")
    try:
        df = pd.read_csv(path, sep=sep_used, encoding=enc_used, header=None, nrows=nrows)
    except UnicodeDecodeError:
        # Invalid bytes beyond the validated sample: latin-1 maps every byte,
        # so this retry cannot fail on decoding.
        log(f"     -> décodage {enc_used} impossible, nouvel essai en latin-1")
        enc_used = "latin-1"
        df = pd.read_csv(path, sep=sep_used, encoding=enc_used, header=None, nrows=nrows)
    _, dt = step_time(t0)
    log(f"     -> taille: {df.shape[0]}x{df.shape[1]} (en {dt:.2f}s)")
    return df

# ---------- Pré-traitements & détection ----------
def ensure_range_columns(df: pd.DataFrame):
    """Return *df* with column labels ``0..n-1``.

    The input is returned unchanged when its labels already match; otherwise
    a relabelled copy is returned and the input is left untouched.
    """
    expected = list(range(df.shape[1]))
    if list(df.columns) == expected:
        return df
    relabelled = df.copy()
    relabelled.columns = expected
    return relabelled

def detect_blocks(df: pd.DataFrame):
    """
    Find runs of consecutive "tabular" rows (rows with >= MIN_COLS non-null cells).

    Returns a list of (start_label, end_label) index labels, one per run that
    spans at least MIN_CONSEC_ROWS rows. In quick mode the search stops once
    MAX_TABLES_PER_SHEET runs have been collected.
    """
    started, _ = step_time()
    frame = ensure_range_columns(df)
    is_tabular = frame.notna().sum(axis=1).to_numpy() >= MIN_COLS

    found = []
    if is_tabular.any():
        # +1 marks a False->True transition (run start), -1 a True->False one.
        edges = np.diff(np.r_[False, is_tabular, False].astype(np.int8))
        run_starts = np.flatnonzero(edges == 1)
        run_ends = np.flatnonzero(edges == -1) - 1
        for first, last in zip(run_starts, run_ends):
            if last - first + 1 < MIN_CONSEC_ROWS:
                continue
            found.append((frame.index[first], frame.index[last]))
            if QUICK_MODE and MAX_TABLES_PER_SHEET and len(found) >= MAX_TABLES_PER_SHEET:
                break

    _, elapsed = step_time(started)
    log(f"    .. Blocs tabulaires détectés (vertical/lignes): {len(found)} (en {elapsed:.2f}s)")
    return found

# ---------- Segmentation verticale (tables côte-à-côte) ----------
def split_by_vertical_gaps(df: pd.DataFrame):
    """
    Split a sheet into groups of adjacent "dense" columns.

    A column is dense when its non-null ratio reaches COL_DENSITY_THRESHOLD.
    Gaps of at most ALLOW_SMALL_GAPS sparse columns lying between two dense
    columns are bridged. Returns a list of (sub_df, start_col, end_col) for
    every group at least MIN_COL_RUN columns wide, or the whole frame as a
    single segment when no group qualifies.
    """
    n_cols = df.shape[1]
    if df.empty or n_cols <= 1:
        return [(df, 0, n_cols - 1)] if n_cols else []

    used = df.notna().mean(axis=0).to_numpy() >= COL_DENSITY_THRESHOLD

    # Bridge small holes between used columns.
    if ALLOW_SMALL_GAPS > 0:
        gap_edges = np.diff(np.r_[False, ~used, False].astype(np.int8))
        gap_starts = np.flatnonzero(gap_edges == 1)
        gap_stops = np.flatnonzero(gap_edges == -1)  # exclusive end of each gap
        for lo, hi in zip(gap_starts, gap_stops):
            # A maximal gap has used neighbours exactly when it touches neither edge.
            has_both_neighbours = lo > 0 and hi < n_cols
            if has_both_neighbours and (hi - lo) <= ALLOW_SMALL_GAPS:
                used[lo:hi] = True

    run_edges = np.diff(np.r_[False, used, False].astype(np.int8))
    run_starts = np.flatnonzero(run_edges == 1)
    run_ends = np.flatnonzero(run_edges == -1) - 1

    segments = [
        (df.iloc[:, first:last + 1], first, last)
        for first, last in zip(run_starts, run_ends)
        if (last - first + 1) >= MIN_COL_RUN
    ]
    if not segments:
        return [(df, 0, n_cols - 1)]
    return segments

# ---------- Colonnes ----------
def choose_header_row(block: pd.DataFrame):
    """Return the relative position of the most header-like row in *block*.

    Only the first HEADER_SCAN_DEPTH rows are examined. A row scores two
    points per non-blank cell plus one point per cell that contains a letter
    and does not start with "unnamed"; the earliest best row wins. Returns 0
    when the block has no rows.
    """
    def _score(pos):
        filled = 0
        lettered = 0
        for cell in block.iloc[pos].tolist():
            if pd.isna(cell):
                continue
            text = str(cell).strip()
            if not text:
                continue
            filled += 1
            if re.search(r"[A-Za-zÀ-ÿ]", text) and not text.lower().startswith("unnamed"):
                lettered += 1
        return 2 * filled + lettered

    depth = min(len(block), HEADER_SCAN_DEPTH)
    # max() keeps the first maximal element, i.e. the earliest best row.
    return max(range(depth), key=_score, default=0)

def clean_columns(vals):
    """Build unique, normalised column names from raw header cells.

    Each value is lower-cased and stripped of accents, reduced to
    ``[a-z0-9_]`` with whitespace turned into ``_``; blank or "unnamed..."
    cells become ``"col"``. A name that was already emitted gets a numeric
    suffix (``name_2``, ``name_3``, ...), skipping any suffix that would
    collide with another emitted name, so the result never contains
    duplicates.

    Args:
        vals: Iterable of raw header cell values (any type, may contain NA).

    Returns:
        A list of column names, one per input value, in the same order.
    """
    cols = []
    used = set()      # every name emitted so far
    counters = {}     # base name -> last suffix number handed out
    for v in vals:
        s = strip_accents_lower(v).replace("\n", " ")
        s = re.sub(r"\s+", " ", s).strip(" -_")
        if not s or s.startswith("unnamed"):
            s = "col"
        s = re.sub(r"[^a-z0-9_ ]", "", s)
        s = re.sub(r"\s+", "_", s).strip("_") or "col"

        if s in used:
            base = s
            n = counters.get(base, 1)
            # Advance until the suffixed name is free: a literal header such
            # as "a_2" may already occupy the slot a duplicate "a" would take.
            while True:
                n += 1
                s = f"{base}_{n}"
                if s not in used:
                    break
            counters[base] = n
        used.add(s)
        cols.append(s)
    return cols

def prune_columns(df: pd.DataFrame, header_keep: set[str]) -> pd.DataFrame:
    """Drop placeholder-named columns that are almost empty.

    Columns listed in *header_keep* and columns whose name is not generic are
    always kept. A generic column survives only if its non-null count reaches
    max(MIN_NON_NULL_ABS, int(len(df) * MIN_NON_NULL_RATIO)). Column order is
    preserved; an empty frame is returned as is.
    """
    if df.empty:
        return df

    min_filled = max(MIN_NON_NULL_ABS, int(len(df) * MIN_NON_NULL_RATIO))

    def _is_kept(col):
        if col in header_keep or not is_generic_colname(col):
            return True
        return df[col].notna().sum() >= min_filled

    selected = [col for col in df.columns if _is_kept(col)]
    return df[selected].copy()

# ---------- Inférence de types (optionnelle) ----------
def infer_and_cast_column(s: pd.Series, col_name: str) -> tuple[pd.Series, str]:
    """Guess the type of one column and cast it accordingly.

    When INFER_TYPES is off the column is simply cast to pandas "string".
    Otherwise the checks run in this order and the first that passes wins:
    DATE, then INT/FLOAT, then BOOL, with STRING as the fallback.

    Args:
        s: Column values.
        col_name: Column name; a name containing one of DATE_NAME_HINTS
            lowers the thresholds used for date detection.

    Returns:
        ``(converted_series, type_tag)`` where type_tag is one of
        "STRING", "DATE", "INT", "FLOAT", "BOOL".
    """
    if not INFER_TYPES:
        return s.astype("string"), "STRING"
    name_norm = strip_accents_lower(col_name)
    looks_like_date_name = any(h in name_norm for h in DATE_NAME_HINTS)
    s_obj = s.astype("string")
    non_empty = s_obj.dropna()

    # Share of values that are plain unsigned numbers (optional decimal part).
    numeric_only_ratio = 0.0
    if len(non_empty) > 0:
        numeric_only_ratio = sum(bool(re.fullmatch(r"\d+(?:[.,]\d+)?", str(x).strip()))
                                 for x in non_empty) / len(non_empty)

    # Share of values containing a date separator or a month abbreviation.
    date_token_ratio = 0.0
    if len(non_empty) > 0:
        date_token_ratio = sum(bool(DATE_TOKEN_RE.search(str(x)))
                               for x in non_empty) / len(non_empty)

    # Date attempt: only parse when enough values look date-like; a column
    # without a date-like name must also not be mostly plain numbers.
    should_try_parse = (
        (looks_like_date_name and date_token_ratio >= PRE_PARSE_TOKEN_RATIO_IF_NAME) or
        ((not looks_like_date_name) and date_token_ratio >= PRE_PARSE_TOKEN_RATIO_NO_NAME and numeric_only_ratio < 0.80)
    )
    if should_try_parse:
        parsed_dates = pd.to_datetime(s, errors="coerce", dayfirst=True)
        date_ratio = parsed_dates.notna().sum() / max(1, s.notna().sum())
        # A date-like column name lowers the acceptance bar.
        accept_date = (date_ratio >= 0.30) if looks_like_date_name else (date_ratio >= 0.70)
        if accept_date:
            return parsed_dates.dt.normalize(), "DATE"

    # Numeric: drop (non-breaking) spaces and use "." as decimal separator
    # before converting.
    as_num = pd.to_numeric(s_obj.str.replace("\u00A0"," ", regex=False)
                               .str.replace(" ","", regex=False)
                               .str.replace(",",".", regex=False), errors="coerce")
    num_ratio = as_num.notna().sum() / max(1, s.notna().sum())
    if num_ratio >= 0.85:
        as_int = as_num.dropna()
        if len(as_int) == 0:
            return as_num.astype("Float64"), "FLOAT"
        # Integer-valued throughout -> nullable Int64, otherwise Float64.
        if (as_int % 1 == 0).all():
            return as_num.astype("Int64"), "INT"
        else:
            return as_num.astype("Float64"), "FLOAT"

    # Boolean: only considered when the column has at most 3 distinct values.
    vals = non_empty.map(strip_accents_lower).unique().tolist()
    if 1 <= len(set(vals)) <= 3:
        mapped = s_obj.map(strip_accents_lower)
        def map_bool(x):
            if x in BOOL_TRUE: return True
            if x in BOOL_FALSE: return False
            return pd.NA
        mb = mapped.map(map_bool)
        # NOTE(review): strip_accents_lower turns NA into "", so `mapped`
        # presumably has no NA and the denominator counts missing cells too;
        # a column with more than 10% missing values would then never be
        # tagged BOOL. Confirm whether that is intended.
        if mb.notna().sum() / max(1, mapped.notna().sum()) >= 0.9:
            return mb.astype("boolean"), "BOOL"

    return s_obj, "STRING"

def infer_types_df(df: pd.DataFrame):
    """Run infer_and_cast_column on every column of *df*.

    Returns ``(typed_copy, schema)`` where *schema* maps each column name to
    its inferred type tag. The input frame is not modified.
    """
    started, _ = step_time()
    typed = df.copy()
    schema = {}
    for name in typed.columns:
        converted, tag = infer_and_cast_column(typed[name], name)
        typed[name] = converted
        schema[name] = tag
    _, elapsed = step_time(started)
    log(f"    .. Inférence types: {len(typed.columns)} colonnes (en {elapsed:.2f}s, activée={INFER_TYPES})")
    return typed, schema

def show_schema(df: pd.DataFrame, schema: dict):
    """Print one ``name: TYPE`` line per column of *df*.

    The type comes from *schema*; columns missing from it fall back to the
    pandas dtype name. Types are printed upper-cased.
    """
    print("\n=== schema ===", flush=True)
    for name in df.columns:
        dtype_name = str(df[name].dtype)
        label = schema.get(name, dtype_name)
        print(f"{name}: {label.upper()}", flush=True)

# ---------- Extraction d'une table sur un bloc (lignes) ----------
def carve_table_from_block(df_block: pd.DataFrame):
    """Turn a block of raw rows into a clean table.

    Steps: pick the header row, name the columns from it, keep the rows
    below it up to the first run of STOP_EMPTY_RUN fully empty rows, drop
    rows with fewer than MIN_COLS non-null cells, prune near-empty generic
    columns, then infer column types.

    Args:
        df_block: Consecutive raw rows of one sheet segment.

    Returns:
        ``(table, schema)`` where *table* has a fresh 0..n-1 index and
        *schema* maps column names to type tags, or ``None`` when nothing
        usable remains (empty block, no row below the header, or everything
        pruned away).
    """
    if df_block.empty:
        return None

    log("    .. Sélection header")
    h_rel = choose_header_row(df_block)
    raw_header_vals = df_block.iloc[h_rel].tolist()
    cols = clean_columns(raw_header_vals)

    data = df_block.iloc[h_rel+1:].copy()
    data.columns = cols
    log(f"    .. Header choisi ligne relative {h_rel} -> {len(cols)} colonnes")

    # The header can be the last row of a short block: no data rows to carve.
    if data.empty:
        return None

    # Stop at the first run of STOP_EMPTY_RUN consecutive empty rows. The cut
    # is positional, so it does not depend on the index being made of
    # contiguous integers.
    row_is_empty = data.isna().all(axis=1).to_numpy()
    cut_pos = len(data)
    empty_run = 0
    for pos, is_empty in enumerate(row_is_empty):
        if is_empty:
            empty_run += 1
            if empty_run >= STOP_EMPTY_RUN:
                cut_pos = pos - STOP_EMPTY_RUN + 1  # first row of the empty run
                break
        else:
            empty_run = 0

    data = data.iloc[:cut_pos]
    data = data[data.notna().sum(axis=1) >= MIN_COLS]
    log(f"    .. Après nettoyage lignes: {data.shape[0]}x{data.shape[1]}")

    header_keep = {c for c in cols if not is_generic_colname(c)}
    data = prune_columns(data, header_keep)
    log(f"    .. Après prune colonnes: {data.shape[0]}x{data.shape[1]}")
    if data.empty:
        return None

    data, schema = infer_types_df(data)

    # Render inferred dates as ISO strings (only relevant when inference ran).
    if INFER_TYPES:
        for c, t in schema.items():
            if t != "DATE":
                continue
            try:
                data[c] = pd.to_datetime(data[c], errors="coerce").dt.strftime("%Y-%m-%d")
            except Exception as exc:
                # Best effort: keep the column as inferred, but report the failure.
                log(f"    .. Formatage date ignoré pour '{c}': {exc}")

    return data.reset_index(drop=True), schema

# ---------- Trouver tables dans une FEUILLE (segments horizontaux + blocs verticaux) ----------
def find_tables_in_sheet(df_raw: pd.DataFrame):
    """Detect the tables contained in one raw sheet.

    The sheet is first split into column segments (side-by-side tables), then
    each segment is scanned for row blocks, and every block is carved into a
    table. Returns a list of ``(segment_no, block_no, table, schema)`` with
    1-based numbers. In quick mode the search stops as soon as
    MAX_TABLES_PER_SHEET tables have been kept.
    """
    print("  -> Détection des blocs tabulaires…", flush=True)
    sheet = ensure_range_columns(df_raw)
    if SCAN_MAX_ROWS is not None and len(sheet) > SCAN_MAX_ROWS:
        sheet = sheet.iloc[:SCAN_MAX_ROWS, :]
        log(f"     (tronqué à {SCAN_MAX_ROWS} lignes)")

    # Column segments (side-by-side tables)
    segments = split_by_vertical_gaps(sheet)
    if MAX_SEGMENTS_PER_SHEET:
        segments = segments[:MAX_SEGMENTS_PER_SHEET]
    log(f"  -> Segments verticaux: {len(segments)}")

    # The cap only applies in quick mode and when a limit is configured.
    cap = MAX_TABLES_PER_SHEET if (QUICK_MODE and MAX_TABLES_PER_SHEET) else None

    kept = []
    for seg_no, (seg_df, c0, c1) in enumerate(segments, 1):
        log(f"  -> Segment {seg_no}: cols {c0}..{c1} (shape {seg_df.shape[0]}x{seg_df.shape[1]})")
        # Row blocks inside the segment
        blocks = detect_blocks(seg_df)
        for blk_no, (start, end) in enumerate(blocks, 1):
            log(f"     -> Carve table {blk_no}/{len(blocks)} dans segment {seg_no} (rows {start}..{end})")
            carved = carve_table_from_block(seg_df.loc[start:end, :])
            if carved is None:
                log("        .. ignoré (vide après carve)")
                continue
            table, schema = carved
            if table is None or table.shape[1] < MIN_COLS or table.shape[0] < 1:
                continue
            kept.append((seg_no, blk_no, table, schema))
            log(f"        .. table retenue: {table.shape[0]}x{table.shape[1]}")
            if cap is not None and len(kept) >= cap:
                log("        .. limite de tables atteinte (mode rapide)")
                return kept
    log(f"  -> Tables retenues sur la feuille: {len(kept)}")
    return kept

# ---------- Par fichier ----------
def process_file(path: Path):
    """Extract every table from one Excel or CSV file.

    Returns a list of ``(sheet_name, segment_no, block_no, table, schema)``;
    *sheet_name* is None for CSV files. Raises ValueError for any other
    extension.
    """
    suffix = path.suffix.lower()

    if suffix == ".csv":
        raw = read_csv_fast(path)
        log(f"-- CSV lu | taille {raw.shape[0]}x{raw.shape[1]}")
        return [
            (None, seg_i, blk_i, table, schema)
            for seg_i, blk_i, table, schema in find_tables_in_sheet(raw)
        ]

    if suffix not in (".xlsx", ".xlsm", ".xlsb"):
        raise ValueError(f"Extension non gérée: {suffix}")

    found = []
    for sheet, raw in safe_read_excel_all_sheets(path):
        log(f"-- Feuille: {sheet} | taille {raw.shape[0]}x{raw.shape[1]}")
        found.extend(
            (sheet, seg_i, blk_i, table, schema)
            for seg_i, blk_i, table, schema in find_tables_in_sheet(raw)
        )
    return found

# ---------- main ----------
def main():
    """Scan ROOT_DIR for Excel/CSV files and print a report of detected tables.

    For each file the tables found by process_file are listed with their
    name (``file.sheet.table_<segment>_<block>``), size, columns, schema and,
    when SHOW_SAMPLES is set, their first rows. A file that fails to load is
    reported and skipped.
    """
    print("=== data header: folder info ===", flush=True)
    print(f"path: {ROOT_DIR.resolve()}", flush=True)

    patterns = ("*.xlsx", "*.xlsm", "*.xlsb", "*.csv")
    # "~$" prefixes mark Excel lock files; they are skipped.
    files = sorted(
        candidate
        for pattern in patterns
        for candidate in ROOT_DIR.rglob(pattern)
        if candidate.is_file() and not candidate.name.startswith("~$")
    )
    if MAX_FILES:
        files = files[:MAX_FILES]
    print(f"files_found: {len(files)}", flush=True)

    config_lines = (
        "\n=== config ===",
        f"QUICK_MODE={QUICK_MODE} | INFER_TYPES={INFER_TYPES} | VERBOSE={VERBOSE}",
        f"SCAN_MAX_ROWS={SCAN_MAX_ROWS} | MAX_FILES={MAX_FILES} | MAX_SHEETS={MAX_SHEETS} | MAX_TABLES_PER_SHEET={MAX_TABLES_PER_SHEET}",
        f"MIN_COLS={MIN_COLS} | MIN_CONSEC_ROWS={MIN_CONSEC_ROWS} | HEADER_SCAN_DEPTH={HEADER_SCAN_DEPTH}",
        f"COL_DENSITY_THRESHOLD={COL_DENSITY_THRESHOLD} | MIN_COL_RUN={MIN_COL_RUN} | ALLOW_SMALL_GAPS={ALLOW_SMALL_GAPS}",
    )
    for line in config_lines:
        print(line, flush=True)

    if not files:
        print("[warn] Aucun fichier trouvé.", flush=True)
        return

    total_tables = 0
    total_sheets = 0

    for path in files:
        print("\n=== file ===", flush=True)
        print(f"{path.name}  ({path.resolve()})", flush=True)
        started = time.perf_counter()
        try:
            tables = process_file(path)
        except Exception as e:
            print(f"[error] Lecture échouée pour {path.name}: {e}", flush=True)
            continue

        # Approximate sheet count: only sheets that produced at least one table.
        total_sheets += len({entry[0] for entry in tables if entry[0] is not None})
        total_tables += len(tables)

        if tables:
            file_id = safe_id(path.stem)
            for sheet, seg_i, blk_i, df, schema in tables:
                sheet_id = safe_id(sheet or "sheet")
                # Usable naming scheme: file.sheet.table_<segment>_<block>
                table_name = f"{file_id}.{sheet_id}.table_{seg_i}_{blk_i}"
                print("\n--- table detected ---", flush=True)
                print(f"name: {table_name}", flush=True)
                print(f"rows: {len(df)} | cols: {df.shape[1]}", flush=True)
                print("columns:", ", ".join(map(str, df.columns.tolist())), flush=True)
                show_schema(df, schema)
                if SHOW_SAMPLES:
                    with pd.option_context("display.max_columns", 80, "display.width", 200):
                        print("\n=== sample (top 8) ===", flush=True)
                        print(df.head(8), flush=True)
        else:
            print("[info] Aucune table détectée", flush=True)

        elapsed = time.perf_counter() - started
        print(f"[info] Temps fichier: {elapsed:.2f}s", flush=True)

    print("\n=== done ===", flush=True)
    print(f"Tables: {total_tables} | Feuilles (approx): {total_sheets} | Fichiers: {len(files)}", flush=True)

if __name__ == "__main__":
    # Script entry point: run the scan and turn Ctrl-C into a short message
    # and exit status 130 instead of a traceback.
    try:
        main()
    except KeyboardInterrupt:
        print("\n[info] Interrompu par l'utilisateur.", flush=True)
        sys.exit(130)


=== data header: folder info ===
path: C:\globasoft\aerotech\fic
files_found: 1

=== config ===
QUICK_MODE=True | INFER_TYPES=False | VERBOSE=True
SCAN_MAX_ROWS=8000 | MAX_FILES=None | MAX_SHEETS=None | MAX_TABLES_PER_SHEET=2
MIN_COLS=2 | MIN_CONSEC_ROWS=3 | HEADER_SCAN_DEPTH=3
COL_DENSITY_THRESHOLD=0.12 | MIN_COL_RUN=2 | ALLOW_SMALL_GAPS=1

=== file ===
Tauxdechange.xlsx  (C:\globasoft\aerotech\fic\Tauxdechange.xlsx)
[14:46:04]   -> Ouverture Excel (openpyxl)
[14:46:04]   -> Feuilles détectées: 2 (limite=Aucune)
[14:46:04]     .. Lecture feuille 1/2: '2024' nrows=8000
[14:46:04]        -> taille: 257x12 (en 0.08s)
[14:46:04]     .. Lecture feuille 2/2: '2025' nrows=8000
[14:46:04]        -> taille: 129x12 (en 0.05s)
[14:46:04]   -> Excel chargé en 0.70s
[14:46:04] -- Feuille: 2024 | taille 257x12
  -> Détection des blocs tabulaires…
[14:46:04]   -> Segments verticaux: 1
[14:46:04]   -> Segment 1: cols 0..11 (shape 257x12)
[14:46:04]     .. Blocs tabulaires détectés (vertical/lignes): 