In [None]:
pip install pyxlsb
pip install sqlalchemy psycopg2-binary


# Connexion BDD

In [None]:
%pip install -q "sqlalchemy>=2" psycopg2-binary

In [1]:
from sqlalchemy import create_engine, text
from sqlalchemy.exc import OperationalError
from urllib.parse import quote_plus

PG_HOST = "dpg-d3jq6apr0fns738f81i0-a.frankfurt-postgres.render.com"
PG_PORT = 5432
PG_DB   = "aerotec_datawarehouse"
PG_USER = "aerotec_datawarehouse_user"
PG_PASS = "LHTYZJ3aUDI8IeylbA1SZs9M9TsKQ4To"

conn_str = (
    f"postgresql+psycopg2://{PG_USER}:{quote_plus(PG_PASS)}@{PG_HOST}:{PG_PORT}/{PG_DB}"
)

# Astuces de stabilit√© pour Render :
# - sslmode=require (obligatoire)
# - connect_timeout : √©vite de bloquer si le r√©seau/host ne r√©pond pas
# - keepalives_* : √©vite les coupures silencieuses d‚Äôidle
engine = create_engine(
    conn_str,
    connect_args={
        "sslmode": "require",
        "connect_timeout": 5,
        "keepalives": 1,
        "keepalives_idle": 30,
        "keepalives_interval": 10,
        "keepalives_count": 3,
    },
    pool_pre_ping=True,     # ping avant r√©utilisation du pool
    pool_recycle=1800,      # recycle connexions > 30 min
    pool_size=5,
    max_overflow=5,
)

# Test rapide de connectivit√© (√† garder au d√©but du main)
try:
    with engine.connect() as conn:
        ver = conn.execute(text("select version();")).scalar()
        print("[ok] Connect√© √† PostgreSQL Render")
        print(ver)
except OperationalError as e:
    print("[error] Connexion PostgreSQL √©chou√©e:", e)
    raise


[ok] Connect√© √† PostgreSQL Render
PostgreSQL 17.6 (Debian 17.6-1.pgdg12+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14+deb12u1) 12.2.0, 64-bit


 # fichier qualit√© des trigrammes.xlsx

In [4]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from pathlib import Path
import os
import pandas as pd
import numpy as np
import re, unicodedata, warnings

# --- DB: SQLAlchemy
from sqlalchemy import create_engine, text
from sqlalchemy import types as satypes
from sqlalchemy.exc import OperationalError
from urllib.parse import quote_plus

# ================== CONFIG ================== #
FILE_PATH = r"C:\globasoft\aerotech\fic\fichier qualit√© des trigrammes.xlsx"
SHOW_SAMPLE = True
TARGET_SCHEMA = "public"          # sch√©ma cible
IF_EXISTS_MODE = "replace"        # "replace" ou "append"

# Forcer certains noms de colonnes en DATE/DATETIME (optionnel)
FORCE_DATE_COLS = {"date_d_attribution", "date_de_retrait"}  # snake_case attendu

# ================== ACC√àS DB ================== #
REQUIRED_KEYS = ("PG_HOST","PG_PORT","PG_DB","PG_USER","PG_PASS")

def _collect_pg_config():
    """R√©cup√®re les creds depuis le 1er bloc (globals) OU depuis l'env."""
    cfg = {}
    g = globals()
    # 1) variables d√©j√† d√©finies dans le 1er bloc ?
    for k in REQUIRED_KEYS:
        if k in g and g[k]:
            cfg[k] = g[k]
    # 2) sinon variables d'environnement ?
    for k in REQUIRED_KEYS:
        if k not in cfg or not cfg[k]:
            v = os.getenv(k)
            if v:
                cfg[k] = v

    # d√©fauts pratiques si seul le host Render est utilis√©
    if "PG_HOST" not in cfg or not cfg["PG_HOST"]:
        cfg["PG_HOST"] = "dpg-d3jq6apr0fns738f81i0-a.frankfurt-postgres.render.com"
    if "PG_PORT" not in cfg or not cfg["PG_PORT"]:
        cfg["PG_PORT"] = 5432
    else:
        cfg["PG_PORT"] = int(cfg["PG_PORT"])

    missing = [k for k in REQUIRED_KEYS if k not in cfg or cfg[k] in ("", None)]
    if missing:
        raise RuntimeError(
            "Param√®tres DB manquants: "
            + ", ".join(missing)
            + ". D√©finis-les dans le 1er bloc (PG_*) ou via variables d'environnement."
        )
    return cfg

def get_engine():
    """R√©utilise l'engine existant si pr√©sent. Sinon, en cr√©e un avec SSL Render."""
    g = globals()
    if "engine" in g and g["engine"] is not None:
        return g["engine"]

    cfg = _collect_pg_config()
    conn_str = (
        f"postgresql+psycopg2://{cfg['PG_USER']}:{quote_plus(cfg['PG_PASS'])}"
        f"@{cfg['PG_HOST']}:{cfg['PG_PORT']}/{cfg['PG_DB']}"
    )
    eng = create_engine(
        conn_str,
        connect_args={
            "sslmode": "require",
            "connect_timeout": 5,
            "keepalives": 1,
            "keepalives_idle": 30,
            "keepalives_interval": 10,
            "keepalives_count": 3,
        },
        pool_pre_ping=True,
        pool_recycle=1800,
        pool_size=5,
        max_overflow=5,
    )
    return eng

# ================== UTILS ================== #
def strip_accents_lower(s):
    if s is None or (isinstance(s, float) and pd.isna(s)): return ""
    s = unicodedata.normalize("NFKD", str(s))
    s = "".join(c for c in s if not unicodedata.combining(c))
    return s.lower().strip()

def snake_id(s):
    s = strip_accents_lower(s)
    s = re.sub(r"\s+", " ", s.strip())           # normalise espaces
    s = re.sub(r"[\s\.\-]+", "_", s)             # espaces/points/tirets -> _
    s = re.sub(r"[^a-z0-9_]", "_", s)            # autres -> _
    s = re.sub(r"_+", "_", s).strip("_") or "col"
    return s

def pg_ident(name: str, maxlen=63) -> str:
    """PostgreSQL ident max 63 chars."""
    return name[:maxlen]

def parse_number_like(s: pd.Series) -> pd.Series:
    # g√®re espace normal, ins√©cable et fine ins√©cable
    x = s.astype("string")\
         .str.replace("\u00A0", " ", regex=False)\
         .str.replace("\u202F", " ", regex=False)\
         .str.replace(" ", "", regex=False)
    x = x.str.replace(r"(?<=\d)\.(?=\d{3}(?:\D|$))", "", regex=True)\
         .str.replace(",", ".", regex=False)
    return pd.to_numeric(x, errors="coerce")

# --- header smarter ---
HEADER_KEYWORDS = (
    "date", "motif", "r√©dact", "redact", "trig", "nom", "pr√©nom", "prenom",
    "personnel", "site", "retrait", "√©dition", "edition"
)

def choose_header_row(df: pd.DataFrame, scan=25) -> int:
    limit = min(len(df), scan)
    header_candidate_idx = None
    header_candidate_hits = -1
    for i in range(limit):
        row = df.iloc[i].astype(str)
        hits = 0
        for v in row:
            t = strip_accents_lower(v)
            if t and any(k in t for k in HEADER_KEYWORDS):
                hits += 1
        if hits >= 2 and hits > header_candidate_hits:
            header_candidate_hits = hits
            header_candidate_idx = i
    if header_candidate_idx is not None:
        return header_candidate_idx
    best, idx = -1, 0
    for i in range(limit):
        row = df.iloc[i].astype(str)
        non_empty = row.map(lambda x: x.strip()!="").sum()
        texty = row.map(lambda x: bool(re.search(r"[A-Za-z√Ä-√ø]", x))).sum()
        score = non_empty*2 + texty
        if score>best: best, idx = score, i
    return idx

def normalize_columns(header_vals):
    cols, seen = [], {}
    for v in header_vals:
        s = snake_id(v) if (v is not None and str(v).strip()!="") else "col"
        seen[s] = seen.get(s,0)+1
        cols.append(s if seen[s]==1 else f"{s}_{seen[s]}")
    return cols

def drop_empty_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.dropna(axis=1, how="all")
    blank_cols = [c for c in df.columns
                  if df[c].isna().all() or df[c].astype(str).str.strip().eq("").all()]
    return df.drop(columns=blank_cols) if blank_cols else df

# --- inf√©rence minimale robuste ---
MAP_TRUE  = {"true","vrai","oui","yes","1","y","o"}
MAP_FALSE = {"false","faux","non","no","0","n"}

DATE_TOKEN_RE = re.compile(
    r"(?:\d{4}[-/\.]\d{1,2}[-/\.]\d{1,2})|(?:\d{1,2}[-/\.]\d{1,2}[-/\.]\d{2,4})|"
    r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|janv|f√©vr|fevr|avr|mai|juin|juil|sept|oct|nov|d[√©e]c)",
    re.I
)
ISO_DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}(?:[ T]\d{2}:\d{2}:\d{2})?$")
COLNAME_DATE_HINTS = ("date", "dt", "heure", "time")

def infer_col(s: pd.Series):
    s = s.copy()
    ss = s.astype("string").str.strip()
    ss = ss.where(~ss.fillna("").eq("0"), pd.NA)  # √©viter 0 -> 1970-01-01

    # üîí For√ßage par nom de colonne (date_de_retrait, date_d_attribution) sans warning
    if snake_id(s.name) in FORCE_DATE_COLS:
        val = ss.fillna("")
        iso_mask = val.str.match(ISO_DATE_RE)
        if iso_mask.mean() >= 0.5:
            # Majoritairement ISO -> pas de warning
            dt = pd.to_datetime(ss, errors="coerce", dayfirst=False)
        else:
            # Essaie les deux et garde le meilleur parse
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", message="Could not infer format.*", category=UserWarning)
                dt1 = pd.to_datetime(ss, errors="coerce", dayfirst=True,  cache=True)
                dt2 = pd.to_datetime(ss, errors="coerce", dayfirst=False, cache=True)
            dt = dt2 if dt2.notna().sum() > dt1.notna().sum() else dt1

        valid_dt = dt.dropna()
        has_time = (len(valid_dt) and (valid_dt.dt.time.astype(str) != "00:00:00").mean() > 0.2)
        return (dt, "DATETIME" if has_time else "DATE")

    # 0) ISO
    iso_mask = ss.fillna("").str.match(ISO_DATE_RE)
    if iso_mask.mean() >= 0.5:
        dt_iso = pd.to_datetime(ss.where(iso_mask), errors="coerce", dayfirst=False)
        ok = dt_iso.notna().mean() if len(ss) else 0.0
        if ok >= 0.5:
            valid_dt = dt_iso.dropna()
            has_time = (len(valid_dt) and (valid_dt.dt.time.astype(str) != "00:00:00").mean() > 0.2)
            return (dt_iso, "DATETIME" if has_time else "DATE")

    # 1) Excel serial
    as_num = pd.to_numeric(ss, errors="coerce")
    frac_num = as_num.notna().mean() if len(ss) else 0.0
    if frac_num >= 0.9:
        mask = as_num.between(60, 2950000)
        parsed = pd.to_datetime(as_num.where(mask), unit="D", origin="1899-12-30", errors="coerce")
        ok = parsed.notna().mean() if len(ss) else 0.0
        if ok >= 0.7:
            valid_dt = parsed.dropna()
            has_time = (len(valid_dt) and (valid_dt.dt.time.astype(str) != "00:00:00").mean() > 0.2)
            return (parsed, "DATETIME" if has_time else "DATE")

    # 2) tokens / nom de colonne (tol√©rant aux colonnes clairsem√©es)
    colname_hint = any(h in strip_accents_lower(s.name or "") for h in COLNAME_DATE_HINTS)
    date_token_ratio = ss.fillna("").str.contains(DATE_TOKEN_RE, na=False).mean()
    looks_datey = colname_hint or (date_token_ratio >= 0.30)
    if looks_datey:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="Could not infer format.*", category=UserWarning)
            dt1 = pd.to_datetime(ss, errors="coerce", dayfirst=True,  cache=True)
            dt2 = pd.to_datetime(ss, errors="coerce", dayfirst=False, cache=True)
        dt = dt2 if dt2.notna().sum() > dt1.notna().sum() else dt1

        n_all = len(ss)
        n_parsed = int(dt.notna().sum())
        nonempty_mask = ss.fillna("").str.strip().ne("")
        n_nonempty = int(nonempty_mask.sum())

        ok_abs = (n_parsed / n_all) if n_all else 0.0
        ok_nonempty = (n_parsed / n_nonempty) if n_nonempty else 0.0

        if colname_hint:
            accept = (n_parsed >= 3 and ok_nonempty >= 0.80) or (n_parsed >= 2 and ok_abs >= 0.20)
        else:
            accept = ok_abs >= 0.70

        if accept:
            valid_dt = dt.dropna()
            has_time = (len(valid_dt) and (valid_dt.dt.time.astype(str) != "00:00:00").mean() > 0.2)
            return (dt, "DATETIME" if has_time else "DATE")

    # 3) bool strict
    mb = ss.str.lower().map(lambda x: True if x in MAP_TRUE else (False if x in MAP_FALSE else pd.NA))
    if (mb.notna().mean() if len(ss) else 0) >= 0.98:
        return mb.astype("boolean"), "BOOL"

    # 4) nombre
    nums = parse_number_like(ss)
    if (nums.notna().mean() if len(ss.dropna()) else 0) >= 0.85:
        nz = nums.dropna()
        if len(nz) and np.isclose(nz, np.round(nz)).all():
            return nums.round().astype("Int64"), "INT"
        return nums.astype("Float64"), "FLOAT"

    # 5) texte
    return ss.astype("string"), "STRING"

def pg_type(tag: str):
    return {
        "DATE": satypes.Date(),
        "DATETIME": satypes.DateTime(),  # timestamp without time zone
        "INT": satypes.Integer(),
        "FLOAT": satypes.Float(precision=53),
        "BOOL": satypes.Boolean(),
        "STRING": satypes.Text()
    }.get(tag, satypes.Text())

# ================== DB WRITE ================== #
def write_df_to_postgres(engine, df: pd.DataFrame, table_name: str, schema_tags: dict):
    # cast pandas ‚Üí types s√ªrs
    for c, tag in schema_tags.items():
        if tag == "DATE":
            df[c] = pd.to_datetime(df[c], errors="coerce").dt.date
        elif tag == "DATETIME":
            df[c] = pd.to_datetime(df[c], errors="coerce")

    dtype_map = {c: pg_type(tag) for c, tag in schema_tags.items()}

    df.to_sql(
        name=table_name,
        con=engine,
        schema=TARGET_SCHEMA,
        if_exists=IF_EXISTS_MODE,   # "replace" ou "append"
        index=False,
        dtype=dtype_map,
        method="multi",
        chunksize=1000
    )

# ================== MAIN ================== #
def main():
    xlsx = Path(FILE_PATH)
    if not xlsx.exists():
        print(f"[error] Fichier introuvable: {xlsx}")
        return

    # Connexion DB (r√©utilise le 1er bloc ou env)
    try:
        engine = get_engine()
        with engine.connect() as conn:
            ver = conn.execute(text("select version();")).scalar()
            print("[ok] Connect√© √† PostgreSQL")
            print(ver)
    except (OperationalError, RuntimeError) as e:
        print("[error] Connexion PostgreSQL √©chou√©e:", e)
        return

    # Lecture multi-feuilles
    sheets = pd.read_excel(xlsx, sheet_name=None, engine="openpyxl", header=None)

    file_base = snake_id(xlsx.stem)
    done = 0

    for name, raw in sheets.items():
        raw = raw.dropna(how="all", axis=0).dropna(how="all", axis=1)
        if raw.empty:
            continue

        h = choose_header_row(raw, scan=25)
        cols = normalize_columns(raw.iloc[h].tolist())
        df = raw.iloc[h+1:].copy()
        df.columns = cols

        df = df.dropna(how="all")
        df = drop_empty_columns(df)
        if df.empty:
            continue

        schema = {}
        for c in df.columns:
            casted, tag = infer_col(df[c])
            df[c] = casted
            schema[c] = tag

        # Nom de table: public.<file> si 1 feuille, sinon public.<file>__<sheet>
        sheet_base = snake_id(name)
        table_name = file_base if len(sheets) == 1 else f"{file_base}__{sheet_base}"
        table_name = pg_ident(table_name)

        print(f"\n--- Feuille: {name} -> {TARGET_SCHEMA}.{table_name} ---")
        print(f"rows={len(df)} | cols={df.shape[1]}")
        print("Colonnes:", ", ".join(df.columns.astype(str)))
        print("\nSchema d√©tect√©:")
        for c in df.columns:
            pgt = type(pg_type(schema[c])).__name__.replace("TypeEngine", "")
            print(f"  - {c}: {schema[c]} -> {pgt}")

        if SHOW_SAMPLE:
            with pd.option_context("display.max_columns", 80, "display.width", 200):
                print("\nSample (top 8):")
                print(df.head(8))

        # √âcriture DB
        try:
            write_df_to_postgres(engine, df.copy(), table_name, schema)
            print(f"[ok] Inserted into {TARGET_SCHEMA}.{table_name}")
            done += 1
        except Exception as e:
            print(f"[error] √âchec insertion {TARGET_SCHEMA}.{table_name}: {e}")

    print(f"\nDone. Feuilles analys√©es & import√©es: {done}")

if __name__ == "__main__":
    main()


[ok] Connect√© √† PostgreSQL
PostgreSQL 17.6 (Debian 17.6-1.pgdg12+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 12.2.0-14+deb12u1) 12.2.0, 64-bit

--- Feuille: PDG -> public.fichier_qualite_des_trigrammes__pdg ---
rows=20 | cols=4
Colonnes: edition, date, motif, redacteur

Schema d√©tect√©:
  - edition: INT -> Integer
  - date: DATE -> Date
  - motif: STRING -> Text
  - redacteur: STRING -> Text

Sample (top 8):
    edition       date                                              motif    redacteur
9         1 2024-06-10                                           Cr√©ation  S. BELMONTE
10        2 2024-09-16                           Ajout nouveaux arrivants    Y. RAGEOT
11        3 2024-10-21                                    Ajout couturier    Y. RAGEOT
12        4 2024-10-28  Mise √† jour des dates d'entr√©e 
Ajout des habi...    Y. RAGEOT
13        5 2024-12-16  Ajout nouvelle arrivante - 1 personne - Feriel...  F.BOULHABEL
14        6 2024-12-18                              D

# Conditionsdepaiement.xlsx

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from pathlib import Path
import pandas as pd
import re, unicodedata
from datetime import datetime

# ========= CONFIG =========
FILE_PATH = r"C:\globasoft\aerotech\fic\Conditionsdepaiement.xlsx"
SHOW_SAMPLE = True

# ========= LOG =========
def now(): return datetime.now().strftime("%H:%M:%S")
def log(msg): print(f"[{now()}] {msg}", flush=True)

# ========= UTILS =========
def parse_number_like(series: pd.Series) -> pd.Series:
    """Normalise nombres FR/US et convertit en nombre (ou NaN)."""
    x = series.astype("string").str.replace("\u00A0", " ", regex=False).str.replace(" ", "", regex=False)
    x = x.str.replace(r"(?<=\d)\.(?=\d{3}(?:\D|$))", "", regex=True)  # retire s√©parateurs de milliers type 1.234
    x = x.str.replace(",", ".", regex=False)                          # virgule -> point
    return pd.to_numeric(x, errors="coerce")

# ========= MAIN =========
def main():
    xlsx = Path(FILE_PATH)
    log(f"Fichier: {xlsx}")
    if not xlsx.exists():
        log("[error] Fichier introuvable")
        return

    try:
        df0 = pd.read_excel(xlsx, sheet_name=0, header=None, engine="openpyxl")
    except Exception as e:
        log(f"[error] Lecture √©chou√©e: {e}")
        return

    log(f"Feuille[0]: shape={df0.shape}")

    # Purge lignes/colonnes totalement vides
    df = df0.dropna(how="all", axis=0).dropna(how="all", axis=1)
    log(f"Apr√®s drop vides: {df0.shape} -> {df.shape}")
    if df.empty:
        log("Feuille vide apr√®s nettoyage.")
        return

    # On s'attend √† 2 ou 3 colonnes: code, libell√©, (option) d√©lai
    if df.shape[1] >= 3:
        df = df.iloc[:, :3].copy()
        df.columns = ["code_condition", "libelle", "delai_source"]
        log("Colonnes d√©tect√©es: code_condition, libelle, delai_source")
    elif df.shape[1] == 2:
        df = df.iloc[:, :2].copy()
        df.columns = ["code_condition", "libelle"]
        log("Colonnes d√©tect√©es: code_condition, libelle (pas de colonne delai_source)")
    else:
        log("[warn] Moins de 2 colonnes non vides trouv√©es. Abandon.")
        return

    # Types de base + trims
    df["code_condition"] = parse_number_like(df["code_condition"]).astype("Int64")
    df["libelle"] = df["libelle"].astype("string").str.strip()

    # Colonne finale: delai_jours (Int64, nullable)
    df["delai_jours"] = pd.Series(pd.NA, index=df.index, dtype="Int64")

    # Si delai_source existe, l'utiliser (aucune d√©duction depuis libell√©)
    if "delai_source" in df.columns:
        d0 = parse_number_like(df["delai_source"]).astype("Int64")
        n_before = df["delai_jours"].notna().sum()
        df.loc[d0.notna(), "delai_jours"] = d0
        n_after = df["delai_jours"].notna().sum()
        log(f"Remplissage delai_jours depuis delai_source: +{int(n_after - n_before)} lignes")

    # Conserver UNIQUEMENT (code_condition, libelle, delai_jours)
    keep_cols = ["code_condition", "libelle", "delai_jours"]
    df = df[keep_cols]

    # Affichage
    print("\n--- Conditions de paiement (collecte) ---")
    print(f"rows={len(df)} | cols={df.shape[1]}")
    print("Colonnes:", ", ".join(df.columns.astype(str)))

    print("\nSch√©ma final:")
    print("  - code_condition : INT (Int64 nullable)")
    print("  - libelle        : STRING (string)")
    print("  - delai_jours    : INT (Int64 nullable)")

    if SHOW_SAMPLE:
        with pd.option_context("display.max_columns", 100, "display.width", 200):
            print("\nSample (top 15):")
            print(df.head(15))

    return df

if __name__ == "__main__":
    df_conditions = main()


# FNP AEC projets - source.xlsx

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from pathlib import Path
import pandas as pd
import numpy as np
import re, unicodedata, warnings
from datetime import datetime

# ============== CONFIG ==============
FILE_PATH = r"C:\globasoft\aerotech\fic\FNP AEC projets - source.xlsx"
SHOW_SAMPLE = True

# ============== LOG ==============
def now(): return datetime.now().strftime("%H:%M:%S")
def log(msg): print(f"[{now()}] {msg}", flush=True)

# ============== UTILS ==============
def strip_accents_lower(s):
    if s is None or (isinstance(s, float) and pd.isna(s)): return ""
    s = unicodedata.normalize("NFKD", str(s))
    s = "".join(c for c in s if not unicodedata.combining(c))
    return s.lower().strip()

def snake_id(s):
    s = strip_accents_lower(s)
    s = re.sub(r"[\s\.\-]+", "_", s)
    s = re.sub(r"[^a-z0-9_]", "_", s)
    return re.sub(r"_+", "_", s).strip("_") or "col"

def parse_number_like(series: pd.Series) -> pd.Series:
    x = series.astype("string").str.replace("\u00A0"," ", regex=False).str.replace(" ","", regex=False)
    x = x.str.replace(r"(?<=\d)\.(?=\d{3}(?:\D|$))","", regex=True)  # retire s√©parateurs de milliers
    x = x.str.replace(",",".", regex=False)                          # virgule -> point
    return pd.to_numeric(x, errors="coerce")

# ============== HEADER PICK + NORMALIZE ==============
HEADER_HINTS = (
    "projet","projets","code","id","date","mois","ann√©e","client","fournisseur","libelle","libell√©",
    "montant","quantite","qt√©","qte","prix","ttc","ht","statut","status","site","type","categorie",
    "echeance","√©ch√©ance","d√©lai","delai","commentaire","ref","r√©f"
)

def choose_header_row(df: pd.DataFrame, scan=25) -> int:
    """1) ligne contenant plusieurs mots-cl√©s; 2) densit√© + pr√©sence de texte."""
    limit = min(len(df), scan)
    cand_idx, cand_hits = None, -1
    for i in range(limit):
        row = df.iloc[i].astype(str)
        hits = 0
        for v in row:
            t = strip_accents_lower(v)
            if t and any(k in t for k in HEADER_HINTS):
                hits += 1
        if hits >= 2 and hits > cand_hits:
            cand_idx, cand_hits = i, hits
    if cand_idx is not None:
        return cand_idx

    best, idx = -1, 0
    for i in range(limit):
        row = df.iloc[i].astype(str)
        non_empty = row.map(lambda x: x.strip()!="").sum()
        texty = row.map(lambda x: bool(re.search(r"[A-Za-z√Ä-√ø]", x))).sum()
        score = non_empty*2 + texty
        if score > best: best, idx = score, i
    return idx

def normalize_columns(header_vals):
    cols, seen = [], {}
    for v in header_vals:
        s = snake_id(v) if (v is not None and str(v).strip()!="") else "col"
        seen[s] = seen.get(s,0)+1
        cols.append(s if seen[s]==1 else f"{s}_{seen[s]}")
    return cols

def drop_empty_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.dropna(axis=1, how="all")
    blank_cols = [c for c in df.columns if df[c].dropna().astype(str).str.strip().eq("").all()]
    if blank_cols:
        df = df.drop(columns=blank_cols)
    return df

# ============== INF√âRENCE TYPES (simple et robuste) ==============
MAP_TRUE  = {"true","vrai","oui","yes","1","y","o"}
MAP_FALSE = {"false","faux","non","no","0","n"}

DATE_TOKEN_RE = re.compile(
    r"(?:\d{4}[-/\.]\d{1,2}[-/\.]\d{1,2})|(?:\d{1,2}[-/\.]\d{1,2}[-/\.]\d{2,4})|"
    r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|janv|f√©vr|fevr|avr|mai|juin|juil|sept|oct|nov|d[√©e]c)",
    re.I
)
ISO_DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}(?:[ T]\d{2}:\d{2}:\d{2})?$")
COLNAME_DATE_HINTS = ("date","dt","heure","time","echeance","√©ch√©ance")

def excel_serial_to_datetime(series: pd.Series) -> pd.Series:
    """
    Conversion s√ªre des num√©ros de date Excel -> datetime.
    Fen√™tre 'safe' contrainte √† [60 .. 106750] jours:
    - 60  ~ 1900-03-01
    - 106750 ~ limite Timedelta pandas (~2173)
    Tout ce qui est hors-fen√™tre -> NaN AVANT conversion.
    """
    vals = pd.to_numeric(series, errors="coerce").astype("float64")
    vals[~np.isfinite(vals)] = np.nan

    lower, upper = 60.0, 106750.0
    mask = (vals >= lower) & (vals <= upper)
    n_out = int((~mask & ~pd.isna(vals)).sum())
    if n_out:
        log(f"[excel_serial_to_datetime] valeurs hors fen√™tre [{int(lower)}, {int(upper)}] remplac√©es par NaN: {n_out}")
    vals = vals.where(mask, np.nan)

    base = pd.Timestamp("1899-12-30")
    # conversion s√ªre (les NaN restent NaT)
    td = pd.to_timedelta(vals, unit="D", errors="coerce")
    dt = base + td
    return pd.to_datetime(dt, errors="coerce")

def infer_col(col: pd.Series):
    s = col.copy()
    ss = s.astype("string").str.strip()
    ss = ss.where(~ss.fillna("").eq("0"), pd.NA)  # √©viter 0 -> 1970-01-01

    # 0) ISO direct
    iso_mask = ss.fillna("").str.match(ISO_DATE_RE)
    if iso_mask.mean() >= 0.5:
        dt_iso = pd.to_datetime(ss.where(iso_mask), errors="coerce", dayfirst=False)
        ok = dt_iso.notna().mean() if len(ss) else 0.0
        if ok >= 0.5:
            has_time = (dt_iso.dt.time.astype(str) != "00:00:00").mean() > 0.2
            return (
                dt_iso.dt.strftime("%Y-%m-%d %H:%M:%S") if has_time else dt_iso.dt.strftime("%Y-%m-%d"),
                "DATETIME" if has_time else "DATE"
            )

    # 1) Excel serial plausibles (et majoritairement dans la fen√™tre s√ªre)
    as_num = pd.to_numeric(ss, errors="coerce")
    if len(ss):
        safe_mask = (as_num >= 60) & (as_num <= 106750)
        safe_ratio = safe_mask.mean()
    else:
        safe_ratio = 0.0

    if (as_num.notna().mean() if len(ss) else 0.0) >= 0.9 and safe_ratio >= 0.7:
        parsed = excel_serial_to_datetime(as_num)
        ok = parsed.notna().mean() if len(ss) else 0.0
        if ok >= 0.7:
            has_time = (parsed.dt.time.astype(str) != "00:00:00").mean() > 0.2
            return (
                parsed.dt.strftime("%Y-%m-%d %H:%M:%S") if has_time else parsed.dt.strftime("%Y-%m-%d"),
                "DATETIME" if has_time else "DATE"
            )

    # 2) tokens de date / nom ‚Äúdatey‚Äù
    colname_hint = any(h in strip_accents_lower(s.name or "") for h in COLNAME_DATE_HINTS)
    date_token_ratio = ss.fillna("").str.contains(DATE_TOKEN_RE, na=False).mean()
    looks_datey = colname_hint or (date_token_ratio >= 0.30)
    if looks_datey:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="Could not infer format.*", category=UserWarning)
            dt1 = pd.to_datetime(ss, errors="coerce", dayfirst=True,  cache=True)
            dt2 = pd.to_datetime(ss, errors="coerce", dayfirst=False, cache=True)
        dt = dt2 if dt2.notna().sum() > dt1.notna().sum() else dt1
        ok = dt.notna().mean() if len(ss) else 0.0
        ok_thresh = 0.50 if colname_hint else 0.70
        if ok >= ok_thresh:
            has_time = (dt.dt.time.astype(str) != "00:00:00").mean() > 0.2
            return (
                pd.to_datetime(dt).dt.strftime("%Y-%m-%d %H:%M:%S") if has_time else pd.to_datetime(dt).dt.strftime("%Y-%m-%d"),
                "DATETIME" if has_time else "DATE"
            )

    # 3) bool strict
    mb = ss.str.lower().map(lambda x: True if x in MAP_TRUE else (False if x in MAP_FALSE else pd.NA))
    if (mb.notna().mean() if len(ss) else 0) >= 0.98:
        return mb.astype("boolean"), "BOOL"

    # 4) nombre
    nums = parse_number_like(ss)
    if (nums.notna().mean() if len(ss.dropna()) else 0) >= 0.85:
        nz = nums.dropna()
        if len(nz) and (np.mod(nz, 1) == 0).all():
            return nums.astype("Int64"), "INT"
        return nums.astype("Float64"), "FLOAT"

    # 5) texte
    return ss.astype("string"), "STRING"

def pg_type(tag: str) -> str:
    return {
        "DATE":"date","DATETIME":"timestamp without time zone",
        "INT":"integer","FLOAT":"double precision","BOOL":"boolean"
    }.get(tag, "text")

# ============== MAIN ==============
def main():
    xlsx = Path(FILE_PATH)
    log(f"Fichier: {xlsx}")
    if not xlsx.exists():
        log("[error] Fichier introuvable"); return

    try:
        sheets = pd.read_excel(xlsx, sheet_name=None, engine="openpyxl", header=None)
    except Exception as e:
        log(f"[error] Lecture √©chou√©e: {e}"); return

    done = 0
    for name, raw in sheets.items():
        log(f"\n[feuille] {name}: shape initiale={raw.shape}")

        # nettoyage vide
        raw = raw.dropna(how="all", axis=0).dropna(how="all", axis=1)
        log(f"[feuille] {name}: apr√®s drop vides -> {raw.shape}")
        if raw.empty: 
            log(f"[feuille] {name}: vide, on passe.")
            continue

        # header + normalisation
        h = choose_header_row(raw, scan=25)
        log(f"[feuille] {name}: header choisi √† la ligne {h}")
        cols = normalize_columns(raw.iloc[h].tolist())
        log(f"[feuille] {name}: colonnes normalis√©es -> {', '.join(cols)}")
        df = raw.iloc[h+1:].copy()
        df.columns = cols
        df = df.dropna(how="all")
        df = drop_empty_columns(df)
        log(f"[feuille] {name}: shape apr√®s nettoyage colonnes -> {df.shape}")
        if df.empty:
            log(f"[feuille] {name}: vide apr√®s normalisation, on passe.")
            continue

        # inf√©rence types
        schema = {}
        for c in df.columns:
            casted, tag = infer_col(df[c])
            df[c] = casted
            schema[c] = tag

        base = f"{snake_id(xlsx.stem)}.{snake_id(name)}"
        print(f"\n--- Feuille: {name} -> {base} ---")
        print(f"rows={len(df)} | cols={df.shape[1]}")
        print("Colonnes:", ", ".join(df.columns.astype(str)))
        print("\nSchema d√©tect√©:")
        for c in df.columns:
            print(f"  - {c}: {schema[c]}  ->  {pg_type(schema[c])}")

        if SHOW_SAMPLE:
            with pd.option_context("display.max_columns", 120, "display.width", 220):
                print("\nSample (top 10):")
                print(df.head(10))

        done += 1

    print(f"\nDone. Feuilles analys√©es: {done}")

if __name__ == "__main__":
    main()


# Projets AEC AE 2025 - source.xlsx

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from pathlib import Path
import pandas as pd
import numpy as np
import re, unicodedata, warnings
from datetime import datetime

# ============== CONFIG ==============
FILE_PATH = r"C:\globasoft\aerotech\fic\Projets AEC AE 2025 - source.xlsx"
SHOW_SAMPLE = True
STRICT_SOURCE_SCHEMA = True  # Afficher uniquement le sch√©ma strict du fichier

# ============== LOG ==============
def now(): return datetime.now().strftime("%H:%M:%S")
def log(msg): print(f"[{now()}] {msg}", flush=True)

# ============== UTILS ==============
def strip_accents_lower(s):
    if s is None or (isinstance(s, float) and pd.isna(s)): return ""
    s = unicodedata.normalize("NFKD", str(s))
    s = "".join(c for c in s if not unicodedata.combining(c))
    return s.lower().strip()

def snake_id(s):
    s = strip_accents_lower(s)
    s = re.sub(r"[\s\.\-]+", "_", s)
    s = re.sub(r"[^a-z0-9_]", "_", s)
    return re.sub(r"_+", "_", s).strip("_") or "col"

def make_unique_columns(cols):
    """Rend les noms uniques en suffixant _2, _3... en cas de doublons."""
    out, seen = [], {}
    for c in cols:
        if c not in seen:
            seen[c] = 1
            out.append(c)
        else:
            seen[c] += 1
            out.append(f"{c}_{seen[c]}")
    return out

def parse_number_like(series: pd.Series) -> pd.Series:
    x = series.astype("string").str.replace("\u00A0"," ", regex=False).str.replace(" ","", regex=False)
    # retire s√©parateurs de milliers type "." entre chiffres
    x = x.str.replace(r"(?<=\d)\.(?=\d{3}(?:\D|$))","", regex=True)
    # virgule d√©cimale -> point
    x = x.str.replace(",",".", regex=False)
    return pd.to_numeric(x, errors="coerce")

def drop_empty_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.dropna(axis=1, how="all")
    blank_idx = []
    for j in range(df.shape[1]):
        s = df.iloc[:, j]
        if s.dropna().astype(str).str.strip().eq("").all():
            blank_idx.append(j)
    if blank_idx:
        log(f"[drop_empty_columns] suppression colonnes blanches (pos): {blank_idx}")
        df = df.drop(df.columns[blank_idx], axis=1)
    return df

# ============== HEADER PICK ==============
HEADER_HINTS = (
    "projet","projets","code","id","r√©f","ref","libelle","libell√©","intitul√©",
    "client","fournisseur","chef de projet","cp","cdp","manager",
    "statut","status","phase","categorie","cat√©gorie","type",
    "date debut","date d√©but","debut","d√©but","date fin","fin","echeance","√©ch√©ance",
    "budget","montant","cout","co√ªt","prix","ht","ttc",
    "avancement","progress","%","taux",
    "site","section analytique","analytique","commentaire","p√©riode","periode",
    "plan","niveau","sop","ann√©e","mois","anneemois","nature","compte","cr√©dit","debit","solde"
)

def choose_header_row(df: pd.DataFrame, scan=25) -> int:
    limit = min(len(df), scan)
    cand_idx, cand_hits = None, -1
    for i in range(limit):
        row = df.iloc[i].astype(str)
        hits = 0
        for v in row:
            t = strip_accents_lower(v)
            if t and any(k in t for k in HEADER_HINTS):
                hits += 1
        if hits >= 2 and hits > cand_hits:
            cand_idx, cand_hits = i, hits
    if cand_idx is not None:
        return cand_idx

    best, idx = -1, 0
    for i in range(limit):
        row = df.iloc[i].astype(str)
        non_empty = row.map(lambda x: x.strip()!="").sum()
        texty = row.map(lambda x: bool(re.search(r"[A-Za-z√Ä-√ø]", x))).sum()
        score = non_empty*2 + texty
        if score > best: best, idx = score, i
    return idx

# ============== INF√âRENCE TYPES (‚Üí PostgreSQL) ==============
MAP_TRUE  = {"true","vrai","oui","yes","1","y","o"}
MAP_FALSE = {"false","faux","non","no","0","n"}

DATE_TOKEN_RE = re.compile(
    r"(?:\d{4}[-/\.]\d{1,2}[-/\.]\d{1,2})|(?:\d{1,2}[-/\.]\d{1,2}[-/\.]\d{2,4})|"
    r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|janv|f√©vr|fevr|avr|mai|juin|juil|sept|oct|nov|d[√©e]c)",
    re.I
)
ISO_DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}(?:[ T]\d{2}:\d{2}:\d{2})?$")
COLNAME_DATE_HINTS = ("date","dt","heure","time","echeance","√©ch√©ance","debut","d√©but","fin","p√©riode","periode")

AMOUNT_NAME_HINTS = (
    "montant","debit","d√©bit","credit","cr√©dit","solde","budget","ht","ttc",
    "amount","total","prix","co√ªt","cout"
)
CODE_LIKE_HINTS = ("code","nature","plan","compte","sop","section","niveau")

def colname_has(hints, name):
    n = strip_accents_lower(name or "")
    return any(h in n for h in hints)

def excel_serial_to_datetime(series: pd.Series) -> pd.Series:
    vals = pd.to_numeric(series, errors="coerce").astype("float64")
    vals[~np.isfinite(vals)] = np.nan
    # fen√™tre s√ªre Excel (‚âà 1899-12-30 + jours)
    lower, upper = 60.0, 60000.0  # upper ~ 2064
    mask = (vals >= lower) & (vals <= upper)
    n_out = int((~mask & ~pd.isna(vals)).sum())
    if n_out:
        log(f"[excel_serial_to_datetime] valeurs hors fen√™tre [{int(lower)}, {int(upper)}] -> NaN: {n_out}")
    vals = vals.where(mask, np.nan)
    base = pd.Timestamp("1899-12-30")
    td = pd.to_timedelta(vals, unit="D", errors="coerce")
    dt = base + td
    return pd.to_datetime(dt, errors="coerce")

def infer_col(col: pd.Series):
    """
    Retourne (serie_casted, tag) avec tag ‚àà {"DATE","DATETIME","INT","FLOAT","BOOL","STRING"}.
    """
    s = col.copy()
    ss = s.astype("string").str.strip()

    # 0) ISO direct
    iso_mask = ss.fillna("").str.match(ISO_DATE_RE)
    if len(ss) and iso_mask.mean() >= 0.5:
        dt_iso = pd.to_datetime(ss.where(iso_mask), errors="coerce", dayfirst=False)
        ok = dt_iso.notna().mean()
        if ok >= 0.5:
            has_time = (dt_iso.dt.time.astype(str) != "00:00:00").mean() > 0.2
            return (
                dt_iso.dt.strftime("%Y-%m-%d %H:%M:%S") if has_time else dt_iso.dt.strftime("%Y-%m-%d"),
                "DATETIME" if has_time else "DATE"
            )

    # 1) Excel serial dates (restreint)
    as_num = pd.to_numeric(ss, errors="coerce")
    if len(ss):
        safe_mask = (as_num >= 60) & (as_num <= 60000)
        safe_ratio = safe_mask.mean()
    else:
        safe_ratio = 0.0

    colname_hint = colname_has(COLNAME_DATE_HINTS, s.name)
    high_vals_ratio = (as_num >= 20000).mean() if len(ss) else 0.0  # 20000 ~ 1954

    if (as_num.notna().mean() if len(ss) else 0.0) >= 0.9 and safe_ratio >= 0.7 and (colname_hint or high_vals_ratio >= 0.2):
        parsed = excel_serial_to_datetime(as_num)
        ok = parsed.notna().mean() if len(ss) else 0.0
        if ok >= 0.7:
            has_time = (parsed.dt.time.astype(str) != "00:00:00").mean() > 0.2
            return (
                parsed.dt.strftime("%Y-%m-%d %H:%M:%S") if has_time else parsed.dt.strftime("%Y-%m-%d"),
                "DATETIME" if has_time else "DATE"
            )

    # 2) tokens de date / nom ‚Äúdatey‚Äù
    date_token_ratio = ss.fillna("").str.contains(DATE_TOKEN_RE, na=False).mean()
    looks_datey = colname_hint or (date_token_ratio >= 0.30)
    if looks_datey:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="Could not infer format.*", category=UserWarning)
            dt1 = pd.to_datetime(ss, errors="coerce", dayfirst=True,  cache=True)
            dt2 = pd.to_datetime(ss, errors="coerce", dayfirst=False, cache=True)
        dt = dt2 if dt2.notna().sum() > dt1.notna().sum() else dt1
        ok = dt.notna().mean() if len(ss) else 0.0
        ok_thresh = 0.50 if colname_hint else 0.70
        if ok >= ok_thresh:
            has_time = (dt.dt.time.astype(str) != "00:00:00").mean() > 0.2
            return (
                pd.to_datetime(dt).dt.strftime("%Y-%m-%d %H:%M:%S") if has_time else pd.to_datetime(dt).dt.strftime("%Y-%m-%d"),
                "DATETIME" if has_time else "DATE"
            )

    # 3) bool strict
    mb = ss.str.lower().map(lambda x: True if x in MAP_TRUE else (False if x in MAP_FALSE else pd.NA))
    if len(ss) and mb.notna().mean() >= 0.98:
        return mb.astype("boolean"), "BOOL"

    # 3bis) colonnes mon√©taires : forcer num√©ric si le nom l'indique
    if colname_has(AMOUNT_NAME_HINTS, s.name):
        nums_hint = parse_number_like(ss)
        if nums_hint.notna().any():
            nz = nums_hint.dropna()
            if len(nz) and (np.mod(nz, 1) == 0).all():
                return nums_hint.astype("Int64"), "INT"
            return nums_hint.astype("Float64"), "FLOAT"

    # 3ter) colonnes "code-like" : garder du texte
    if colname_has(CODE_LIKE_HINTS, s.name):
        return ss.astype("string"), "STRING"

    # 4) nombre g√©n√©rique
    nums = parse_number_like(ss)
    if (nums.notna().mean() if len(ss) else 0) >= 0.85:
        nz = nums.dropna()
        if len(nz) and (np.mod(nz, 1) == 0).all():
            return nums.astype("Int64"), "INT"
        return nums.astype("Float64"), "FLOAT"

    # 5) texte
    return ss.astype("string"), "STRING"

def pg_type(tag: str) -> str:
    return {
        "DATE":"date",
        "DATETIME":"timestamp without time zone",
        "INT":"integer",
        "FLOAT":"double precision",
        "BOOL":"boolean",
        "STRING":"text"
    }.get(tag, "text")

# ============== MAIN ==============
def main():
    xlsx = Path(FILE_PATH)
    log(f"Fichier: {xlsx}")
    if not xlsx.exists():
        log("[error] Fichier introuvable"); return

    try:
        sheets = pd.read_excel(xlsx, sheet_name=None, engine="openpyxl", header=None)
    except Exception as e:
        log(f"[error] Lecture √©chou√©e: {e}"); return

    done = 0
    for name, raw in sheets.items():
        log(f"\n[feuille] {name}: shape initiale={raw.shape}")

        # nettoyage vide
        raw = raw.dropna(how="all", axis=0).dropna(how="all", axis=1)
        log(f"[feuille] {name}: apr√®s drop vides -> {raw.shape}")
        if raw.empty:
            log(f"[feuille] {name}: vide, on passe.")
            continue

        # rep√®re l'ent√™te dans le brut (sans remap)
        h = choose_header_row(raw, scan=25)
        log(f"[feuille] {name}: header choisi √† la ligne {h}")

        # ====== VUE SOURCE STRICTE (sch√©ma du fichier) ======
        df_src = raw.iloc[h+1:].copy()
        src_cols_original = raw.iloc[h].tolist()  # libell√©s EXACTS du fichier
        df_src.columns = src_cols_original
        df_src = df_src.dropna(how="all").copy()
        df_src = drop_empty_columns(df_src)
        log(f"[feuille] {name}: shape apr√®s affectation des en-t√™tes -> {df_src.shape}")
        if df_src.empty:
            log(f"[feuille] {name}: vide apr√®s normalisation, on passe.")
            continue

        # noms PostgreSQL (snake_case) correspondants aux libell√©s Excel
        pg_cols = make_unique_columns([snake_id(c) for c in df_src.columns])

        # inf√©rence STRICTE sur colonnes d'origine (sans ajout ni remap)
        schema_src = {}
        casted_df = df_src.copy()
        for c in list(df_src.columns):
            casted, tag = infer_col(df_src[c])
            casted_df[c] = casted
            schema_src[c] = tag

        # affichage STRICT : sch√©ma = fichier, avec mapping vers noms pg + types pg
        base = f"{snake_id(xlsx.stem)}.{snake_id(name)}"
        print(f"\n--- Feuille: {name} -> {base} (SCH√âMA SOURCE STRICT / PostgreSQL) ---")
        print(f"rows={len(casted_df)} | cols={casted_df.shape[1]}")

        print("\nMapping colonnes (Excel ‚Üí nom_pg) :")
        for src_name, pg_name in zip(df_src.columns, pg_cols):
            print(f"  - {src_name}  ->  {pg_name}")

        print("\nSchema d√©tect√© (types PostgreSQL) :")
        for src_name, pg_name in zip(df_src.columns, pg_cols):
            tag = schema_src.get(src_name, "STRING")
            print(f"  - {pg_name}: {pg_type(tag)}  (source: {src_name}, type d√©tect√©: {tag})")

        if SHOW_SAMPLE:
            with pd.option_context("display.max_columns", 140, "display.width", 240):
                print("\nSample (top 10) :")
                print(casted_df.head(10))

        done += 1

        # ====== (OPTIONNEL) VUE CIBLE : si tu veux une table pr√™te IFS, remets √† False STRICT_SOURCE_SCHEMA et ajoute ta logique ======
        if not STRICT_SOURCE_SCHEMA:
            pass  # tu peux r√©ins√©rer ici ta vue cible/mapping si besoin

    print(f"\nDone. Feuilles analys√©es: {done}")

if __name__ == "__main__":
    main()


# Relev√© pointages AEB.xlsx

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from pathlib import Path
import pandas as pd
import numpy as np
import re, unicodedata, warnings
from datetime import datetime

# ============== CONFIG ==============
FILE_PATH = r"C:\globasoft\aerotech\fic\Relev√© pointages AEB.xlsx"  # <== adapte si besoin
SHOW_SAMPLE = True
STRICT_SOURCE_SCHEMA = True  # sch√©ma = EXACT du fichier (pas d'ajout/remap)

# ============== LOG ==============
def now(): return datetime.now().strftime("%H:%M:%S")
def log(msg): print(f"[{now()}] {msg}", flush=True)

# ============== UTILS ==============
def strip_accents_lower(s):
    if s is None or (isinstance(s, float) and pd.isna(s)): return ""
    s = unicodedata.normalize("NFKD", str(s))
    s = "".join(c for c in s if not unicodedata.combining(c))
    return s.lower().strip()

def snake_id(s):
    s = strip_accents_lower(s)
    s = re.sub(r"[\s\.\-]+", "_", s)
    s = re.sub(r"[^a-z0-9_]", "_", s)
    return re.sub(r"_+", "_", s).strip("_") or "col"

def make_unique_columns(cols):
    out, seen = [], {}
    for c in cols:
        if c not in seen:
            seen[c] = 1
            out.append(c)
        else:
            seen[c] += 1
            out.append(f"{c}_{seen[c]}")
    return out

def parse_number_like(series: pd.Series) -> pd.Series:
    x = series.astype("string").str.replace("\u00A0"," ", regex=False).str.replace(" ","", regex=False)
    x = x.str.replace(r"(?<=\d)\.(?=\d{3}(?:\D|$))","", regex=True)  # 1.234 -> 1234 (milliers)
    x = x.str.replace(",",".", regex=False)  # virgule d√©cimale -> point
    return pd.to_numeric(x, errors="coerce")

def drop_empty_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.dropna(axis=1, how="all")
    blank_idx = []
    for j in range(df.shape[1]):
        s = df.iloc[:, j]
        if s.dropna().astype(str).str.strip().eq("").all():
            blank_idx.append(j)
    if blank_idx:
        log(f"[drop_empty_columns] suppression colonnes blanches (pos): {blank_idx}")
        df = df.drop(df.columns[blank_idx], axis=1)
    return df

# ============== HEADER PICK ==============
HEADER_HINTS = (
    # g√©n√©riques
    "projet","projets","code","id","r√©f","ref","libelle","libell√©","intitul√©",
    "client","fournisseur","chef de projet","cp","cdp","manager",
    "statut","status","phase","categorie","cat√©gorie","type",
    "date","heure","time","p√©riode","periode","echeance","√©ch√©ance",
    "budget","montant","cout","co√ªt","prix","ht","ttc","avancement","progress","%","taux",
    "site","section analytique","analytique","commentaire","observations","notes","remarques",
    # sp√©cifiques vus dans le fichier de pointage
    "trigramme","nom","cat pointage","n¬∞ dossier","categorie","ot","ata","type de t√¢che","commentaires",
    "bu","immat","wp","semaine","mois","annee","ann√©e","cat point","annee - semaine","mois."
)

def choose_header_row(df: pd.DataFrame, scan=25) -> int:
    limit = min(len(df), scan)
    cand_idx, cand_hits = None, -1
    for i in range(limit):
        row = df.iloc[i].astype(str)
        hits = 0
        for v in row:
            t = strip_accents_lower(v)
            if t and any(k in t for k in HEADER_HINTS):
                hits += 1
        if hits >= 2 and hits > cand_hits:
            cand_idx, cand_hits = i, hits
    if cand_idx is not None:
        return cand_idx

    best, idx = -1, 0
    for i in range(limit):
        row = df.iloc[i].astype(str)
        non_empty = row.map(lambda x: x.strip()!="").sum()
        texty = row.map(lambda x: bool(re.search(r"[A-Za-z√Ä-√ø]", x))).sum()
        score = non_empty*2 + texty
        if score > best: best, idx = score, i
    return idx

# ============== INF√âRENCE TYPES (‚Üí PostgreSQL) ==============
MAP_TRUE  = {"true","vrai","oui","yes","1","y","o"}
MAP_FALSE = {"false","faux","non","no","0","n"}

DATE_TOKEN_RE = re.compile(
    r"(?:\d{4}[-/\.]\d{1,2}[-/\.]\d{1,2})|(?:\d{1,2}[-/\.]\d{1,2}[-/\.]\d{2,4})|"
    r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|janv|f√©vr|fevr|avr|mai|juin|juil|sept|oct|nov|d[√©e]c)",
    re.I
)
ISO_DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}(?:[ T]\d{2}:\d{2}:\d{2})?$")
COLNAME_DATE_HINTS = ("date","dt","heure","time","echeance","√©ch√©ance","debut","d√©but","fin","p√©riode","periode","semaine","mois","ann√©e","annee")

AMOUNT_NAME_HINTS = (
    "montant","debit","d√©bit","credit","cr√©dit","solde","budget","ht","ttc",
    "amount","total","prix","co√ªt","cout","heures","h"
)
CODE_LIKE_HINTS = ("code","nature","plan","compte","sop","section","niveau","trigramme","immat","ot","ata","wp","n¬∞ dossier","n_dossier","dossier","cat point","cat pointage")

def colname_has(hints, name):
    n = strip_accents_lower(name or "")
    return any(h in n for h in hints)

def excel_serial_to_datetime(series: pd.Series) -> pd.Series:
    vals = pd.to_numeric(series, errors="coerce").astype("float64")
    vals[~np.isfinite(vals)] = np.nan
    lower, upper = 60.0, 60000.0  # ~ 1900..2064
    mask = (vals >= lower) & (vals <= upper)
    n_out = int((~mask & ~pd.isna(vals)).sum())
    if n_out:
        log(f"[excel_serial_to_datetime] valeurs hors fen√™tre [{int(lower)}, {int(upper)}] -> NaN: {n_out}")
    base = pd.Timestamp("1899-12-30")
    td = pd.to_timedelta(vals.where(mask, np.nan), unit="D", errors="coerce")
    dt = base + td
    return pd.to_datetime(dt, errors="coerce")

def infer_col(col: pd.Series):
    """
    Retourne (serie_casted, tag) avec tag ‚àà {"DATE","DATETIME","INT","FLOAT","BOOL","STRING"}.
    """
    s = col.copy()
    ss = s.astype("string").str.strip()

    # 0) ISO direct
    iso_mask = ss.fillna("").str.match(ISO_DATE_RE)
    if len(ss) and iso_mask.mean() >= 0.5:
        dt_iso = pd.to_datetime(ss.where(iso_mask), errors="coerce", dayfirst=False)
        ok = dt_iso.notna().mean()
        if ok >= 0.5:
            has_time = (dt_iso.dt.time.astype(str) != "00:00:00").mean() > 0.2
            return (
                dt_iso.dt.strftime("%Y-%m-%d %H:%M:%S") if has_time else dt_iso.dt.strftime("%Y-%m-%d"),
                "DATETIME" if has_time else "DATE"
            )

    # 1) Excel serial dates (restreint)
    as_num = pd.to_numeric(ss, errors="coerce")
    safe_mask = (as_num >= 60) & (as_num <= 60000)
    safe_ratio = safe_mask.mean() if len(ss) else 0.0

    colname_hint = colname_has(COLNAME_DATE_HINTS, s.name)
    high_vals_ratio = (as_num >= 20000).mean() if len(ss) else 0.0  # 20000 ~ 1954

    if (as_num.notna().mean() if len(ss) else 0.0) >= 0.9 and safe_ratio >= 0.7 and (colname_hint or high_vals_ratio >= 0.2):
        parsed = excel_serial_to_datetime(as_num)
        ok = parsed.notna().mean() if len(ss) else 0.0
        if ok >= 0.7:
            has_time = (parsed.dt.time.astype(str) != "00:00:00").mean() > 0.2
            return (
                parsed.dt.strftime("%Y-%m-%d %H:%M:%S") if has_time else parsed.dt.strftime("%Y-%m-%d"),
                "DATETIME" if has_time else "DATE"
            )

    # 2) tokens de date / nom ‚Äúdatey‚Äù
    date_token_ratio = ss.fillna("").str.contains(DATE_TOKEN_RE, na=False).mean()
    looks_datey = colname_hint or (date_token_ratio >= 0.30)
    if looks_datey:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="Could not infer format.*", category=UserWarning)
            dt1 = pd.to_datetime(ss, errors="coerce", dayfirst=True,  cache=True)
            dt2 = pd.to_datetime(ss, errors="coerce", dayfirst=False, cache=True)
        dt = dt2 if dt2.notna().sum() > dt1.notna().sum() else dt1
        ok = dt.notna().mean() if len(ss) else 0.0
        ok_thresh = 0.50 if colname_hint else 0.70
        if ok >= ok_thresh:
            has_time = (dt.dt.time.astype(str) != "00:00:00").mean() > 0.2
            return (
                pd.to_datetime(dt).dt.strftime("%Y-%m-%d %H:%M:%S") if has_time else pd.to_datetime(dt).dt.strftime("%Y-%m-%d"),
                "DATETIME" if has_time else "DATE"
            )

    # 3) bool strict
    mb = ss.str.lower().map(lambda x: True if x in MAP_TRUE else (False if x in MAP_FALSE else pd.NA))
    if len(ss) and mb.notna().mean() >= 0.98:
        return mb.astype("boolean"), "BOOL"

    # 3bis) colonnes mon√©taires / heures : forcer num√©rique si le nom l'indique
    if colname_has(AMOUNT_NAME_HINTS, s.name):
        nums_hint = parse_number_like(ss)
        if nums_hint.notna().any():
            nz = nums_hint.dropna()
            if len(nz) and (np.mod(nz, 1) == 0).all():
                return nums_hint.astype("Int64"), "INT"
            return nums_hint.astype("Float64"), "FLOAT"

    # 3ter) colonnes "code-like" : garder du texte
    if colname_has(CODE_LIKE_HINTS, s.name):
        return ss.astype("string"), "STRING"

    # 4) nombre g√©n√©rique
    nums = parse_number_like(ss)
    if (nums.notna().mean() if len(ss) else 0) >= 0.85:
        nz = nums.dropna()
        if len(nz) and (np.mod(nz, 1) == 0).all():
            return nums.astype("Int64"), "INT"
        return nums.astype("Float64"), "FLOAT"

    # 5) texte
    return ss.astype("string"), "STRING"

def pg_type(tag: str) -> str:
    return {
        "DATE":"date",
        "DATETIME":"timestamp without time zone",
        "INT":"integer",
        "FLOAT":"double precision",
        "BOOL":"boolean",
        "STRING":"text"
    }.get(tag, "text")

# ============== MAIN ==============
def main():
    xlsx = Path(FILE_PATH)
    log(f"Fichier: {xlsx}")
    if not xlsx.exists():
        log("[error] Fichier introuvable"); return

    try:
        sheets = pd.read_excel(xlsx, sheet_name=None, engine="openpyxl", header=None)
    except Exception as e:
        log(f"[error] Lecture √©chou√©e: {e}"); return

    done = 0
    for name, raw in sheets.items():
        log(f"\n[feuille] {name}: shape initiale={raw.shape}")

        # nettoyage vide
        raw = raw.dropna(how="all", axis=0).dropna(how="all", axis=1)
        log(f"[feuille] {name}: apr√®s drop vides -> {raw.shape}")
        if raw.empty:
            log(f"[feuille] {name}: vide, on passe.")
            continue

        # rep√®re l'ent√™te dans le brut (sans remap)
        h = choose_header_row(raw, scan=25)
        log(f"[feuille] {name}: header choisi √† la ligne {h}")

        # ====== VUE SOURCE STRICTE (sch√©ma du fichier) ======
        df_src = raw.iloc[h+1:].copy()
        src_cols_original = raw.iloc[h].tolist()  # libell√©s EXACTS du fichier
        df_src.columns = src_cols_original
        df_src = df_src.dropna(how="all").copy()
        df_src = drop_empty_columns(df_src)
        log(f"[feuille] {name}: shape apr√®s affectation des en-t√™tes -> {df_src.shape}")
        if df_src.empty:
            log(f"[feuille] {name}: vide apr√®s normalisation, on passe.")
            continue

        # noms PostgreSQL (snake_case) correspondants aux libell√©s Excel
        pg_cols = make_unique_columns([snake_id(c) for c in df_src.columns])

        # inf√©rence STRICTE sur colonnes d'origine (sans ajout ni remap)
        schema_src = {}
        casted_df = df_src.copy()
        for c in list(df_src.columns):
            casted, tag = infer_col(df_src[c])
            casted_df[c] = casted
            schema_src[c] = tag

        # affichage STRICT : sch√©ma = fichier, avec mapping vers noms pg + types pg
        base = f"{snake_id(xlsx.stem)}.{snake_id(name)}"
        print(f"\n--- Feuille: {name} -> {base} (SCH√âMA SOURCE STRICT / PostgreSQL) ---")
        print(f"rows={len(casted_df)} | cols={casted_df.shape[1]}")

        print("\nMapping colonnes (Excel ‚Üí nom_pg) :")
        for src_name, pg_name in zip(df_src.columns, pg_cols):
            print(f"  - {src_name}  ->  {pg_name}")

        print("\nSchema d√©tect√© (types PostgreSQL) :")
        for src_name, pg_name in zip(df_src.columns, pg_cols):
            tag = schema_src.get(src_name, "STRING")
            print(f"  - {pg_name}: {pg_type(tag)}  (source: {src_name}, type d√©tect√©: {tag})")

        if SHOW_SAMPLE:
            with pd.option_context("display.max_columns", 180, "display.width", 260):
                print("\nSample (top 10) :")
                print(casted_df.head(10))

        done += 1

        # ====== (OPTIONNEL) VUE CIBLE : si tu veux une table cible, remets STRICT_SOURCE_SCHEMA=False et impl√©mente ici ======
        if not STRICT_SOURCE_SCHEMA:
            pass

    print(f"\nDone. Feuilles analys√©es: {done}")

if __name__ == "__main__":
    main()


# Suivi Projets AEC - source.xlsx

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from pathlib import Path
import pandas as pd
import numpy as np
import re, unicodedata, warnings
from datetime import datetime

# ============== CONFIG ==============
FILE_PATH = r"C:\globasoft\aerotech\fic\Suivi Projets AEC - source.xlsx"
SHOW_SAMPLE = True

# (optionnel) masquer un warning bruyant de pandas
warnings.filterwarnings("ignore", message=r"Parsing dates in %Y-%m-%d %H:%M:%S format.*", category=UserWarning)

# ============== LOG ==============
def now(): return datetime.now().strftime("%H:%M:%S")
def log(msg): print(f"[{now()}] {msg}", flush=True)

# ============== UTILS ==============
def strip_accents_lower(s):
    if s is None or (isinstance(s, float) and pd.isna(s)): return ""
    s = unicodedata.normalize("NFKD", str(s))
    s = "".join(c for c in s if not unicodedata.combining(c))
    return s.lower().strip()

def snake_id(s):
    s = strip_accents_lower(s)
    s = re.sub(r"[\s\.\-]+", "_", s)
    s = re.sub(r"[^a-z0-9_]", "_", s)
    return re.sub(r"_+", "_", s).strip("_") or "col"

def make_unique_columns(cols):
    out, seen = [], {}
    for c in cols:
        if c not in seen:
            seen[c] = 1
            out.append(c)
        else:
            seen[c] += 1
            out.append(f"{c}_{seen[c]}")
    return out

def _preclean_numeric_strings(x: pd.Series) -> pd.Series:
    """Nettoyage avant parse : retire True/False, espaces ins√©cables, milliers, virgule d√©cimale."""
    s = x.astype("string")
    # True/False/Yes/No -> vide (on les consid√®re comme 'pas de valeur' dans colonnes num√©riques)
    s = s.str.replace(r"^(true|false|yes|no|vrai|faux)$", "", flags=re.I, regex=True)
    s = s.str.replace("\u00A0"," ", regex=False).str.replace(" ","", regex=False)
    s = s.str.replace(r"(?<=\d)\.(?=\d{3}(?:\D|$))","", regex=True)  # mille: 1.234 -> 1234
    s = s.str.replace(",",".", regex=False)  # d√©cimale: 1,23 -> 1.23
    return s

def parse_number_like(series: pd.Series) -> pd.Series:
    s = _preclean_numeric_strings(series)
    return pd.to_numeric(s, errors="coerce")

def drop_empty_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.dropna(axis=1, how="all")
    blank_idx = []
    for j in range(df.shape[1]):
        s = df.iloc[:, j]
        if s.dropna().astype(str).str.strip().eq("").all():
            blank_idx.append(j)
    if blank_idx:
        log(f"[drop_empty_columns] suppression colonnes blanches (pos): {blank_idx}")
        df = df.drop(df.columns[blank_idx], axis=1)
    return df

# ============== HEADER PICK ==============
HEADER_HINTS = (
    # g√©n√©riques
    "projet","projets","code","id","r√©f","ref","libelle","libell√©","intitul√©",
    "client","fournisseur","chef de projet","cp","cdp","manager",
    "statut","status","phase","categorie","cat√©gorie","type",
    "date","heure","time","p√©riode","periode","echeance","√©ch√©ance","fin","d√©but","debut",
    "budget","montant","cout","co√ªt","prix","ht","ttc","avancement","progress","%","taux",
    "site","section analytique","analytique","commentaire","observations","notes","remarques",
    # vus dans ton fichier
    "comments","revue raf","statut clipper","gtm","p√¥le","pole",
    "remaining forecast","facturation mois","hrs","heures","(h)",
    "ca ","charges","marge","achats","command√©","enregistr√©","fnp","int","raf"
)

def choose_header_row(df: pd.DataFrame, scan=30) -> int:
    limit = min(len(df), scan)
    cand_idx, cand_hits = None, -1
    for i in range(limit):
        row = df.iloc[i].astype(str)
        hits = 0
        for v in row:
            t = strip_accents_lower(v)
            if t and any(k in t for k in HEADER_HINTS):
                hits += 1
        if hits >= 2 and hits > cand_hits:
            cand_idx, cand_hits = i, hits
    if cand_idx is not None:
        return cand_idx

    best, idx = -1, 0
    for i in range(limit):
        row = df.iloc[i].astype(str)
        non_empty = row.map(lambda x: x.strip()!="").sum()
        texty = row.map(lambda x: bool(re.search(r"[A-Za-z√Ä-√ø]", x))).sum()
        score = non_empty*2 + texty
        if score > best: best, idx = score, i
    return idx

# ============== INF√âRENCE TYPES (‚Üí PostgreSQL) ==============
MAP_TRUE  = {"true","vrai","oui","yes","1","y","o"}
MAP_FALSE = {"false","faux","non","no","0","n"}

DATE_TOKEN_RE = re.compile(
    r"(?:\d{4}[-/\.]\d{1,2}[-/\.]\d{1,2})|(?:\d{1,2}[-/\.]\d{1,2}[-/\.]\d{2,4})|"
    r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|janv|f√©vr|fevr|avr|mai|juin|juil|sept|oct|nov|d[√©e]c)",
    re.I
)
ISO_DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}(?:[ T]\d{2}:\d{2}:\d{2})?$")

# Cat√©gories par en-t√™te (PRIMES sur le contenu)
DATE_NAME_HINTS   = ("date","dt","heure","time","echeance","√©ch√©ance","debut","d√©but","fin","p√©riode","periode","semaine","mois","ann√©e","annee")
MONEY_NAME_HINTS  = ("‚Ç¨","ht","budget","montant","amount","total","prix","co√ªt","cout","facturation","forecast","achats","command√©","enregistr√©","fnp","ca ","charges","marge","int","raf")
HOUR_NAME_HINTS   = ("heures","hrs","(h)"," h ")
PERCENT_NAME_HINTS= ("%", "taux", "marge", "avancement", "√©cart", "ecart", "pourcent")

CODE_LIKE_HINTS   = ("code","nature","plan","compte","sop","section","niveau","trigramme","immat","ot","ata","wp","n¬∞ dossier","n_dossier","dossier","cat point","cat pointage","statut clipper","gtm","p√¥le","pole","comments","cdp","top custo","r√©union raf","reunion raf","statut suivi de projets","statut suivi de projet")

def colname_has(hints, name):
    n = strip_accents_lower(name or "")
    return any(h in n for h in hints)

def excel_serial_to_datetime(series: pd.Series) -> pd.Series:
    vals = pd.to_numeric(series, errors="coerce").astype("float64")
    vals[~np.isfinite(vals)] = np.nan
    lower, upper = 60.0, 60000.0  # ~ 1900..2064
    mask = (vals >= lower) & (vals <= upper)
    n_out = int((~mask & ~pd.isna(vals)).sum())
    if n_out:
        log(f"[excel_serial_to_datetime] valeurs hors fen√™tre [{int(lower)}, {int(upper)}] -> NaN: {n_out}")
    base = pd.Timestamp("1899-12-30")
    td = pd.to_timedelta(vals.where(mask, np.nan), unit="D", errors="coerce")
    dt = base + td
    return pd.to_datetime(dt, errors="coerce")

def infer_col(col: pd.Series):
    """
    Retourne (serie_casted, tag) avec tag ‚àà {"DATE","DATETIME","INT","FLOAT","BOOL","STRING"}.
    PR√âC√âDENCE EN-T√äTE > CONTENU pour √©viter les faux types.
    """
    s = col.copy()
    ss = s.astype("string").str.strip()
    name = s.name or ""

    # 1) Colonnes ‚Ç¨ / CA / Charges / Marge / Facturation / Achats / RAF / Int  -> NUM√âRIQUE
    if colname_has(MONEY_NAME_HINTS, name):
        nums = parse_number_like(ss)
        if nums.notna().any():
            nz = nums.dropna()
            # mon√©taire et m√©triques -> float par d√©faut
            return nums.astype("Float64"), "FLOAT"

    # 2) Colonnes Heures -> NUM√âRIQUE (float)
    if colname_has(HOUR_NAME_HINTS, name):
        nums = parse_number_like(ss)
        if nums.notna().any():
            return nums.astype("Float64"), "FLOAT"

    # 3) Colonnes % / taux / avancement / √©cart -> NUM√âRIQUE (float)
    if ("%" in (name or "")) or colname_has(PERCENT_NAME_HINTS, name):
        nums = parse_number_like(ss)
        if nums.notna().any():
            return nums.astype("Float64"), "FLOAT"

    # 4) Dates ISO explicites
    iso_mask = ss.fillna("").str.match(ISO_DATE_RE)
    if len(ss) and iso_mask.mean() >= 0.5:
        dt_iso = pd.to_datetime(ss.where(iso_mask), errors="coerce", dayfirst=False)
        ok = dt_iso.notna().mean()
        if ok >= 0.5:
            has_time = (dt_iso.dt.time.astype(str) != "00:00:00").mean() > 0.2
            return (
                dt_iso.dt.strftime("%Y-%m-%d %H:%M:%S") if has_time else dt_iso.dt.strftime("%Y-%m-%d"),
                "DATETIME" if has_time else "DATE"
            )

    # 5) Excel-serial -> Date : UNIQUEMENT si le nom √©voque une date/heure
    if colname_has(DATE_NAME_HINTS, name):
        as_num = pd.to_numeric(ss, errors="coerce")
        safe_mask = (as_num >= 60) & (as_num <= 60000)
        if (as_num.notna().mean() if len(ss) else 0.0) >= 0.9 and safe_mask.mean() >= 0.7:
            parsed = excel_serial_to_datetime(as_num)
            ok = parsed.notna().mean() if len(ss) else 0.0
            if ok >= 0.7:
                has_time = (parsed.dt.time.astype(str) != "00:00:00").mean() > 0.2
                return (
                    parsed.dt.strftime("%Y-%m-%d %H:%M:%S") if has_time else parsed.dt.strftime("%Y-%m-%d"),
                    "DATETIME" if has_time else "DATE"
                )

    # 6) Tokens de date
    date_token_ratio = ss.fillna("").str.contains(DATE_TOKEN_RE, na=False).mean()
    if colname_has(DATE_NAME_HINTS, name) or (date_token_ratio >= 0.30):
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", message="Could not infer format.*", category=UserWarning)
            dt1 = pd.to_datetime(ss, errors="coerce", dayfirst=True,  cache=True)
            dt2 = pd.to_datetime(ss, errors="coerce", dayfirst=False, cache=True)
        dt = dt2 if dt2.notna().sum() > dt1.notna().sum() else dt1
        ok = dt.notna().mean() if len(ss) else 0.0
        ok_thresh = 0.50 if colname_has(DATE_NAME_HINTS, name) else 0.70
        if ok >= ok_thresh:
            has_time = (dt.dt.time.astype(str) != "00:00:00").mean() > 0.2
            return (
                pd.to_datetime(dt).dt.strftime("%Y-%m-%d %H:%M:%S") if has_time else pd.to_datetime(dt).dt.strftime("%Y-%m-%d"),
                "DATETIME" if has_time else "DATE"
            )

    # 7) Bool strict (apr√®s les overrides ci-dessus)
    mb = ss.str.lower().map(lambda x: True if x in MAP_TRUE else (False if x in MAP_FALSE else pd.NA))
    if len(ss) and mb.notna().mean() >= 0.98:
        return mb.astype("boolean"), "BOOL"

    # 8) Nombre g√©n√©rique
    nums = parse_number_like(ss)
    if (nums.notna().mean() if len(ss) else 0) >= 0.85:
        nz = nums.dropna()
        if len(nz) and (np.mod(nz, 1) == 0).all():
            return nums.astype("Int64"), "INT"
        return nums.astype("Float64"), "FLOAT"

    # 9) Texte (codes/labels)
    if colname_has(CODE_LIKE_HINTS, name):
        return ss.astype("string"), "STRING"
    return ss.astype("string"), "STRING"

def pg_type(tag: str) -> str:
    return {
        "DATE":"date",
        "DATETIME":"timestamp without time zone",
        "INT":"integer",
        "FLOAT":"double precision",
        "BOOL":"boolean",
        "STRING":"text"
    }.get(tag, "text")

# ============== MAIN ==============
def main():
    xlsx = Path(FILE_PATH)
    log(f"Fichier: {xlsx}")
    if not xlsx.exists():
        log("[error] Fichier introuvable"); return

    try:
        sheets = pd.read_excel(xlsx, sheet_name=None, engine="openpyxl", header=None)
    except Exception as e:
        log(f"[error] Lecture √©chou√©e: {e}"); return

    done = 0
    for name, raw in sheets.items():
        log(f"\n[feuille] {name}: shape initiale={raw.shape}")

        raw = raw.dropna(how="all", axis=0).dropna(how="all", axis=1)
        log(f"[feuille] {name}: apr√®s drop vides -> {raw.shape}")
        if raw.empty:
            log(f"[feuille] {name}: vide, on passe.")
            continue

        h = choose_header_row(raw, scan=30)
        log(f"[feuille] {name}: header choisi √† la ligne {h}")

        df_src = raw.iloc[h+1:].copy()
        src_cols_original = raw.iloc[h].tolist()
        df_src.columns = src_cols_original
        df_src = df_src.dropna(how="all").copy()
        df_src = drop_empty_columns(df_src)
        log(f"[feuille] {name}: shape apr√®s affectation des en-t√™tes -> {df_src.shape}")
        if df_src.empty:
            log(f"[feuille] {name}: vide apr√®s normalisation, on passe.")
            continue

        pg_cols = make_unique_columns([snake_id(c) for c in df_src.columns])

        schema_src = {}
        casted_df = df_src.copy()
        for c in list(df_src.columns):
            casted, tag = infer_col(df_src[c])
            casted_df[c] = casted
            schema_src[c] = tag

        base = f"{snake_id(xlsx.stem)}.{snake_id(name)}"
        print(f"\n--- Feuille: {name} -> {base} (SCH√âMA SOURCE STRICT / PostgreSQL) ---")
        print(f"rows={len(casted_df)} | cols={casted_df.shape[1]}")

        print("\nMapping colonnes (Excel ‚Üí nom_pg) :")
        for src_name, pg_name in zip(df_src.columns, pg_cols):
            print(f"  - {src_name}  ->  {pg_name}")

        print("\nSchema d√©tect√© (types PostgreSQL) :")
        for src_name, pg_name in zip(df_src.columns, pg_cols):
            tag = schema_src.get(src_name, "STRING")
            print(f"  - {pg_name}: {pg_type(tag)}  (source: {src_name}, type d√©tect√©: {tag})")

        if SHOW_SAMPLE:
            with pd.option_context("display.max_columns", 220, "display.width", 260):
                print("\nSample (top 10) :")
                print(casted_df.head(10))

        done += 1

    print(f"\nDone. Feuilles analys√©es: {done}")

if __name__ == "__main__":
    main()
