In [7]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Dépendances : pip install pandas openpyxl
# (optionnel pour meilleure détection d'encodage CSV : pip install charset-normalizer)
# Usage : placer vos fichiers dans le dossier "fic", puis lancer :
#   python read_simple.py

from pathlib import Path
import pandas as pd

ROOT_DIR = Path("fic")  # dossier racine à parcourir

# ---------------- Utils header ---------------- #

def looks_like_header(first_row_values):
    """Decide whether the first row of a table looks like a header row.

    Every cell is turned into stripped, lower-cased text (missing values
    become an empty string). A row made only of blank cells is never a
    header. Otherwise the row is accepted when the number of "suspicious"
    cells -- blank cells, the literal placeholders "none" / "nan", or labels
    starting with "unnamed" -- is strictly below one third of the row width
    (the threshold is never lower than 1).

    Args:
        first_row_values: Iterable of raw cell values taken from the first row.

    Returns:
        True when the row is plausibly a header, False otherwise.
    """
    # Normalize case once so placeholders such as "NaN" or "None" are caught
    # the same way as "Unnamed: 0" (the check used to be case-sensitive for
    # the placeholders only).
    cells = [str(v).strip().lower() if pd.notna(v) else "" for v in first_row_values]
    if not any(cells):
        return False
    suspicious = sum(
        cell in ("", "none", "nan") or cell.startswith("unnamed") for cell in cells
    )
    return suspicious < max(1, len(cells) // 3)

# ---------------- Excel ---------------- #

def read_excel_simple(path: Path):
    """Load an Excel workbook with pandas/openpyxl and detect a header row.

    The first row is read on its own and passed to ``looks_like_header``.
    When it qualifies, the file is re-read with that row as column names;
    otherwise the file is re-read without a header and the columns are named
    ``col_1`` .. ``col_n``.

    Args:
        path: Location of the ``.xlsx`` / ``.xlsm`` file.

    Returns:
        A ``(df, used_header)`` tuple: the DataFrame after
        ``convert_dtypes()`` and a bool telling whether the first row was
        used as the header.
    """
    first_row = pd.read_excel(path, nrows=1, header=None, engine="openpyxl")
    has_header = (not first_row.empty) and looks_like_header(first_row.iloc[0].tolist())

    if has_header:
        table = pd.read_excel(path, engine="openpyxl")
    else:
        # No usable header: keep every row as data and synthesize names.
        table = pd.read_excel(path, header=None, engine="openpyxl")
        table.columns = [f"col_{idx}" for idx in range(1, table.shape[1] + 1)]

    return table.convert_dtypes(), has_header

# ---------------- CSV (avec encodage robuste) ---------------- #

def pick_encoding(path: Path):
    """Guess the text encoding of a file using charset-normalizer, if available.

    Args:
        path: File whose encoding should be guessed.

    Returns:
        The encoding name reported by the best match, or None when
        charset-normalizer is not installed, no match is found, or any
        error occurs while probing.
    """
    try:
        # Optional dependency: imported lazily so the script works without it.
        from charset_normalizer import from_path

        best_match = from_path(str(path)).best()
        if not best_match:
            return None
        return best_match.encoding
    except Exception:
        # Best effort only: callers fall back to a fixed list of encodings.
        return None

def _read_csv_detect_header(path, encoding, **read_kwargs):
    """Read ``path`` with one encoding, deciding whether row 1 is a header.

    The separator is sniffed by pandas (``sep=None`` with the python engine).
    Extra keyword arguments are forwarded to every ``pd.read_csv`` call.

    Returns:
        A ``(df, used_header)`` tuple. Any pandas or decoding error propagates
        to the caller.
    """
    options = {"sep": None, "engine": "python", "encoding": encoding, **read_kwargs}
    probe = pd.read_csv(path, nrows=1, header=None, **options)
    if probe.empty or not looks_like_header(probe.iloc[0].tolist()):
        # No usable header: keep the first row as data and synthesize names.
        df = pd.read_csv(path, header=None, **options)
        df.columns = [f"col_{i + 1}" for i in range(df.shape[1])]
        return df.convert_dtypes(), False
    return pd.read_csv(path, **options).convert_dtypes(), True


def read_csv_simple(path: Path):
    """Read a CSV file with separator sniffing, encoding fallback and header detection.

    Encodings are tried in this order, stopping at the first successful read:
    the encoding guessed by ``pick_encoding`` (if any), then utf-8-sig, utf-8,
    cp1252, latin-1 and iso-8859-1. If all of them fail, a last permissive
    attempt is made with latin-1 and ``encoding_errors="replace"``.

    For every attempt the first row is checked with ``looks_like_header``;
    when it does not look like a header the file is read without one and the
    columns are named ``col_1`` .. ``col_n``.

    Args:
        path: Location of the CSV file.

    Returns:
        A ``(df, used_header)`` tuple: the DataFrame after
        ``convert_dtypes()`` and a bool telling whether the first row was
        used as the header.

    Raises:
        RuntimeError: If every attempt, including the permissive one, fails.
            The message carries the last error seen while iterating over the
            candidate encodings.
    """
    guessed = pick_encoding(path)
    # Drop a missing guess and duplicates while preserving the priority order.
    candidates = list(
        dict.fromkeys(
            enc
            for enc in (guessed, "utf-8-sig", "utf-8", "cp1252", "latin-1", "iso-8859-1")
            if enc
        )
    )

    last_err = None
    for enc in candidates:
        try:
            return _read_csv_detect_header(path, enc)
        except Exception as err:
            # Best effort: any failure simply moves on to the next encoding.
            last_err = err

    # Last resort: permissive read that replaces undecodable characters.
    # It goes through the same header detection as the attempts above, so a
    # header-less file no longer loses its first row to the column names.
    try:
        return _read_csv_detect_header(path, "latin-1", encoding_errors="replace")
    except Exception as err:
        raise RuntimeError(
            f"Echec lecture CSV (encodage). Dernière erreur: {last_err or err}"
        ) from err

# ---------------- Router ---------------- #

def read_table(path: Path):
    """Dispatch to the Excel or CSV reader based on the file extension.

    The extension is compared case-insensitively.

    Args:
        path: File to load.

    Returns:
        Whatever the selected reader returns (``read_csv_simple`` for
        ``.csv``, ``read_excel_simple`` for ``.xlsx`` / ``.xlsm``).

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    suffix = path.suffix.lower()
    if suffix == ".csv":
        return read_csv_simple(path)
    if suffix in {".xlsx", ".xlsm"}:
        return read_excel_simple(path)
    raise ValueError(f"Extension non gérée: {suffix}")

# ---------------- Main ---------------- #

def main():
    """Scan ROOT_DIR recursively and print a short report for each table file.

    For every ``.xlsx`` / ``.xlsm`` / ``.csv`` file found (Excel lock files
    starting with ``~$`` are skipped) the file is loaded through
    ``read_table`` and its header status, shape, column names, first five
    rows and dtypes are printed. A file that cannot be read is reported and
    skipped; the scan continues with the next one.
    """
    # is_dir() also rejects the case where ROOT_DIR exists but is a plain file.
    if not ROOT_DIR.is_dir():
        print(f"[error] Dossier introuvable : {ROOT_DIR.resolve()}")
        return

    # Recursive search for tabular files. The suffix is compared in lower
    # case so that e.g. "DATA.CSV" is found on case-sensitive filesystems,
    # consistently with the case-insensitive routing done by read_table.
    wanted_suffixes = {".xlsx", ".xlsm", ".csv"}
    files = sorted(
        p
        for p in ROOT_DIR.rglob("*")
        if p.is_file()
        and p.suffix.lower() in wanted_suffixes
        and not p.name.startswith("~$")
    )

    print("=== data header: folder info ===")
    print(f"path: {ROOT_DIR.resolve()}")
    print(f"files_found: {len(files)}")

    if not files:
        print("[warn] Aucun fichier .xlsx/.xlsm/.csv trouvé.")
        return

    for f in files:
        print("\n=== data header: file info ===")
        print(f"path: {f.resolve()}")

        try:
            df, used = read_table(f)
        except Exception as e:
            # Top-level boundary: report the failure and keep scanning.
            print(f"[error] Lecture échouée pour {f.name}: {e}")
            continue

        # Simple summary of what was loaded.
        print("\n=== data header: lecture ===")
        print(f"file: {f.name}")
        print(f"header_detected: {'yes' if used else 'no'}")
        print(f"rows: {len(df)} | cols: {df.shape[1]}")
        print("columns:", ", ".join(map(str, df.columns.tolist())))

        print("\n=== data sample (top 5) ===")
        with pd.option_context("display.max_columns", 50, "display.width", 140):
            print(df.head(5))

        print("\n=== data dtypes ===")
        print(df.dtypes)

        print("\n=== data path ===")
        print(f.resolve())

    print("\n=== done ===")

# Run the folder scan only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


=== data header: folder info ===
path: C:\globasoft\aerotech\fic
files_found: 1

=== data header: file info ===
path: C:\globasoft\aerotech\fic\fichier qualité des trigrammes.xlsx

=== data header: lecture ===
file: fichier qualité des trigrammes.xlsx
header_detected: no
rows: 29 | cols: 9
columns: col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8, col_9

=== data sample (top 5) ===
                          col_1 col_2 col_3                                              col_4  col_5  col_6  col_7  col_8 col_9
0  Aérodrome - 81300 GRAULHET\n   NaN  <NA>  DA-SQ-0027 LISTE DES TRIGRAMMES ET MARQUES DE ...   <NA>   <NA>   <NA>   <NA>  <NA>
1                           NaN   NaN  <NA>                                               <NA>   <NA>   <NA>   <NA>   <NA>  <NA>
2                           NaN   NaN  <NA>                                               <NA>   <NA>   <NA>   <NA>   <NA>  <NA>
3                           NaN   NaN  <NA>                         EDITION : 21 du 16/05/202