# Veltis Processed Data Exploration

This notebook loads and explores the **cleaned and processed** data (Etablissements, Qualifications, Health Metrics), then previews the raw 2023 source files.

In [None]:
# Set up data directory and load the processed CSV files.
#
# The imports live in this first code cell so the notebook survives
# "Restart Kernel -> Run All": `Path` and `pd` are used right below.
from pathlib import Path

import pandas as pd

# Year of the processed snapshot to explore. A value already present in the
# namespace (e.g. injected by a parameterised run) is kept; otherwise fall
# back to 2023, the year of the raw folder previewed further down.
try:
    YEAR
except NameError:
    YEAR = 2023

# When the kernel is started from inside `notebooks/`, the data folder sits
# one level up, next to it.
project_root = Path.cwd()
if project_root.name == 'notebooks':
    project_root = project_root.parent

processed_dir = project_root / 'data' / 'processed' / str(YEAR)
print(f"Processed {YEAR} data path: {processed_dir}\n")


def load_processed_csv(filename: str, label: str) -> "pd.DataFrame | None":
    """Read one CSV from ``processed_dir`` and show its shape and first rows.

    Args:
        filename: File name inside ``processed_dir`` (e.g. ``'etablissements.csv'``).
        label: Human-readable name used in the success message.

    Returns:
        The loaded DataFrame, or ``None`` when the file could not be read.
        Failures are reported with a printed message instead of being raised,
        so one missing file does not stop the remaining loads.
    """
    frame = None
    try:
        frame = pd.read_csv(processed_dir / filename)
        print(f"✓ Loaded {label}: {frame.shape}")
        # `display` is provided by the IPython kernel.
        display(frame.head(3))
    except Exception as e:
        print(f"Could not load {filename}: {e}")
    return frame


# Start from a known state so a failed (re-)run never leaves a stale frame
# from an earlier execution behind.
df_etab = df_qual = df_metrics = None

# Check if directory exists
if not processed_dir.exists():
    print(f"⚠ Warning: Directory {processed_dir} does not exist. Please run processing script first.")
else:
    print("Files in processed directory:")
    # Sorted so the listing is stable across runs and platforms.
    for f in sorted(processed_dir.glob('*')):
        print(f"- {f.name}")

    # Load Processed Files
    df_etab = load_processed_csv('etablissements.csv', 'Etablissements')
    df_qual = load_processed_csv('qualifications.csv', 'Qualifications')
    df_metrics = load_processed_csv('health_metrics.csv', 'Health Metrics')


In [None]:
# Preview the raw 2023 source files: list the folder, then show the first
# rows of every CSV / Excel file found in it.
from pathlib import Path
import pandas as pd

# Resolve the project root the same way as for the processed data: when the
# kernel is started from inside `notebooks/`, the data folder is one level up.
project_root = Path.cwd()
if project_root.name == "notebooks":
    project_root = project_root.parent

raw_2023_dir = project_root / "data" / "raw" / "2023"
print(f"Raw 2023 data path: {raw_2023_dir}")

# Make a missing folder visible instead of silently printing an empty listing.
if not raw_2023_dir.is_dir():
    print(f"⚠ Warning: Directory {raw_2023_dir} does not exist.")
    raw_files = []
else:
    # Sorted so the listing and the previews come out in a stable order.
    raw_files = sorted(raw_2023_dir.glob("*"))

# List and preview files
print("Files in raw/2023:")
for raw_file in raw_files:
    print(f"- {raw_file.name}")

for raw_file in raw_files:
    print(f"\n{'='*40}\n{raw_file.name}")
    # Compare suffixes case-insensitively so `.CSV` / `.XLSX` are previewed too.
    suffix = raw_file.suffix.lower()
    try:
        if suffix == ".csv":
            # Python engine + on_bad_lines="skip": malformed rows are dropped
            # rather than aborting the preview.
            df = pd.read_csv(raw_file, encoding="utf-8", engine="python", on_bad_lines="skip")
        elif suffix in {".xls", ".xlsx"}:
            df = pd.read_excel(raw_file)
        else:
            print("(Not a CSV or Excel file, skipping)")
            continue
        # `display` is provided by the IPython kernel.
        display(df.head(5))
    except Exception as e:
        # Best-effort preview: report the problem and move on to the next file.
        print(f"Could not read {raw_file.name}: {e}")