# Veltis Processed Data Exploration (2023)

This notebook explores the cleaned and processed data in `data/processed/2023`.


In [None]:
# Import Libraries
import pandas as pd
from pathlib import Path
import os

# Notebook-wide settings
YEAR = 2023  # data year explored by this notebook
# Show every column when a dataframe is rendered (no column truncation).
pd.options.display.max_columns = None
print("Libraries loaded.")

In [None]:
# Setup Paths
# Take the working directory as the project root, stepping up one level when
# the notebook is run from inside a folder named 'notebooks'.
project_root = Path.cwd()
project_root = project_root.parent if project_root.name == 'notebooks' else project_root

# Folder holding the processed CSV files for the configured year.
processed_dir = project_root.joinpath('data', 'processed', str(YEAR))
print(f"Processed {YEAR} Data Path: {processed_dir}")

In [None]:
# 1. Load Etablissements
# Read the establishments CSV from the processed folder. When the file is
# absent only a message is printed and df_etab is not (re)assigned.
etab_path = processed_dir.joinpath('etablissements.csv')
if not etab_path.exists():
    print(f"File not found: {etab_path}")
else:
    print(f"Loading {etab_path.name}...")
    # low_memory=False parses the whole file at once instead of in chunks.
    df_etab = pd.read_csv(etab_path, low_memory=False)
    print(f"Shape: {df_etab.shape}")
    display(df_etab.head())

In [None]:
# 2. Load Qualifications
# This table contains HAS certification data.
# Note: Health Metric scores are NOT included here (see Health Metrics below).
qual_path = processed_dir.joinpath('qualifications.csv')
if not qual_path.exists():
    print(f"File not found: {qual_path}")
else:
    print(f"Loading {qual_path.name}...")
    df_qual = pd.read_csv(qual_path)
    print(f"Shape: {df_qual.shape}")

    # Print the column names so the schema can be checked at a glance
    print("\nColumns:", list(df_qual.columns))

    # Show the first few non-null report links, if the column is there
    if 'url_rapport' in df_qual.columns:
        print("\nSample Report URLs:")
        print(df_qual['url_rapport'].dropna().head().values)

    display(df_qual.head())

In [None]:
# 3. Load Health Metrics
# This table contains IQSS quality indicators.
# Values should be clean (e.g., 'Oui' instead of '1- Oui').
metrics_path = processed_dir.joinpath('health_metrics.csv')
if not metrics_path.exists():
    print(f"File not found: {metrics_path}")
else:
    print(f"Loading {metrics_path.name}...")
    df_metrics = pd.read_csv(metrics_path)
    print(f"Shape: {df_metrics.shape}")

    # Print the most frequent values of each categorical column that is
    # present, so leftover uncleaned labels stand out.
    for col in ('participation', 'depot', 'evolution'):
        if col not in df_metrics.columns:
            continue
        print(f"\nValue Counts for {col}:")
        print(df_metrics[col].value_counts().head())

    display(df_metrics.head())

In [None]:
# 4. Analysis: Merging Data
# Example: Creating a unified view of Qualifications and Health Metrics.
# The three tables are joined on 'vel_id' (internal ID); 'finess_et' is
# carried along from the establishments table.


def build_analysis_view(etab, qual, metrics, key='vel_id'):
    """Left-join qualifications and health metrics onto establishments.

    For each table only the columns of interest that are actually present are
    used. Absent columns are reported with ``print`` and skipped, so a missing
    optional column (e.g. 'url_rapport') does not raise a KeyError.

    Args:
        etab: Establishments dataframe; supplies the rows of the result.
        qual: Qualifications dataframe.
        metrics: Health-metrics dataframe.
        key: Name of the join column expected in all three dataframes.

    Returns:
        The merged dataframe, or None when ``key`` is missing from any of the
        inputs (a message naming the offending tables is printed).
    """
    wanted = (
        ('etablissements', etab, ['finess_et', 'raison_sociale', 'departement']),
        ('qualifications', qual, ['niveau_certification', 'url_rapport']),
        ('health_metrics', metrics, ['score_all_ssr_ajust', 'participation']),
    )

    # Without the join key in every table no merge is possible at all.
    lacking_key = [label for label, frame, _ in wanted if key not in frame.columns]
    if lacking_key:
        print(f"Cannot merge: '{key}' column missing from {lacking_key}")
        return None

    view = None
    for label, frame, cols in wanted:
        absent = [c for c in cols if c not in frame.columns]
        if absent:
            print(f"Skipping columns missing from {label}: {absent}")
        part = frame[[key] + [c for c in cols if c in frame.columns]]
        # The first table (establishments) seeds the view; the others are
        # left-joined so every establishment row is kept.
        view = part.copy() if view is None else view.merge(part, on=key, how='left')
    return view


if 'df_qual' in locals() and 'df_metrics' in locals() and 'df_etab' in locals():
    print("Creating unified analysis view...")

    # Note: score_satisfaction is no longer in Qualifications
    df_analysis = build_analysis_view(df_etab, df_qual, df_metrics)

    if df_analysis is not None:
        print(f"Analysis Shape: {df_analysis.shape}")
        if 'score_all_ssr_ajust' in df_analysis.columns:
            # Preview only the rows that actually carry a score.
            display(df_analysis[df_analysis['score_all_ssr_ajust'].notna()].head())
        else:
            display(df_analysis.head())
else:
    print("Dataframes not loaded.")