# Veltis Raw Data Exploration (2023)

This notebook loads and previews the raw data files ingested from external sources.

In [1]:
# Import Libraries
import pandas as pd
from pathlib import Path
import os

# Configuration
YEAR = 2023
print("Libraries loaded.")

Libraries loaded.


In [2]:
# Setup Paths
project_root = Path.cwd()
if project_root.name == 'notebooks':
    project_root = project_root.parent

raw_year_dir = project_root / 'data' / 'raw' / str(YEAR)
print(f"Raw {YEAR} Data Path: {raw_year_dir}")


Raw 2023 Data Path: /workspaces/MVP-web-scrapping-project/data/raw/2023


In [3]:
# List and Preview Files
if not raw_year_dir.exists():
    print("Directory not found! Please run ingestion first.")
else:
    files = list(raw_year_dir.glob('*'))
    print(f"Found {len(files)} files:")
    for f in files:
        print(f"- {f.name}")

    print("\n--- PREVIEWS ---")
    for f in files:
        if f.suffix == '.csv':
            print(f"\nPreviewing {f.name}:")
            try:
                # Specific handling for known tricky files
                if 'finess' in f.name.lower():
                    # FINESS raw data usually has NO header row, just a metadata line at line 0
                    # We skip line 0 (metadata) and read the rest as data with NO header
                    df = pd.read_csv(f, sep=';', encoding='latin-1', header=None, skiprows=1, nrows=5, on_bad_lines='skip')
                    
                    # Optional: Rename a few key columns for clarity if we know the schema
                    # Based on inspection: 1=Finess_ET, 4=RaisonSociale, 22=Siret
                    # But raw exploration often just shows raw structure.
                    print("(Note: File has no headers. showing generic indices)")
                    
                else:
                    # Generic attempt
                    df = pd.read_csv(f, sep=None, engine='python', nrows=5)
                
                display(df.head())
            except Exception as e:
                print(f"Error reading {f.name}: {e}")
        elif f.suffix in ['.xls', '.xlsx']:
            print(f"\nPreviewing {f.name}:")
            try:
                df = pd.read_excel(f, nrows=5)
                display(df.head())
            except Exception as e:
                print(f"Error reading {f.name}: {e}")


Found 4 files:
- has_etab_geo.csv
- finess.csv
- health_metrics.xlsx
- has_demarche.csv

--- PREVIEWS ---

Previewing has_etab_geo.csv:


Unnamed: 0,"﻿""code_demarche""",FINESS_EJ,FINESS_EG,RS_eg,Site_Principal
0,30001,350000402,350002176,CLINIQUE DE L'ESPERANCE,True
1,30002,340000272,340024314,CLINIQUE SAINT JEAN SUD DE FRANCE,True
2,30003,350002291,350000410,CENTRE HOSPITALIER DE JANZE,True
3,30004,530000249,530000124,CLINIQUE NOTRE DAME DE PRITZ,True
4,30004,530000249,530010438,CENTRE MEDIPSY - CLINIQUE NOTRE DAME DE PRITZ,False



Previewing finess.csv:
(Note: File has no headers. showing generic indices)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,structureet,10000024,10780054,CH DE FLEYRIAT,CENTRE HOSPITALIER DE BOURG-EN-BRESSE FLEYRIAT,,,900,RTE,DE PARIS,...,26010004500012,8610Z,3,ARS Ã©tablissements Publics de santÃ© dotation...,1,Etablissement public de santÃ©,1979-02-13,1979-02-13,2020-02-04,
1,structureet,10000032,10780062,CH BUGEY SUD,CENTRE HOSPITALIER BUGEY SUD,,,700,AV,DE NARVIK,...,26010003700068,8610Z,3,ARS Ã©tablissements Publics de santÃ© dotation...,1,Etablissement public de santÃ©,1901-01-01,1901-01-01,2021-07-07,
2,structureet,10000065,10780096,CH DE TREVOUX - MONTPENSIER,CENTRE HOSPITALIER DE TREVOUX - MONTPENSIER,,,14,R,DE L'HOPITAL,...,26010028400017,8610Z,3,ARS Ã©tablissements Publics de santÃ© dotation...,1,Etablissement public de santÃ©,1901-01-01,1901-01-01,2018-01-12,
3,structureet,10000081,10780112,CH DU PAYS DE GEX,CENTRE HOSPITALIER DU PAYS DE GEX,,,160,R,MARC PANISSOD,...,26010010200011,8610Z,3,ARS Ã©tablissements Publics de santÃ© dotation...,1,Etablissement public de santÃ©,1901-01-01,1901-01-01,2020-02-04,
4,structureet,10000099,10780120,CH DE MEXIMIEUX,CENTRE HOSPITALIER DE MEXIMIEUX,,,13,AV,DU DOCTEUR BOYER,...,26010013600019,8610Z,3,ARS Ã©tablissements Publics de santÃ© dotation...,1,Etablissement public de santÃ©,1945-01-01,1945-01-01,2020-06-30,



Previewing health_metrics.xlsx:


Unnamed: 0,finess,rs_finess,finess_geo,rs_finess_geo,region,type,participation,Depot,nb_rep_score_ALL_ssr_ajust,score_ALL_ssr_ajust,...,nb_rep_score_LIEU_ssr_ajust,score_REPAS_ssr_ajust,nb_rep_score_REPAS_ssr_ajust,score_SORTIE_ssr_ajust,nb_rep_score_SORTIE_ssr_ajust,score_ALL_ssr_ajust_dp,taux_reco_brut,nb_reco_brut,SCORE_AJUST_ESATIS_REGION,SCORE_AJUST_ESATIS_TYPE
0,10780062,CH DOCTEUR RECAMIER,10000032,CH BUGEY SUD,Auvergne-Rhône-Alpes,Centre Hospitaliers,2- Facultatif,1- Oui,,,...,,,,,,,,,,
1,10007987,CH PUBLIC HAUTEVILLE,10000180,CH PUBLIC HAUTEVILLE - UNITE ESPERANCE,Auvergne-Rhône-Alpes,Centre Hospitaliers,2- Facultatif,1- Oui,80.0,80.76,...,80.0,79.18,80.0,75.42,80.0,81.0,80.0,80.0,76.02,77.89
2,10007987,CH PUBLIC HAUTEVILLE,10000198,CH PUBLIC HAUTEVILLE - UNITE INTERDEPT,Auvergne-Rhône-Alpes,Centre Hospitaliers,1- Obligatoire,1- Oui,188.0,78.94,...,188.0,69.73,188.0,73.1,188.0,79.0,80.9,188.0,76.02,77.89
3,10007987,CH PUBLIC HAUTEVILLE,10000214,CH PUBLIC HAUTEVILLE - UNITE ALBARINE,Auvergne-Rhône-Alpes,Centre Hospitaliers,1- Obligatoire,1- Oui,62.0,80.11,...,62.0,76.28,62.0,67.45,62.0,80.0,82.3,62.0,76.02,77.89
4,10002129,CR LES ARBELLES,10002129,SSR LES ARBELLES,Auvergne-Rhône-Alpes,Etablissements de santé privés à but lucratif,1- Obligatoire,1- Oui,85.0,70.85,...,85.0,58.03,85.0,63.17,85.0,71.0,33.7,83.0,76.02,73.2



Previewing has_demarche.csv:


Unnamed: 0,"﻿""code_demarche""",annee_visite,mois_visite,date_deb_visite,date_de_decision,Decision_de_la_CCES
0,30001,2021,09-Septembre,21/09/2021,10/02/2022,Certifié
1,30002,2023,01-Janvier,24/01/2023,08/03/2023,Certifié avec mention
2,30003,2022,01-Janvier,19/01/2022,31/03/2022,Certifié
3,30004,2021,09-Septembre,28/09/2021,14/12/2021,Certifié
4,30005,2021,11-Novembre,29/11/2021,31/03/2022,Certifié
