# GOLD LAYER - Star Schema (Version qui MARCHE)

**Mode LOCAL** : Spark s'exécute en local, ce qui évite les conflits de version Python rencontrés avec le cluster.

**Auteurs** : Nejma MOUALHI | Brieuc OLIVIERI | Nicolas TAING

In [1]:
# Imports
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from datetime import datetime, timedelta
import time

print("Imports OK")

Imports OK


In [2]:
# Start Spark in LOCAL mode (original note: this avoids Python version conflicts).
spark = (
    SparkSession.builder
    .appName("CHU_Gold_Star_Schema")
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)
print(f"Spark {spark.version} started in LOCAL mode")

# Root folders: Silver is the input layer, Gold is where this notebook writes.
SILVER_BASE = "/home/jovyan/data/silver"
GOLD_OUTPUT = "/home/jovyan/data/gold"

print(f"Source: {SILVER_BASE}", f"Destination: {GOLD_OUTPUT}", sep="\n")

Spark 3.5.0 started in LOCAL mode
Source: /home/jovyan/data/silver
Destination: /home/jovyan/data/gold


In [3]:
# Reset the Gold output folder so every run starts from an empty layer
# (avoids write errors caused by leftovers with wrong permissions).
import os
import shutil

# Same value as in the configuration cell; repeated so this cell can run alone.
GOLD_OUTPUT = "/home/jovyan/data/gold"

if os.path.exists(GOLD_OUTPUT):
    for table in os.listdir(GOLD_OUTPUT):
        table_path = os.path.join(GOLD_OUTPUT, table)
        try:
            # shutil.rmtree only accepts real directories; plain files and
            # symlinks found in the Gold folder must go through os.remove.
            if os.path.isdir(table_path) and not os.path.islink(table_path):
                shutil.rmtree(table_path)
            else:
                os.remove(table_path)
            print(f"Cleaned: {table}")
        except PermissionError:
            # Best effort: the later Spark write uses mode("overwrite").
            print(f"Permission denied on {table} - will try to overwrite")
        except OSError as e:
            print(f"Error cleaning {table}: {e}")
else:
    os.makedirs(GOLD_OUTPUT, exist_ok=True)
    print(f"Created: {GOLD_OUTPUT}")

print("Gold directory ready")

Cleaned: dim_diagnostic
Cleaned: dim_etablissement
Cleaned: dim_patient
Cleaned: dim_professionnel
Cleaned: dim_temps
Cleaned: fait_consultation
Cleaned: fait_deces
Cleaned: fait_hospitalisation
Cleaned: fait_satisfaction
Gold directory ready


## DIMENSIONS

In [4]:
# 1. DIM_TEMPS - calendar dimension, one row per day
print("="*80)
print("DIMENSION: dim_temps")
print("="*80)

# Inclusive bounds of the generated calendar.
first_day = datetime(2013, 1, 1)
last_day = datetime(2025, 12, 31)
all_days = (
    first_day + timedelta(days=offset)
    for offset in range((last_day - first_day).days + 1)
)

# One tuple per day, in the same field order as schema_temps below.
dates = [
    (
        day.strftime("%Y%m%d"),       # id_temps, e.g. "20130101"
        day,                          # date_complete
        day.year,
        day.month,
        (day.month - 1) // 3 + 1,     # trimestre: 1..4
        day.strftime("%A"),           # weekday name
        day.strftime("%B"),           # month name
        day.weekday() >= 5,           # weekday() is 5 for Saturday, 6 for Sunday
        day.weekday(),                # Monday = 0
    )
    for day in all_days
]

schema_temps = StructType([
    StructField(name, dtype, nullable)
    for name, dtype, nullable in (
        ("id_temps", StringType(), False),
        ("date_complete", DateType(), False),
        ("annee", IntegerType(), False),
        ("mois", IntegerType(), False),
        ("trimestre", IntegerType(), False),
        ("jour_semaine", StringType(), True),
        ("nom_mois", StringType(), True),
        ("est_weekend", BooleanType(), True),
        ("numero_jour_semaine", IntegerType(), True),
    )
])

dim_temps = spark.createDataFrame(dates, schema=schema_temps)
print(f"{dim_temps.count():,} days created")
dim_temps.show(5)

dim_temps.write.mode("overwrite").parquet(f"{GOLD_OUTPUT}/dim_temps")
print(f"Saved: {GOLD_OUTPUT}/dim_temps")

DIMENSION: dim_temps
4,748 days created
+--------+-------------+-----+----+---------+------------+--------+-----------+-------------------+
|id_temps|date_complete|annee|mois|trimestre|jour_semaine|nom_mois|est_weekend|numero_jour_semaine|
+--------+-------------+-----+----+---------+------------+--------+-----------+-------------------+
|20130101|   2013-01-01| 2013|   1|        1|     Tuesday| January|      false|                  1|
|20130102|   2013-01-02| 2013|   1|        1|   Wednesday| January|      false|                  2|
|20130103|   2013-01-03| 2013|   1|        1|    Thursday| January|      false|                  3|
|20130104|   2013-01-04| 2013|   1|        1|      Friday| January|      false|                  4|
|20130105|   2013-01-05| 2013|   1|        1|    Saturday| January|       true|                  5|
+--------+-------------+-----+----+---------+------------+--------+-----------+-------------------+
only showing top 5 rows

Saved: /home/jovyan/data/gold/dim_t

In [4]:
# 2. DIM_PATIENT - patient dimension, a column subset of the Silver patient table
print("\n" + "="*80, "DIMENSION: dim_patient", "="*80, sep="\n")

# Columns kept in the Gold dimension, in output order.
patient_columns = [
    "id_patient",
    "nom_hash",
    "prenom_hash",
    "sexe",
    "age",
    "date_naissance",
    "ville",
    "code_postal",
    "pays",
    "groupe_sanguin",
]

patient_silver = spark.read.parquet(f"{SILVER_BASE}/patient")
dim_patient = patient_silver.select(*patient_columns)

print(f"{dim_patient.count():,} patients")
dim_patient.show(5, truncate=False)

dim_patient.write.mode("overwrite").parquet(f"{GOLD_OUTPUT}/dim_patient")
print(f"Saved: {GOLD_OUTPUT}/dim_patient")


DIMENSION: dim_patient
100,000 patients
+----------+----------------------------------------------------------------+----------------------------------------------------------------+------+---+--------------+--------------------+-----------+----+--------------+
|id_patient|nom_hash                                                        |prenom_hash                                                     |sexe  |age|date_naissance|ville               |code_postal|pays|groupe_sanguin|
+----------+----------------------------------------------------------------+----------------------------------------------------------------+------+---+--------------+--------------------+-----------+----+--------------+
|5         |d34db878e3724987c7853288b3744ddba9620358185f8b65822014cd45a855ab|d888938a8847ae08380fefffd5d72ae0e3483ed5d312f06b4b259817d15d75d0|male  |55 |1965-10-04    |LES LILAS           |93260      |FR  |O+            |
|15        |7c4732648c9b5c5c5e83cc2b6d2f7e628d369593e45e37caf95d3f66208

In [None]:
# 3. DIM_DIAGNOSTIC - one row per diagnostic code
print("\n" + "="*80, "DIMENSION: dim_diagnostic", "="*80, sep="\n")

diag_code = F.col("Code_diag")

dim_diagnostic = (
    spark.read.parquet(f"{SILVER_BASE}/diagnostic")
    .select(
        diag_code.alias("code_diag"),
        F.col("Diagnostic").alias("libelle"),
        # The category is the first character of the diagnostic code.
        F.substring(diag_code, 1, 1).alias("categorie"),
    )
    # Keep a single row per code (which duplicate survives is not specified).
    .dropDuplicates(["code_diag"])
)

print(f"{dim_diagnostic.count():,} diagnostics")
dim_diagnostic.show(5, truncate=False)

dim_diagnostic.write.mode("overwrite").parquet(f"{GOLD_OUTPUT}/dim_diagnostic")
print(f"Saved: {GOLD_OUTPUT}/dim_diagnostic")

In [6]:
# 4. DIM_PROFESSIONNEL - one row per health professional, enriched with the
# label of their specialty.
print("\n" + "="*80)
print("DIMENSION: dim_professionnel")
print("="*80)

df_prof = spark.read.parquet(f"{SILVER_BASE}/professionnel_de_sante")
df_spec = spark.read.parquet(f"{SILVER_BASE}/specialites")

# Grain of the dimension: exactly one row per id_prof.
professionals = df_prof.select(
    F.col("Identifiant").alias("id_prof"),
    F.col("Nom").alias("nom"),
    F.col("Prenom").alias("prenom"),
    F.col("Code_specialite").alias("code_specialite")
).dropDuplicates(["id_prof"])

# Specialty lookup. It is de-duplicated on its key so the left join below can
# never multiply professionals if a code appears twice in the source. The key
# is aliased to the exact name used on the left side, so the join does not
# depend on Spark's case-insensitive column resolution.
specialties = df_spec.select(
    F.col("Code_specialite").alias("code_specialite"),
    F.col("Specialite").alias("nom_specialite")
).dropDuplicates(["code_specialite"])

# Left join: professionals with an unknown specialty are kept (null label).
dim_professionnel = professionals.join(
    specialties,
    on="code_specialite",
    how="left"
)

print(f"{dim_professionnel.count():,} professionnels")
dim_professionnel.show(5, truncate=False)

dim_professionnel.write.mode("overwrite").parquet(f"{GOLD_OUTPUT}/dim_professionnel")
print(f"Saved: {GOLD_OUTPUT}/dim_professionnel")


DIMENSION: dim_professionnel
1,048,575 professionnels
+---------------+---------+----------+-------+---------------------------+
|code_specialite|id_prof  |nom       |prenom |nom_specialite             |
+---------------+---------+----------+-------+---------------------------+
|ASS890091      |01A003753|CHAREYRE  |Laure  |Assistant de service social|
|ASS890091      |01A004124|CECCARELLI|Chantal|Assistant de service social|
|ASS890091      |01A004595|GREVOT    |Nicole |Assistant de service social|
|ASS890091      |01A004611|CONSTANS  |Josiane|Assistant de service social|
|ASS890091      |01A005105|MULLER    |Sylvie |Assistant de service social|
+---------------+---------+----------+-------+---------------------------+
only showing top 5 rows

Saved: /home/jovyan/data/gold/dim_professionnel


In [7]:
# 5. DIM_ETABLISSEMENT - one row per FINESS site, enriched with the
# department and region derived from the postal code.
print("\n" + "="*80)
print("DIMENSION: dim_etablissement")
print("="*80)

df_etab = spark.read.parquet(f"{SILVER_BASE}/etablissement_sante")
df_dept = spark.read.parquet("/home/jovyan/data/bronze/csv/departements")

# Department code derived from the postal code: the first two characters,
# except for overseas postal codes (97xxx / 98xxx) whose department code is
# three digits long (971, 972, ...).
# NOTE(review): Corsican postal codes start with "20" while the departments
# are usually coded 2A/2B - whether they match depends on how
# num_departement is encoded in the lookup; confirm against the data.
postal_prefix = F.substring(F.col("code_postal"), 1, 2)
code_departement = F.when(
    postal_prefix.isin("97", "98"),
    F.substring(F.col("code_postal"), 1, 3)
).otherwise(postal_prefix)

# Grain of the dimension: one row per non-null FINESS number.
etablissements = df_etab.select(
    F.col("finess_site").alias("finess"),
    F.col("siret_site").alias("siret"),
    F.col("raison_sociale").alias("nom"),
    F.col("commune").alias("ville"),
    F.col("code_postal"),
    F.col("telephone"),
    F.col("email"),
    code_departement.alias("code_departement")
).filter(
    F.col("finess").isNotNull()
).dropDuplicates(["finess"])

# Department lookup, de-duplicated on its key so the left join below cannot
# multiply establishments if a department appears twice in the source.
departements = df_dept.select(
    F.col("num_departement"),
    F.col("libelle_departement"),
    F.col("libelle_region"),
    F.col("abv_region")
).dropDuplicates(["num_departement"])

# Left join: establishments without a matching department are kept.
dim_etablissement = etablissements.join(
    departements,
    F.col("code_departement") == departements["num_departement"],
    "left"
)

print(f"{dim_etablissement.count():,} etablissements")
dim_etablissement.show(5, truncate=False)

dim_etablissement.write.mode("overwrite").parquet(f"{GOLD_OUTPUT}/dim_etablissement")
print(f"Saved: {GOLD_OUTPUT}/dim_etablissement")


DIMENSION: dim_etablissement
200 etablissements
+---------+-----+-------------------------------------+-------------------------+-----------+----------+-----+----------------+---------------+-------------------+--------------------+----------+
|finess   |siret|nom                                  |ville                    |code_postal|telephone |email|code_departement|num_departement|libelle_departement|libelle_region      |abv_region|
+---------+-----+-------------------------------------+-------------------------+-----------+----------+-----+----------------+---------------+-------------------+--------------------+----------+
|180036014|NULL |CHNO DES QUINZE-VINGTS PARIS         |PARIS 12E  ARRONDISSEMENT|75012      |0140021520|NULL |75              |75             |Paris              |Ile-de-France       |IDF       |
|200009181|NULL |CIAS AIME                            |AIME-LA-PLAGNE           |73211      |NULL      |NULL |73              |73             |Savoie             |Auve

## FAITS

In [8]:
# 6. FAIT_CONSULTATION - one row per consultation, keyed to the dimensions
print("\n" + "="*80, "FAIT: fait_consultation", "="*80, sep="\n")

consultation_silver = spark.read.parquet(f"{SILVER_BASE}/consultation")

fait_consultation = consultation_silver.select(
    "id_consultation",
    "id_patient",
    # Foreign keys renamed to the names used in this notebook's dimensions.
    F.col("id_professionnel").alias("id_prof"),
    F.col("id_diagnostic").alias("code_diag"),
    "id_mutuelle",
    # Date key in the same yyyyMMdd text format as dim_temps.id_temps.
    F.date_format(F.col("date_consultation"), "yyyyMMdd").alias("id_temps"),
    "date_consultation",
    "annee",
    "mois",
    "jour",
    "heure_debut",
    "heure_fin",
    "motif",
)

print(f"{fait_consultation.count():,} consultations")
fait_consultation.show(5)

consultation_writer = fait_consultation.write.mode("overwrite").partitionBy("annee", "mois")
consultation_writer.parquet(f"{GOLD_OUTPUT}/fait_consultation")
print(f"Saved: {GOLD_OUTPUT}/fait_consultation (partitioned by annee, mois)")


FAIT: fait_consultation
1,027,157 consultations
+---------------+----------+-----------+---------+-----------+--------+-----------------+-----+----+----+-------------------+-------------------+-------------------+
|id_consultation|id_patient|    id_prof|code_diag|id_mutuelle|id_temps|date_consultation|annee|mois|jour|        heure_debut|          heure_fin|              motif|
+---------------+----------+-----------+---------+-----------+--------+-----------------+-----+----+----+-------------------+-------------------+-------------------+
|     1059023408|      1285|10101362548|   S92700|        243|20150620|       2015-06-20| 2015|   6|  20|1970-01-01 08:00:00|1970-01-01 12:00:00|       Consultation|
|     1059023414|      4709|10100154573|    M4140|        182|20150620|       2015-06-20| 2015|   6|  20|1970-01-01 08:00:00|1970-01-01 12:00:00|       Consultation|
|     1059023450|     14775|10002434180|     T733|        197|20150620|       2015-06-20| 2015|   6|  20|1970-01-01 08:00

In [9]:
# 7. FAIT_DECES - one row per death record from the Silver deces_2019 table
print("\n" + "="*80, "FAIT: fait_deces", "="*80, sep="\n")

deces_silver = spark.read.parquet(f"{SILVER_BASE}/deces_2019")

fait_deces = deces_silver.select(
    # Generated surrogate key (unique, but not guaranteed consecutive).
    F.monotonically_increasing_id().alias("id_deces"),
    "nom_hash",
    "prenom_hash",
    "acte_deces_hash",
    "sexe",
    "date_naissance",
    "date_deces",
    "age_deces",
    # Date key in the same yyyyMMdd text format as dim_temps.id_temps.
    F.date_format(F.col("date_deces"), "yyyyMMdd").alias("id_temps"),
    # Renamed so the partition columns match the other fact tables.
    F.col("annee_deces").alias("annee"),
    F.col("mois_deces").alias("mois"),
    "code_lieu_naissance",
    "lieu_naissance",
    "pays_naissance",
    "code_lieu_deces",
)

print(f"{fait_deces.count():,} deces")
fait_deces.show(5)

deces_writer = fait_deces.write.mode("overwrite").partitionBy("annee", "mois")
deces_writer.parquet(f"{GOLD_OUTPUT}/fait_deces")
print(f"Saved: {GOLD_OUTPUT}/fait_deces (partitioned by annee, mois)")


FAIT: fait_deces
620,625 deces
+--------+--------------------+--------------------+--------------------+----+--------------+----------+---------+--------+-----+----+-------------------+--------------------+--------------+---------------+
|id_deces|            nom_hash|         prenom_hash|     acte_deces_hash|sexe|date_naissance|date_deces|age_deces|id_temps|annee|mois|code_lieu_naissance|      lieu_naissance|pays_naissance|code_lieu_deces|
+--------+--------------------+--------------------+--------------------+----+--------------+----------+---------+--------+-----+----+-------------------+--------------------+--------------+---------------+
|       0|58d6b0c06a55864a3...|a29c7480d5f59f49c...|19581e27de7ced00f...|   2|    1925-12-20|2019-01-05|       94|20190105| 2019|   1|              64259|             HELETTE|          NULL|          64160|
|       1|a0409b8b554db5961...|f2e78f624016938ca...|73475cb40a568e8da...|   2|    1952-11-04|2019-04-22|       67|20190422| 2019|   4|      

In [10]:
# 8. FAIT_HOSPITALISATION (built from the AAAA and date tables)
from pyspark.sql import Window

print("\n" + "="*80)
print("FAIT: fait_hospitalisation")
print("="*80)

df_aaaa = spark.read.parquet(f"{SILVER_BASE}/../bronze/postgres/AAAA").drop("ingestion_timestamp", "ingestion_date")
df_date = spark.read.parquet(f"{SILVER_BASE}/../bronze/postgres/date").drop("ingestion_timestamp", "ingestion_date")

# The two tables are paired by row position (no key is joined on here).
# That pairing is only meaningful if both hold the same number of rows, so
# report a mismatch instead of silently dropping the unmatched rows.
rows_aaaa = df_aaaa.count()
rows_date = df_date.count()
if rows_aaaa != rows_date:
    print(f"WARNING: AAAA has {rows_aaaa:,} rows but date has {rows_date:,}; "
          "rows without a positional match are dropped by the inner join")

# Raw monotonically_increasing_id() values encode the partition id and are
# not consecutive, so they only line up between two DataFrames that happen
# to have identical partition layouts. row_number() over that ordering gives
# each table a dense 1..n index in read order, which is safe to join on.
# NOTE(review): this still assumes both tables were written in matching row
# order - confirm against the ingestion job.
read_order = Window.orderBy(F.monotonically_increasing_id())
df_aaaa_idx = df_aaaa.withColumn("row_id", F.row_number().over(read_order))
df_date_idx = df_date.withColumn("row_id", F.row_number().over(read_order))

df_hospit = df_aaaa_idx.join(df_date_idx, "row_id", "inner")

# Parse each source date once; the strings are in dd/MM/yyyy format.
entry_date = F.to_date(F.col("date1"), "dd/MM/yyyy")
exit_date = F.to_date(F.col("date2"), "dd/MM/yyyy")

fait_hospitalisation = df_hospit.select(
    # Generated surrogate key (unique, but not guaranteed consecutive).
    F.monotonically_increasing_id().alias("id_hospitalisation"),
    F.col("Num").alias("id_patient"),
    F.col("Code_diag").alias("code_diag"),
    entry_date.alias("date_entree"),
    exit_date.alias("date_sortie"),
    # Date keys in the same yyyyMMdd text format as dim_temps.id_temps.
    F.date_format(entry_date, "yyyyMMdd").alias("id_temps_entree"),
    F.date_format(exit_date, "yyyyMMdd").alias("id_temps_sortie"),
    F.datediff(exit_date, entry_date).alias("duree_sejour_jours"),
    # Partition columns are taken from the admission date.
    F.year(entry_date).alias("annee"),
    F.month(entry_date).alias("mois")
).filter(
    # Drop rows with unparseable dates or a discharge before the admission.
    (F.col("date_entree").isNotNull()) &
    (F.col("date_sortie").isNotNull()) &
    (F.col("duree_sejour_jours") >= 0)
)

print(f"{fait_hospitalisation.count():,} hospitalisations")
fait_hospitalisation.show(5)

fait_hospitalisation.write.mode("overwrite").partitionBy("annee", "mois").parquet(f"{GOLD_OUTPUT}/fait_hospitalisation")
print(f"Saved: {GOLD_OUTPUT}/fait_hospitalisation (partitioned by annee, mois)")


FAIT: fait_hospitalisation
82,216 hospitalisations
+------------------+----------+---------+-----------+-----------+---------------+---------------+------------------+-----+----+
|id_hospitalisation|id_patient|code_diag|date_entree|date_sortie|id_temps_entree|id_temps_sortie|duree_sejour_jours|annee|mois|
+------------------+----------+---------+-----------+-----------+---------------+---------------+------------------+-----+----+
|                 0|         1|     Q428| 2018-12-01| 2018-12-02|       20181201|       20181202|                 1| 2018|  12|
|                 1|         2|     G961| 2019-03-12| 2019-03-13|       20190312|       20190313|                 1| 2019|   3|
|                 2|         3|     J350| 2015-12-27| 2015-12-28|       20151227|       20151228|                 1| 2015|  12|
|                 3|         4|     P569| 2017-09-20| 2017-09-21|       20170920|       20170921|                 1| 2017|   9|
|                 4|         5|    M0217| 2021-04-06

In [11]:
# 9. FAIT_SATISFACTION - satisfaction scores per establishment (FINESS)
print("\n" + "="*80, "FAIT: fait_satisfaction", "="*80, sep="\n")

satisfaction_silver = spark.read.parquet(f"{SILVER_BASE}/satisfaction_2019")

# Score columns copied unchanged, in output order.
score_columns = [
    "score_global",
    "score_accueil",
    "score_pec_infirmier",
    "score_pec_medical",
    "score_chambre",
    "score_repas",
    "score_sortie",
]

fait_satisfaction = satisfaction_silver.select(
    # Generated surrogate key (unique, but not guaranteed consecutive).
    F.monotonically_increasing_id().alias("id_satisfaction"),
    "finess",
    # Every row gets the constant date key "20190101".
    F.lit("20190101").alias("id_temps"),
    "annee",
    *score_columns,
    "taux_recommandation",
    F.col("nb_reponses_global").alias("nb_repondants"),
    "nb_recommandations",
    "classement",
    "evolution",
)

print(f"{fait_satisfaction.count():,} evaluations")
fait_satisfaction.show(5)

satisfaction_writer = fait_satisfaction.write.mode("overwrite").partitionBy("annee")
satisfaction_writer.parquet(f"{GOLD_OUTPUT}/fait_satisfaction")
print(f"Saved: {GOLD_OUTPUT}/fait_satisfaction (partitioned by annee)")


FAIT: fait_satisfaction
8 evaluations
+---------------+---------+--------+-----+------------+-------------+-------------------+-----------------+-------------+-----------+------------+-------------------+-------------+------------------+----------+------------+
|id_satisfaction|   finess|id_temps|annee|score_global|score_accueil|score_pec_infirmier|score_pec_medical|score_chambre|score_repas|score_sortie|taux_recommandation|nb_repondants|nb_recommandations|classement|   evolution|
+---------------+---------+--------+-----+------------+-------------+-------------------+-----------------+-------------+-----------+------------+-------------------+-------------+------------------+----------+------------+
|              0|070780358|20190101| 2019|        71.0|        71.15|              79.19|            76.45|         68.8|      57.64|       61.17|               41.0|          307|               305|         C|1-Diminution|
|              1|180000358|20190101| 2019|        76.0|        78

## VERIFICATION

In [12]:
# Gold inventory: re-read every table written above and report its size.
import os

print("\n" + "="*80, "GOLD LAYER - INVENTAIRE", "="*80, sep="\n")

for table in sorted(os.listdir(GOLD_OUTPUT)):
    path = f"{GOLD_OUTPUT}/{table}"
    try:
        df = spark.read.parquet(path)
        count = df.count()
        cols = len(df.columns)
    except Exception as e:
        # Unreadable entries are reported, not fatal: the listing continues.
        print(f"ERROR | {table:30s} | {str(e)[:50]}")
    else:
        # Tables are classified by name prefix: dim_* are dimensions.
        table_type = "FAIT"
        if table.startswith("dim_"):
            table_type = "DIM"
        print(f"{table_type:4s} | {table:30s} | {count:>10,} rows | {cols:>2} cols")

print("="*80, "GOLD LAYER COMPLETE!", "="*80, sep="\n")


GOLD LAYER - INVENTAIRE
DIM  | dim_diagnostic                 |     15,490 rows |  3 cols
DIM  | dim_etablissement              |        200 rows | 12 cols
DIM  | dim_patient                    |    100,000 rows | 10 cols
DIM  | dim_professionnel              |  1,048,575 rows |  5 cols
DIM  | dim_temps                      |      4,748 rows |  9 cols
FAIT | fait_consultation              |  1,027,157 rows | 13 cols
FAIT | fait_deces                     |    620,625 rows | 15 cols
FAIT | fait_hospitalisation           |     82,216 rows | 10 cols
FAIT | fait_satisfaction              |          8 rows | 16 cols
GOLD LAYER COMPLETE!


In [13]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()
print("Spark stopped")

Spark stopped
