In [0]:
# Databricks Notebook: Bronze SBB Ingestion
# Objective: read JSON from ADLS Gen2 and create Bronze in Delta Lake

# -------------------------------
# 1️⃣ Spark session
# -------------------------------
from pyspark.sql import SparkSession

# getOrCreate() reuses an existing session when there is one,
# otherwise it builds a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [0]:
# -------------------------------
# 2️⃣ Define ADLS variables
# -------------------------------
import os

storage_account = "sbbapistorageaccount"
container = "data-container"

# The storage account key must never be committed in the notebook source.
# It is read from the SBB_STORAGE_ACCOUNT_KEY environment variable, which on
# Databricks can be populated from a secret scope in the cluster configuration
# (e.g. SBB_STORAGE_ACCOUNT_KEY={{secrets/<scope>/<key>}}).
# When the variable is unset the key falls back to an empty string, which is
# the value the notebook used before.
account_key = os.environ.get("SBB_STORAGE_ACCOUNT_KEY", "")

# Register the account key in the Spark configuration so that Spark can
# access the ADLS Gen2 storage account.
spark.conf.set(
    f"fs.azure.account.key.{storage_account}.dfs.core.windows.net",
    account_key,
)

# Path of the source JSON file
adls_path = f"abfss://{container}@{storage_account}.dfs.core.windows.net/data/sbb.json"
print("Chemin JSON :", adls_path)

In [0]:

# -------------------------------
# 3️⃣ Read JSON
# -------------------------------
# multiLine=True lets a single JSON record span several lines of the file.
# NOTE(review): no badRecordsPath is configured here, so malformed input is
# not redirected anywhere; add that option if error capture is required.
df_raw = spark.read.json(adls_path, multiLine=True)

# Preview the first 5 rows, then print the inferred schema
df_raw.show(n=5, truncate=False)
df_raw.printSchema()


In [0]:
# -------------------------------
# 4️⃣ Flatten the results array
# -------------------------------
from pyspark.sql.functions import explode, col

df_clean = (
    df_raw
    # One output row per element of the `results` array, keeping total_count
    .select(explode(col("results")).alias("result"), col("total_count"))
    # Expand the fields of the `result` struct into top-level columns
    .select(col("result.*"), col("total_count"))
)

# Quick look at a single flattened row
df_clean.show(n=1, truncate=False)

In [0]:
# -------------------------------
# 5️⃣ Define Bronze path
# -------------------------------
# Location of the Bronze data: the bronze/sbb folder inside the container and
# storage account named by `container` and `storage_account`.
bronze_path = f"abfss://{container}@{storage_account}.dfs.core.windows.net/bronze/sbb"
print("Chemin Bronze :", bronze_path)

In [0]:
# -------------------------------
# 6️⃣ Write Bronze in Delta Lake
# -------------------------------
# Overwrite whatever is stored at bronze_path with the flattened data.
# NOTE(review): mergeSchema is enabled; confirm whether overwriteSchema is
# needed instead if incompatible schema changes must be supported.
(
    df_clean.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(bronze_path)
)

print("✅ Bronze SBB créée avec succès")

In [0]:
# -------------------------------
# 7️⃣ Verification
# -------------------------------
# Read the Delta data back from bronze_path and preview 3 rows
df_bronze = spark.read.load(bronze_path, format="delta")
df_bronze.show(n=3, truncate=False)

In [0]:
# -------------------------------
# 8️⃣ Optional: create the Bronze SQL table
# -------------------------------
# Declares the table bronze_sbb over the Delta data located at bronze_path.
# The IF NOT EXISTS clause leaves an already-existing table untouched, so the
# confirmation message below is printed whether or not a table was created.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS bronze_sbb
USING DELTA
LOCATION '{bronze_path}'
""")
print("✅ Table SQL bronze_sbb créée")

In [0]:
# -------------------------------
# 9️⃣ Query the Bronze SQL table
# -------------------------------
# Plain string literal: the query has no placeholders to interpolate.
spark.sql("""
SELECT *
FROM bronze_sbb
LIMIT 3
""").show(truncate=False)