# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, explode
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType

# Create (or reuse) the Spark session for this batch job, pointing it at the
# master at spark://spark-master:7077.
spark = (
    SparkSession.builder
    .appName("BatchProcessing")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

# Explicit schema applied when reading the JSON input. Every field is declared
# nullable. Date-like fields are read as plain strings here and converted later.
schema = StructType(
    # Integer-typed top-level fields.
    [
        StructField(int_field, IntegerType(), True)
        for int_field in ("id", "origin_id", "price_cents")
    ]
    # String-typed top-level fields.
    + [
        StructField(str_field, StringType(), True)
        for str_field in (
            "price_currency",
            "schedule",
            "updated_at",
            "timestamp_ingestion",
        )
    ]
    # `legs` is an array of structs whose fields are all read as strings.
    + [
        StructField(
            "legs",
            ArrayType(
                StructType(
                    [
                        StructField(leg_field, StringType(), True)
                        for leg_field in (
                            "schedule",
                            "service_type",
                            "arrival",
                            "service_name",
                            "one_luggage",
                            "destination_id",
                            "origin_id",
                            "departure",
                            "via_stations",
                            "bus_number",
                        )
                    ]
                )
            ),
            True,
        )
    ]
)

# Batch pipeline: read the raw JSON, flatten `legs`, convert the date-like
# string columns to timestamps, project the output columns and write Parquet.
# The session is stopped in `finally` so it is released even when a read,
# transform or write step raises.
try:
    # Read every file under the input directory using the explicit schema.
    data = spark.read.json("hdfs://namenode:9000/data/data_bbc/*", schema=schema)

    # Produce one row per element of `legs`, exposed as the struct column `leg`.
    # NOTE: explode() drops rows whose `legs` array is null or empty.
    data = data.withColumn("leg", explode("legs"))

    # Convert the top-level string columns to timestamps in place.
    data = (
        data
        .withColumn("schedule", to_timestamp(col("schedule")))
        .withColumn("updated_at", to_timestamp(col("updated_at")))
        .withColumn("timestamp_ingestion", to_timestamp(col("timestamp_ingestion")))
    )

    transformed_data = data.select(
        col("id"),
        col("origin_id"),
        col("price_cents"),
        col("price_currency"),
        col("schedule"),
        col("updated_at"),
        col("timestamp_ingestion"),
        # Convert the nested field here. withColumn("leg.schedule", ...) would
        # only add a separate top-level column whose name contains a dot, and
        # col("leg.schedule") would still resolve to the unparsed string field
        # inside the `leg` struct. `leg.arrival` and `leg.departure` are not
        # part of the output, so they are not converted.
        to_timestamp(col("leg.schedule")).alias("leg_schedule"),
        col("leg.service_type").alias("service_type"),
        col("leg.destination_id").alias("destination_id"),
        col("leg.service_name").alias("service_name"),
        col("leg.bus_number").alias("bus_number"),
    )

    # Replace any previous output at the target path.
    transformed_data.write.mode("overwrite").parquet("hdfs://namenode:9000/data/data_trans/")
finally:
    spark.stop()

