In [10]:
from pyspark.sql import SparkSession

# Start (or reuse) the notebook's Spark session through the documented
# `SparkSession.builder` entry point instead of instantiating the nested
# `Builder` class by hand.
spark = SparkSession.builder.appName("automatic backfill").getOrCreate()

In [11]:
from pyspark.sql.types import StructType, StructField, TimestampType, FloatType, StringType

# Explicit column layout used by the CSV reads in this notebook; every
# field is declared nullable.
schema = (
    StructType()
    .add("datetime", TimestampType(), True)
    .add("temperature", FloatType(), True)
    .add("humidity", FloatType(), True)
    .add("file_path", StringType(), True)
)

In [12]:
from pyspark.sql.functions import input_file_name

# Load every raw CSV in the incoming directory with the explicit schema, then
# fill `file_path` with the path of the file each row was read from.
raw_df = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv("./automatic_backfill/data/raw/*.csv")
    .withColumn("file_path", input_file_name())
)

# Preview the first five rows without truncating long values
raw_df.show(5, truncate=False)

+-------------------+-----------+--------+------------------------------------------------------------+
|datetime           |temperature|humidity|file_path                                                   |
+-------------------+-----------+--------+------------------------------------------------------------+
|2025-05-01 00:00:00|18.5       |65.2    |file:///workspace/automatic_backfill/data/raw/2025-05-01.csv|
|2025-05-01 01:00:00|17.9       |67.5    |file:///workspace/automatic_backfill/data/raw/2025-05-01.csv|
|2025-05-01 02:00:00|17.2       |69.8    |file:///workspace/automatic_backfill/data/raw/2025-05-01.csv|
|2025-05-01 03:00:00|16.8       |71.0    |file:///workspace/automatic_backfill/data/raw/2025-05-01.csv|
|2025-05-01 04:00:00|16.5       |72.3    |file:///workspace/automatic_backfill/data/raw/2025-05-01.csv|
+-------------------+-----------+--------+------------------------------------------------------------+
only showing top 5 rows



In [13]:
# Load the bronze CSV files and keep only the distinct `file_path` values
bronze_df = (
    spark.read
    .csv("./automatic_backfill/data/bronze/", header=True, schema=schema)
    .select("file_path")
    .distinct()
)

# Display the paths in full (no truncation)
bronze_df.show(truncate=False)

+------------------------------------------------------------+
|file_path                                                   |
+------------------------------------------------------------+
|file:///workspace/automatic_backfill/data/raw/2025-05-03.csv|
|file:///workspace/automatic_backfill/data/raw/2025-05-01.csv|
+------------------------------------------------------------+



In [14]:
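# Disabled: overwrite the bronze directory with the contents of a DataFrame.
# NOTE: `df` is not defined in any cell shown above; substitute the frame that
# should be persisted before re-enabling this line.
# df.write.mode("overwrite").csv("./automatic_backfill/data/bronze/")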