In [1]:
from core.spark_session import create_spark_session

DATE_PATH = "2026/02/04"

spark = create_spark_session()

df = (
    spark.read
    .option("multiLine", True)
    .json(f"s3a://data-lake/bronze/adzuna/{DATE_PATH}/*.json")
)

df.show(5)


+--------------------+--------------------+------------+--------------------+------+
|            batch_id|         ingested_at|record_count|             records|source|
+--------------------+--------------------+------------+--------------------+------+
|4e8e87b3-08a2-4f6...|2026-02-04T07:41:...|         100|[{Adzuna::API::Re...|adzuna|
|18b0f14f-f1de-4cf...|2026-02-04T07:41:...|         100|[{Adzuna::API::Re...|adzuna|
|6512566d-4ff3-45b...|2026-02-04T07:41:...|          50|[{Adzuna::API::Re...|adzuna|
+--------------------+--------------------+------------+--------------------+------+



In [4]:
from pyspark.sql.functions import explode, col

jobs_df = (
    df
    .filter(col("records").isNotNull())
    .select(explode("records").alias("job"))
)
jobs_df.printSchema()

root
 |-- job: struct (nullable = true)
 |    |-- __CLASS__: string (nullable = true)
 |    |-- adref: string (nullable = true)
 |    |-- category: struct (nullable = true)
 |    |    |-- __CLASS__: string (nullable = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- tag: string (nullable = true)
 |    |-- company: struct (nullable = true)
 |    |    |-- __CLASS__: string (nullable = true)
 |    |    |-- display_name: string (nullable = true)
 |    |-- contract_time: string (nullable = true)
 |    |-- contract_type: string (nullable = true)
 |    |-- created: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- latitude: double (nullable = true)
 |    |-- location: struct (nullable = true)
 |    |    |-- __CLASS__: string (nullable = true)
 |    |    |-- area: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- display_name: string (nullable = true)
 |    

In [5]:
jobs_df.select(
      "job.title",
      "job.company.display_name",
      "job.location.display_name",
      "job.salary_min",
      "job.salary_max",
      "job.contract_type"
  ) \
  .show(10, truncate=False)


+---------------------------------------------------+----------------------+----------------------------------+----------+----------+-------------+
|title                                              |display_name          |display_name                      |salary_min|salary_max|contract_type|
+---------------------------------------------------+----------------------+----------------------------------+----------+----------+-------------+
|SEN / SEND Tutor, Staffordshire                    |Targeted Provision Ltd|Rolleston-On-Dove, Burton-On-Trent|58240.0   |72800.0   |contract     |
|Mobile Vehicle Technician - Carlisle               |RAC                   |Blencogo, Wigton                  |0.0       |50000.0   |NULL         |
|Mobile Mechanic                                    |RAC                   |Blencogo, Wigton                  |0.0       |50000.0   |NULL         |
|Mobile Vehicle Technician - Oxford                 |RAC                   |Crownhill, Milton Keynes          |0

In [7]:
from pyspark.sql.functions import col, count, when

check_null_job_df = (
    df
    .select(explode("records").alias("job"))
    .select("job.*")
)

null_stats = check_null_job_df.select([
    count(when(col(c).isNull(), c)).alias(c)
    for c in check_null_job_df.columns
])

null_stats.show(truncate=False)

+---------+-----+--------+-------+-------------+-------------+-------+-----------+---+--------+--------+---------+------------+-------------------+----------+----------+-----+
|__CLASS__|adref|category|company|contract_time|contract_type|created|description|id |latitude|location|longitude|redirect_url|salary_is_predicted|salary_max|salary_min|title|
+---------+-----+--------+-------+-------------+-------------+-------+-----------+---+--------+--------+---------+------------+-------------------+----------+----------+-----+
|0        |0    |0       |0      |86           |191          |0      |0          |0  |10      |0       |10       |0           |0                  |0         |0         |0    |
+---------+-----+--------+-------+-------------+-------------+-------+-----------+---+--------+--------+---------+------------+-------------------+----------+----------+-----+

