In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, concat, explode
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType, ArrayType

In [None]:
# Machine-specific locations. Each can be overridden with an environment
# variable so the notebook runs on other machines without editing code; the
# defaults preserve the original local setup.
GCS_CONNECTOR_JAR = os.environ.get(
    "GCS_CONNECTOR_JAR",
    "file:///C:/spark/spark-3.5.1-bin-hadoop3/jars/gcs-connector-hadoop3-latest.jar",
)
GCS_KEYFILE = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "C:/Users/ACER SWIFT X/Desktop/Projects/F1/f1-de-gcp-7735b3a7392e.json",
)

# Build the session with the GCS filesystem implementation and service-account
# authentication supplied up front. Hadoop properties are given with the
# "spark.hadoop." prefix on the builder so they are part of the configuration
# the session starts with, rather than being set on an already-running session.
spark = (
    SparkSession.builder
    .appName("F1 Data Pipeline")
    .config("spark.ui.port", "4040")
    .config("spark.jars", GCS_CONNECTOR_JAR)
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", GCS_KEYFILE)
    .getOrCreate()
)


In [3]:
# Schema of the raw drivers JSON: a season plus an array of driver structs.
# Every field is read as a string; typed casts are applied in later cells.
driver_struct = (
    StructType()
    .add("driverId", StringType(), nullable=False)
    .add("permanentNumber", StringType(), nullable=True)
    .add("code", StringType(), nullable=True)
    .add("url", StringType(), nullable=True)
    .add("givenName", StringType(), nullable=True)
    .add("familyName", StringType(), nullable=True)
    .add("dateOfBirth", StringType(), nullable=True)
    .add("nationality", StringType(), nullable=True)
)

drivers_schema = (
    StructType()
    .add("season", StringType(), nullable=False)
    .add("Drivers", ArrayType(driver_struct), nullable=True)
)

In [4]:
# Load the multi-line drivers JSON from GCS using the explicit schema.
drivers_df = spark.read.json(
    "gs://f1-gcp/raw/drivers.json",
    schema=drivers_schema,
    multiLine=True,
)

In [5]:
# Show the schema Spark applied, then preview the raw rows with untruncated values.
drivers_df.printSchema()
drivers_df.show(n=20, truncate=False)

root
 |-- season: string (nullable = true)
 |-- Drivers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- driverId: string (nullable = true)
 |    |    |-- permanentNumber: string (nullable = true)
 |    |    |-- code: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |    |    |-- givenName: string (nullable = true)
 |    |    |-- familyName: string (nullable = true)
 |    |    |-- dateOfBirth: string (nullable = true)
 |    |    |-- nationality: string (nullable = true)

+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
# Flatten the nested "Drivers" array: one output row per driver for each season.
drivers_exploded_df = drivers_df.select(
    col("season"),
    explode(col("Drivers")).alias("Drivers"),
)
drivers_exploded_df.show(n=20, truncate=False)

+------+----------------------------------------------------------------------------------------------------------------------+
|season|Drivers                                                                                                               |
+------+----------------------------------------------------------------------------------------------------------------------+
|1950  |{ader, NULL, NULL, http://en.wikipedia.org/wiki/Walt_Ader, Walt, Ader, 1913-12-15, American}                          |
|1950  |{agabashian, NULL, NULL, http://en.wikipedia.org/wiki/Fred_Agabashian, Fred, Agabashian, 1913-08-21, American}        |
|1950  |{ascari, NULL, NULL, http://en.wikipedia.org/wiki/Alberto_Ascari, Alberto, Ascari, 1918-07-13, Italian}               |
|1950  |{banks, NULL, NULL, http://en.wikipedia.org/wiki/Henry_Banks, Henry, Banks, 1913-06-14, American}                     |
|1950  |{bettenhausen, NULL, NULL, http://en.wikipedia.org/wiki/Tony_Bettenhausen, Tony, Bettenhausen, 1

In [7]:
# Promote the nested driver fields to snake_case top-level columns in a single
# projection, casting the permanent number to an integer and the birth date to
# a date, and deriving the full driver name from the given and family names.
drivers_cast_df = drivers_exploded_df.select(
    col("season"),
    col("Drivers.driverId").alias("driver_id"),
    col("Drivers.permanentNumber").cast(IntegerType()).alias("permanent_number"),
    col("Drivers.code").alias("code"),
    col("Drivers.url").alias("url"),
    col("Drivers.givenName").alias("given_name"),
    col("Drivers.familyName").alias("family_name"),
    col("Drivers.dateOfBirth").cast(DateType()).alias("date_of_birth"),
    col("Drivers.nationality").alias("nationality"),
    concat(
        col("Drivers.givenName"), lit(" "), col("Drivers.familyName")
    ).alias("driver_name"),
)
drivers_cast_df.show(n=20, truncate=False)

+------+-------------+----------------+----+----------------------------------------------------+----------+------------+-------------+-----------+------------------+
|season|driver_id    |permanent_number|code|url                                                 |given_name|family_name |date_of_birth|nationality|driver_name       |
+------+-------------+----------------+----+----------------------------------------------------+----------+------------+-------------+-----------+------------------+
|1950  |ader         |NULL            |NULL|http://en.wikipedia.org/wiki/Walt_Ader              |Walt      |Ader        |1913-12-15   |American   |Walt Ader         |
|1950  |agabashian   |NULL            |NULL|http://en.wikipedia.org/wiki/Fred_Agabashian        |Fred      |Agabashian  |1913-08-21   |American   |Fred Agabashian   |
|1950  |ascari       |NULL            |NULL|http://en.wikipedia.org/wiki/Alberto_Ascari         |Alberto   |Ascari      |1918-07-13   |Italian    |Alberto Ascari    

In [8]:
# Keep only the output columns, storing the season as an integer
# (dropping the unwanted columns with .drop() would work as well).
drivers_final_df = drivers_cast_df.select(
    col("season").cast(IntegerType()).alias("season"),
    "driver_id",
    "permanent_number",
    "code",
    "driver_name",
    "date_of_birth",
    "nationality",
)
drivers_final_df.show(n=20, truncate=False)

+------+-------------+----------------+----+------------------+-------------+-----------+
|season|driver_id    |permanent_number|code|driver_name       |date_of_birth|nationality|
+------+-------------+----------------+----+------------------+-------------+-----------+
|1950  |ader         |NULL            |NULL|Walt Ader         |1913-12-15   |American   |
|1950  |agabashian   |NULL            |NULL|Fred Agabashian   |1913-08-21   |American   |
|1950  |ascari       |NULL            |NULL|Alberto Ascari    |1918-07-13   |Italian    |
|1950  |banks        |NULL            |NULL|Henry Banks       |1913-06-14   |American   |
|1950  |bettenhausen |NULL            |NULL|Tony Bettenhausen |1916-09-12   |American   |
|1950  |biondetti    |NULL            |NULL|Clemente Biondetti|1898-08-18   |Italian    |
|1950  |bira         |NULL            |NULL|Prince Bira       |1914-07-15   |Thai       |
|1950  |bonetto      |NULL            |NULL|Felice Bonetto    |1903-06-09   |Italian    |
|1950  |br

In [9]:
# Print the final schema to confirm the column types after the casts.
drivers_final_df.printSchema()

root
 |-- season: integer (nullable = true)
 |-- driver_id: string (nullable = true)
 |-- permanent_number: integer (nullable = true)
 |-- code: string (nullable = true)
 |-- driver_name: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- nationality: string (nullable = true)



In [10]:
# Persist the processed drivers as Parquet on GCS, replacing any previous output.
drivers_final_df.write.parquet("gs://f1-gcp/processed/drivers/", mode="overwrite")

In [11]:
# Read the written Parquet data back and display it as a sanity check.
spark.read.format("parquet").load("gs://f1-gcp/processed/drivers/").show()

+------+-------------+----------------+----+------------------+-------------+-----------+
|season|    driver_id|permanent_number|code|       driver_name|date_of_birth|nationality|
+------+-------------+----------------+----+------------------+-------------+-----------+
|  1950|         ader|            NULL|NULL|         Walt Ader|   1913-12-15|   American|
|  1950|   agabashian|            NULL|NULL|   Fred Agabashian|   1913-08-21|   American|
|  1950|       ascari|            NULL|NULL|    Alberto Ascari|   1918-07-13|    Italian|
|  1950|        banks|            NULL|NULL|       Henry Banks|   1913-06-14|   American|
|  1950| bettenhausen|            NULL|NULL| Tony Bettenhausen|   1916-09-12|   American|
|  1950|    biondetti|            NULL|NULL|Clemente Biondetti|   1898-08-18|    Italian|
|  1950|         bira|            NULL|NULL|       Prince Bira|   1914-07-15|       Thai|
|  1950|      bonetto|            NULL|NULL|    Felice Bonetto|   1903-06-09|    Italian|
|  1950|  

In [12]:
# Stop the Spark session at the end of the notebook.
spark.stop()