In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, concat
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [None]:
# Location of the service-account key used by the GCS connector. The standard
# GOOGLE_APPLICATION_CREDENTIALS variable takes precedence so the real key path
# does not have to be written into the notebook; the original placeholder path
# is kept as the fallback.
gcs_keyfile = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS", "path/to/service-account-file.json"
)

# Build (or reuse) the session. The GCS filesystem and authentication options
# are supplied on the builder with a uniform "spark.hadoop." prefix, so they are
# part of the configuration the session is created with rather than being
# patched onto it afterwards under a mix of prefixed and unprefixed keys.
spark = (
    SparkSession.builder
    .appName("F1 Data Pipeline")
    .config("spark.ui.port", "4040")
    .config("spark.jars", "file:///C:/spark/spark-3.5.1-bin-hadoop3/jars/gcs-connector-hadoop3-latest.jar")
    # GCS filesystem implementations for the gs:// scheme
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    # GCS service-account authentication
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", gcs_keyfile)
    .getOrCreate()
)


In [3]:
# Explicit schema for the raw drivers JSON. Every field is read as a string
# here (including dateOfBirth and permanentNumber); only driverId is declared
# non-nullable.
drivers_schema = (
    StructType()
    .add("driverId", StringType(), nullable=False)
    .add("permanentNumber", StringType(), nullable=True)
    .add("code", StringType(), nullable=True)
    .add("url", StringType(), nullable=True)
    .add("givenName", StringType(), nullable=True)
    .add("familyName", StringType(), nullable=True)
    .add("dateOfBirth", StringType(), nullable=True)
    .add("nationality", StringType(), nullable=True)
)

In [4]:
# Load the raw drivers file from the bucket using the explicit schema.
# multiLine is enabled so a JSON record may span several lines.
drivers_df = spark.read.json(
    "gs://f1-gcp/raw/drivers.json",
    schema=drivers_schema,
    multiLine=True,
)

In [5]:
# Inspect the load: first the column types, then the leading 20 rows with
# cell values printed in full instead of being cut off.
drivers_df.printSchema()
drivers_df.show(n=20, truncate=False)

root
 |-- driverId: string (nullable = true)
 |-- permanentNumber: string (nullable = true)
 |-- code: string (nullable = true)
 |-- url: string (nullable = true)
 |-- givenName: string (nullable = true)
 |-- familyName: string (nullable = true)
 |-- dateOfBirth: string (nullable = true)
 |-- nationality: string (nullable = true)

+------------+---------------+----+----------------------------------------------+----------+-----------+-----------+-----------+
|driverId    |permanentNumber|code|url                                           |givenName |familyName |dateOfBirth|nationality|
+------------+---------------+----+----------------------------------------------+----------+-----------+-----------+-----------+
|abate       |NULL           |NULL|http://en.wikipedia.org/wiki/Carlo_Mario_Abate|Carlo     |Abate      |1932-07-10 |Italian    |
|abecassis   |NULL           |NULL|http://en.wikipedia.org/wiki/George_Abecassis |George    |Abecassis  |1913-03-21 |British    |
|acheson     |NUL

In [9]:
# Remove the url column and add a single "name" column built as
# "<givenName> <familyName>". Note that concat() returns NULL when any of its
# inputs is NULL.
drivers_df = (
    drivers_df
    .drop("url")
    .withColumn("name", concat(col("givenName"), lit(" "), col("familyName")))
)
drivers_df.show(n=20, truncate=False)

+------------+---------------+----+----------+-----------+-----------+-----------+-----------------+
|driverId    |permanentNumber|code|givenName |familyName |dateOfBirth|nationality|name             |
+------------+---------------+----+----------+-----------+-----------+-----------+-----------------+
|abate       |NULL           |NULL|Carlo     |Abate      |1932-07-10 |Italian    |Carlo Abate      |
|abecassis   |NULL           |NULL|George    |Abecassis  |1913-03-21 |British    |George Abecassis |
|acheson     |NULL           |NULL|Kenny     |Acheson    |1957-11-27 |British    |Kenny Acheson    |
|adams       |NULL           |NULL|Philippe  |Adams      |1969-11-19 |Belgian    |Philippe Adams   |
|ader        |NULL           |NULL|Walt      |Ader       |1913-12-15 |American   |Walt Ader        |
|adolff      |NULL           |NULL|Kurt      |Adolff     |1921-11-05 |German     |Kurt Adolff      |
|agabashian  |NULL           |NULL|Fred      |Agabashian |1913-08-21 |American   |Fred Agab

In [10]:
# Project the output columns: rename to snake_case, cast permanentNumber to an
# integer and dateOfBirth to a date, and pass the remaining columns through
# unchanged. (Unwanted columns could equally be removed with .drop().)
drivers_final_df = drivers_df.select(
    col("driverId").alias("driver_id"),
    col("permanentNumber").cast("int").alias("permanent_number"),
    "code",
    "name",
    "nationality",
    col("dateOfBirth").cast("date").alias("date_of_birth"),
)
drivers_final_df.show(n=20, truncate=False)

+------------+----------------+----+-----------------+-----------+-------------+
|driver_id   |permanent_number|code|name             |nationality|date_of_birth|
+------------+----------------+----+-----------------+-----------+-------------+
|abate       |NULL            |NULL|Carlo Abate      |Italian    |1932-07-10   |
|abecassis   |NULL            |NULL|George Abecassis |British    |1913-03-21   |
|acheson     |NULL            |NULL|Kenny Acheson    |British    |1957-11-27   |
|adams       |NULL            |NULL|Philippe Adams   |Belgian    |1969-11-19   |
|ader        |NULL            |NULL|Walt Ader        |American   |1913-12-15   |
|adolff      |NULL            |NULL|Kurt Adolff      |German     |1921-11-05   |
|agabashian  |NULL            |NULL|Fred Agabashian  |American   |1913-08-21   |
|ahrens      |NULL            |NULL|Kurt Ahrens      |German     |1940-04-19   |
|aitken      |89              |AIT |Jack Aitken      |British    |1995-09-23   |
|albers      |NULL          

In [11]:
# Print the final column types to check the renames and the integer/date casts
# applied in the select above.
drivers_final_df.printSchema()

root
 |-- driver_id: string (nullable = true)
 |-- permanent_number: integer (nullable = true)
 |-- code: string (nullable = true)
 |-- name: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- date_of_birth: date (nullable = true)



In [12]:
# Persist the processed drivers as Parquet, replacing whatever is already
# stored at the target location.
drivers_final_df.write.parquet("gs://f1-gcp/processed/drivers/", mode="overwrite")

In [13]:
# Read the written Parquet data back and display the first rows as a check
# that the output is readable.
spark.read.format("parquet").load("gs://f1-gcp/processed/drivers/").show()

+------------+----------------+----+-----------------+-----------+-------------+
|   driver_id|permanent_number|code|             name|nationality|date_of_birth|
+------------+----------------+----+-----------------+-----------+-------------+
|       abate|            NULL|NULL|      Carlo Abate|    Italian|   1932-07-10|
|   abecassis|            NULL|NULL| George Abecassis|    British|   1913-03-21|
|     acheson|            NULL|NULL|    Kenny Acheson|    British|   1957-11-27|
|       adams|            NULL|NULL|   Philippe Adams|    Belgian|   1969-11-19|
|        ader|            NULL|NULL|        Walt Ader|   American|   1913-12-15|
|      adolff|            NULL|NULL|      Kurt Adolff|     German|   1921-11-05|
|  agabashian|            NULL|NULL|  Fred Agabashian|   American|   1913-08-21|
|      ahrens|            NULL|NULL|      Kurt Ahrens|     German|   1940-04-19|
|      aitken|              89| AIT|      Jack Aitken|    British|   1995-09-23|
|      albers|            NU

In [14]:
# Shut down the Spark session now that the pipeline has finished.
spark.stop()