In [1]:
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType

In [None]:
# Build (or reuse) the Spark session, with the GCS connector jar listed in spark.jars.
spark = (
    SparkSession.builder
    .appName("F1 Data Pipeline")
    .config("spark.ui.port", "4040")
    .config("spark.jars", "file:///C:/spark/spark-3.5.1-bin-hadoop3/jars/gcs-connector-hadoop3-latest.jar")
    .getOrCreate()
)

# GCS filesystem implementation and service-account authentication settings,
# applied to the session's runtime configuration in the order listed.
# NOTE(review): the keyfile value looks like a placeholder path - confirm it is
# replaced with a real service-account key location before running.
_gcs_settings = (
    ("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
    ("google.cloud.auth.service.account.enable", "true"),
    ("google.cloud.auth.service.account.json.keyfile", "path/to/service-account-file.json"),
)
for _conf_key, _conf_value in _gcs_settings:
    spark.conf.set(_conf_key, _conf_value)


In [3]:
# Innermost struct: the circuit's "Location". Coordinates are declared as
# strings here; a later cell casts them to doubles.
location_schema = StructType([
    StructField("lat", StringType(), nullable=True),
    StructField("long", StringType(), nullable=True),
    StructField("locality", StringType(), nullable=True),
    StructField("country", StringType(), nullable=True),
])

# The "Circuit" struct, which embeds the Location struct above.
circuits_schema = StructType([
    StructField("circuitId", StringType(), nullable=True),
    StructField("url", StringType(), nullable=True),
    StructField("circuitName", StringType(), nullable=True),
    StructField("Location", location_schema),  # nested struct
])

# Top-level race record. Every scalar field is declared as a string; "season"
# is the only field declared non-nullable.
# NOTE: printSchema() on the loaded frame still reports season as
# nullable = true, so this flag is evidently not enforced on read.
races_schema = StructType([
    StructField("season", StringType(), nullable=False),
    StructField("round", StringType(), nullable=True),
    StructField("url", StringType(), nullable=True),
    StructField("raceName", StringType(), nullable=True),
    StructField("Circuit", circuits_schema),  # nested struct
    StructField("date", StringType(), nullable=True),
])

In [4]:
# Load the raw races JSON from GCS using the explicit schema, with the JSON
# reader's multiLine option enabled.
races_df = (
    spark.read
    .schema(races_schema)
    .option("multiLine", True)
    .json("gs://f1-gcp/raw/races.json")
)

In [5]:
# Inspect what was loaded: first the schema Spark applied, then the rows
# with full (untruncated) column values.
races_df.printSchema()
races_df.show(truncate=False)

root
 |-- season: string (nullable = true)
 |-- round: string (nullable = true)
 |-- url: string (nullable = true)
 |-- raceName: string (nullable = true)
 |-- Circuit: struct (nullable = true)
 |    |-- circuitId: string (nullable = true)
 |    |-- url: string (nullable = true)
 |    |-- circuitName: string (nullable = true)
 |    |-- Location: struct (nullable = true)
 |    |    |-- lat: string (nullable = true)
 |    |    |-- long: string (nullable = true)
 |    |    |-- locality: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |-- date: string (nullable = true)

+------+-----+-----------------------------------------------------+------------------+---------------------------------------------------------------------------------------------------------------------------------------------+----------+
|season|round|url                                                  |raceName          |Circuit                                                                 

In [6]:
# Flatten the nested Circuit struct into top-level columns, casting the
# latitude/longitude strings to doubles along the way.
races_cast_df = (
    races_df
    .withColumn("latitude", col("Circuit.Location.lat").cast(DoubleType()))
    .withColumn("longitude", col("Circuit.Location.long").cast(DoubleType()))
    .withColumn("locality", col("Circuit.Location.locality"))
    .withColumn("country", col("Circuit.Location.country"))
    .withColumn("circuit_id", col("Circuit.circuitId"))
    .withColumn("circuit_url", col("Circuit.url"))
    .withColumn("circuit_name", col("Circuit.circuitName"))
    # "Location" only exists nested inside "Circuit" (it is never a top-level
    # column), so dropping "Circuit" removes it as well; a separate
    # drop("Location") would be a no-op.
    .drop("Circuit")
)
races_cast_df.show(truncate=False)

+------+-----+-----------------------------------------------------+------------------+----------+------------------+---------+------------+-----------+------------+----------------------------------------------------------+----------------------------+
|season|round|url                                                  |raceName          |date      |latitude          |longitude|locality    |country    |circuit_id  |circuit_url                                               |circuit_name                |
+------+-----+-----------------------------------------------------+------------------+----------+------------------+---------+------------+-----------+------------+----------------------------------------------------------+----------------------------+
|1950  |1    |https://en.wikipedia.org/wiki/1950_British_Grand_Prix|British Grand Prix|1950-05-13|52.0786           |-1.01694 |Silverstone |UK         |silverstone |https://en.wikipedia.org/wiki/Silverstone_Circuit         |Silverstone Ci

In [7]:
# Bring the remaining camelCase / generic column names in line with the
# snake_case names used for the flattened circuit columns.
races_renamed_df = (
    races_cast_df
    .withColumnRenamed("raceName", "race_name")
    .withColumnRenamed("date", "race_date")
)
races_renamed_df.show(truncate=False)

+------+-----+-----------------------------------------------------+------------------+----------+------------------+---------+------------+-----------+------------+----------------------------------------------------------+----------------------------+
|season|round|url                                                  |race_name         |race_date |latitude          |longitude|locality    |country    |circuit_id  |circuit_url                                               |circuit_name                |
+------+-----+-----------------------------------------------------+------------------+----------+------------------+---------+------------+-----------+------------+----------------------------------------------------------+----------------------------+
|1950  |1    |https://en.wikipedia.org/wiki/1950_British_Grand_Prix|British Grand Prix|1950-05-13|52.0786           |-1.01694 |Silverstone |UK         |silverstone |https://en.wikipedia.org/wiki/Silverstone_Circuit         |Silverstone Ci

In [8]:
# Keep only the race-level columns and give them their final types:
# season/round as integers, race_date as a date. The flattened circuit
# columns are not carried into this frame.
races_final_df = races_renamed_df.select(
    col("season").cast(IntegerType()).alias("season"),
    col("round").cast(IntegerType()).alias("round"),
    col("race_name"),
    col("race_date").cast(DateType()).alias("race_date"),
)
races_final_df.show(truncate=False)

+------+-----+------------------+----------+
|season|round|race_name         |race_date |
+------+-----+------------------+----------+
|1950  |1    |British Grand Prix|1950-05-13|
|1950  |2    |Monaco Grand Prix |1950-05-21|
|1950  |3    |Indianapolis 500  |1950-05-30|
|1950  |4    |Swiss Grand Prix  |1950-06-04|
|1950  |5    |Belgian Grand Prix|1950-06-18|
|1950  |6    |French Grand Prix |1950-07-02|
|1950  |7    |Italian Grand Prix|1950-09-03|
|1951  |1    |Swiss Grand Prix  |1951-05-27|
|1951  |2    |Indianapolis 500  |1951-05-30|
|1951  |3    |Belgian Grand Prix|1951-06-17|
|1951  |4    |French Grand Prix |1951-07-01|
|1951  |5    |British Grand Prix|1951-07-14|
|1951  |6    |German Grand Prix |1951-07-29|
|1951  |7    |Italian Grand Prix|1951-09-16|
|1951  |8    |Spanish Grand Prix|1951-10-28|
|1952  |1    |Swiss Grand Prix  |1952-05-18|
|1952  |2    |Indianapolis 500  |1952-05-30|
|1952  |3    |Belgian Grand Prix|1952-06-22|
|1952  |4    |French Grand Prix |1952-07-06|
|1952  |5 

In [9]:
# Confirm the casts took effect (integer season/round, date race_date).
races_final_df.printSchema()

root
 |-- season: integer (nullable = true)
 |-- round: integer (nullable = true)
 |-- race_name: string (nullable = true)
 |-- race_date: date (nullable = true)



In [10]:
# Persist the final races frame as Parquet in the processed area of the
# bucket; "overwrite" mode replaces whatever is already at that path.
(
    races_final_df.write
    .mode("overwrite")
    .format("parquet")
    .save("gs://f1-gcp/processed/races/")
)

In [11]:
# Read the Parquet output back from GCS and display it as a check on the write.
spark.read.parquet("gs://f1-gcp/processed/races/").show()

+------+-----+------------------+----------+
|season|round|         race_name| race_date|
+------+-----+------------------+----------+
|  1950|    1|British Grand Prix|1950-05-13|
|  1950|    2| Monaco Grand Prix|1950-05-21|
|  1950|    3|  Indianapolis 500|1950-05-30|
|  1950|    4|  Swiss Grand Prix|1950-06-04|
|  1950|    5|Belgian Grand Prix|1950-06-18|
|  1950|    6| French Grand Prix|1950-07-02|
|  1950|    7|Italian Grand Prix|1950-09-03|
|  1951|    1|  Swiss Grand Prix|1951-05-27|
|  1951|    2|  Indianapolis 500|1951-05-30|
|  1951|    3|Belgian Grand Prix|1951-06-17|
|  1951|    4| French Grand Prix|1951-07-01|
|  1951|    5|British Grand Prix|1951-07-14|
|  1951|    6| German Grand Prix|1951-07-29|
|  1951|    7|Italian Grand Prix|1951-09-16|
|  1951|    8|Spanish Grand Prix|1951-10-28|
|  1952|    1|  Swiss Grand Prix|1952-05-18|
|  1952|    2|  Indianapolis 500|1952-05-30|
|  1952|    3|Belgian Grand Prix|1952-06-22|
|  1952|    4| French Grand Prix|1952-07-06|
|  1952|  

In [12]:
# Shut down the Spark session now that the pipeline has finished.
spark.stop()