In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, ArrayType

In [None]:
# Service-account key used for GCS access. GOOGLE_APPLICATION_CREDENTIALS wins
# when it is set (and non-empty), so the notebook can run on another machine
# without editing this cell; otherwise the original local path is used.
gcs_keyfile = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    or "C:/Users/ACER SWIFT X/Desktop/Projects/F1/f1-de-gcp-7735b3a7392e.json"
)

# Create (or reuse) the session with the GCS connector jar passed via spark.jars.
spark = (
    SparkSession.builder
    .appName("F1 Data Pipeline")
    .config("spark.ui.port", "4040")
    .config("spark.jars", "file:///C:/spark/spark-3.5.1-bin-hadoop3/jars/gcs-connector-hadoop3-latest.jar")
    .getOrCreate()
)

# Set GCS authentication and filesystem implementation.
# Keys are applied in the same order as before; only the keyfile value can now
# come from the environment.
gcs_settings = {
    "spark.hadoop.fs.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
    "spark.hadoop.fs.AbstractFileSystem.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
    "google.cloud.auth.service.account.enable": "true",
    "google.cloud.auth.service.account.json.keyfile": gcs_keyfile,
}
for conf_key, conf_value in gcs_settings.items():
    spark.conf.set(conf_key, conf_value)


In [3]:
# Schema of one element of the nested "Constructors" array.
# Note: the schema printed after the JSON read reports every field as
# nullable = true, so the False flags below are declarative only.
constructor_schema = StructType([
    StructField("constructorId", StringType(), False),
    StructField("url", StringType(), True),
    StructField("name", StringType(), True),
    StructField("nationality", StringType(), True),
])

# Top-level record: a season (read as a string) and its array of constructor structs.
constructors_schema = StructType([
    StructField("season", StringType(), False),
    StructField("Constructors", ArrayType(constructor_schema)),
])

In [4]:
# Load the raw constructors JSON from GCS using the explicit schema.
# multiLine is enabled so a single JSON record may span several lines.
constructors_df = (
    spark.read
    .schema(constructors_schema)
    .option("multiLine", True)
    .json("gs://f1-gcp/raw/constructors.json")
)

In [5]:
# Inspect the applied schema, then preview the first rows without truncating values.
constructors_df.printSchema()
constructors_df.show(n=20, truncate=False)

root
 |-- season: string (nullable = true)
 |-- Constructors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- constructorId: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- nationality: string (nullable = true)

+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
# Produce one row per (season, constructor): cast season to an integer and
# explode the "Constructors" array so each struct element gets its own row.
constructors_exploded_df = constructors_df.select(
    col("season").cast("int").alias("season"),
    explode(col("Constructors")).alias("Constructors"),
)

constructors_exploded_df.show(n=20, truncate=False)

+------+-----------------------------------------------------------------------------------+
|season|Constructors                                                                       |
+------+-----------------------------------------------------------------------------------+
|1950  |{adams, http://en.wikipedia.org/wiki/Adams_(constructor), Adams, American}         |
|1950  |{alfa, http://en.wikipedia.org/wiki/Alfa_Romeo_in_Formula_One, Alfa Romeo, Swiss}  |
|1950  |{alta, http://en.wikipedia.org/wiki/Alta_auto_racing_team, Alta, British}          |
|1950  |{cooper, http://en.wikipedia.org/wiki/Cooper_Car_Company, Cooper, British}         |
|1950  |{deidt, http://en.wikipedia.org/wiki/Deidt, Deidt, American}                       |
|1950  |{era, http://en.wikipedia.org/wiki/English_Racing_Automobiles, ERA, British}       |
|1950  |{ewing, http://en.wikipedia.org/wiki/Ewing_(constructor), Ewing, American}         |
|1950  |{ferrari, http://en.wikipedia.org/wiki/Scuderia_Ferrari, Ferra

In [7]:
# Flatten the exploded "Constructors" struct into top-level snake_case columns.
# One select replaces the chain of withColumn calls (each call adds its own
# projection to the plan) and simply omits the struct column, so no separate
# drop is needed. Resulting columns and their order are unchanged:
# season, constructor_id, constructor_url, constructor_name, constructor_nationality.
constructors_cast_df = constructors_exploded_df.select(
    "season",
    col("Constructors.constructorId").alias("constructor_id"),
    col("Constructors.url").alias("constructor_url"),
    col("Constructors.name").alias("constructor_name"),
    col("Constructors.nationality").alias("constructor_nationality"),
)
constructors_cast_df.show(truncate=False)

+------+--------------+-------------------------------------------------------+----------------+-----------------------+
|season|constructor_id|constructor_url                                        |constructor_name|constructor_nationality|
+------+--------------+-------------------------------------------------------+----------------+-----------------------+
|1950  |adams         |http://en.wikipedia.org/wiki/Adams_(constructor)       |Adams           |American               |
|1950  |alfa          |http://en.wikipedia.org/wiki/Alfa_Romeo_in_Formula_One |Alfa Romeo      |Swiss                  |
|1950  |alta          |http://en.wikipedia.org/wiki/Alta_auto_racing_team     |Alta            |British                |
|1950  |cooper        |http://en.wikipedia.org/wiki/Cooper_Car_Company        |Cooper          |British                |
|1950  |deidt         |http://en.wikipedia.org/wiki/Deidt                     |Deidt           |American               |
|1950  |era           |http://en

In [8]:
# Final column set: everything from the flattened frame except the constructor URL.
constructors_final_df = constructors_cast_df.drop("constructor_url")
constructors_final_df.show(n=20, truncate=False)

+------+--------------+----------------+-----------------------+
|season|constructor_id|constructor_name|constructor_nationality|
+------+--------------+----------------+-----------------------+
|1950  |adams         |Adams           |American               |
|1950  |alfa          |Alfa Romeo      |Swiss                  |
|1950  |alta          |Alta            |British                |
|1950  |cooper        |Cooper          |British                |
|1950  |deidt         |Deidt           |American               |
|1950  |era           |ERA             |British                |
|1950  |ewing         |Ewing           |American               |
|1950  |ferrari       |Ferrari         |Italian                |
|1950  |kurtis_kraft  |Kurtis Kraft    |American               |
|1950  |lago          |Talbot-Lago     |French                 |
|1950  |langley       |Langley         |American               |
|1950  |lesovsky      |Lesovsky        |American               |
|1950  |marchese      |Ma

In [9]:
# Print the schema of the final frame before writing it out
# (the captured output lists season as integer and the rest as strings).
constructors_final_df.printSchema()

root
 |-- season: integer (nullable = true)
 |-- constructor_id: string (nullable = true)
 |-- constructor_name: string (nullable = true)
 |-- constructor_nationality: string (nullable = true)



In [10]:
# Persist the final frame as Parquet, overwriting any previous output at this path.
(
    constructors_final_df.write
    .mode("overwrite")
    .parquet("gs://f1-gcp/processed/constructors/")
)

In [11]:
# Read the Parquet output back and display it as a check on the write.
spark.read.format("parquet").load("gs://f1-gcp/processed/constructors/").show()

+------+--------------+----------------+-----------------------+
|season|constructor_id|constructor_name|constructor_nationality|
+------+--------------+----------------+-----------------------+
|  1950|         adams|           Adams|               American|
|  1950|          alfa|      Alfa Romeo|                  Swiss|
|  1950|          alta|            Alta|                British|
|  1950|        cooper|          Cooper|                British|
|  1950|         deidt|           Deidt|               American|
|  1950|           era|             ERA|                British|
|  1950|         ewing|           Ewing|               American|
|  1950|       ferrari|         Ferrari|                Italian|
|  1950|  kurtis_kraft|    Kurtis Kraft|               American|
|  1950|          lago|     Talbot-Lago|                 French|
|  1950|       langley|         Langley|               American|
|  1950|      lesovsky|        Lesovsky|               American|
|  1950|      marchese|  

In [12]:
# Stop the Spark session; this is the last cell of the notebook.
spark.stop()