In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, ArrayType

In [2]:
# Machine-specific locations can be overridden through environment variables so
# the notebook is not tied to a single workstation. The fallbacks are the
# original local paths, so the existing setup keeps working unchanged.
GCS_CONNECTOR_JAR = os.environ.get(
    "F1_GCS_CONNECTOR_JAR",
    "file:///C:/spark/spark-3.5.1-bin-hadoop3/jars/gcs-connector-hadoop3-latest.jar",
)
GCS_KEYFILE = os.environ.get(
    "F1_GCS_KEYFILE",
    "C:/Users/ACER SWIFT X/Desktop/Projects/F1/f1-de-gcp-7735b3a7392e.json",
)

# Fail fast with a readable Python error instead of starting a Spark session
# that is configured with a key file that does not exist.
if not os.path.isfile(GCS_KEYFILE):
    raise FileNotFoundError(
        f"GCS service account key file not found: {GCS_KEYFILE!r}. "
        "Set the F1_GCS_KEYFILE environment variable to the key file path."
    )

# Create (or reuse) the Spark session with the GCS connector jar attached.
spark = (
    SparkSession.builder
    .appName("F1 Data Pipeline")
    .config("spark.ui.port", "4040")
    .config("spark.jars", GCS_CONNECTOR_JAR)
    .getOrCreate()
)

# Set GCS authentication and filesystem implementation
spark.conf.set("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
spark.conf.set("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
spark.conf.set("google.cloud.auth.service.account.enable", "true")
spark.conf.set("google.cloud.auth.service.account.json.keyfile", GCS_KEYFILE)


In [3]:
# Explicit schema for the constructors JSON file: four flat string columns.
# NOTE(review): constructorId is declared non-nullable here, but printSchema()
# on the loaded DataFrame reports nullable = true for it, so the flag does not
# appear to be enforced by the read.
constructors_schema = (
    StructType()
    .add("constructorId", StringType(), False)
    .add("url", StringType(), True)
    .add("name", StringType(), True)
    .add("nationality", StringType(), True)
)

In [4]:
# Load the raw constructors file from GCS using the explicit schema.
# multiLine is enabled so a JSON record may span several lines of the file.
constructors_df = (
    spark.read
    .schema(constructors_schema)
    .option("multiLine", True)
    .json("gs://f1-gcp/raw/constructors.json")
)

In [5]:
# Inspect the loaded DataFrame: column names and types first ...
constructors_df.printSchema()
# ... then the first 20 rows with full, untruncated cell values.
constructors_df.show(n=20, truncate=False)

root
 |-- constructorId: string (nullable = true)
 |-- url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- nationality: string (nullable = true)

+--------------+-----------------------------------------------------------------+--------------+-------------+
|constructorId |url                                                              |name          |nationality  |
+--------------+-----------------------------------------------------------------+--------------+-------------+
|adams         |http://en.wikipedia.org/wiki/Adams_(constructor)                 |Adams         |American     |
|afm           |http://en.wikipedia.org/wiki/Alex_von_Falkenhausen_Motorenbau    |AFM           |German       |
|ags           |http://en.wikipedia.org/wiki/Automobiles_Gonfaronnaises_Sportives|AGS           |French       |
|alfa          |http://en.wikipedia.org/wiki/Alfa_Romeo_in_Formula_One           |Alfa Romeo    |Swiss        |
|alphatauri    |http://en.wikipedia.org/wiki/Scuder

In [7]:
# Build the processed DataFrame: drop the url column and rename the key
# column from camelCase to snake_case.
constructors_final_df = (
    constructors_df
    .drop(col("url"))
    .withColumnRenamed("constructorId", "constructor_id")
)

# Preview the result with untruncated cell values.
constructors_final_df.show(truncate=False)

+--------------+--------------+-------------+
|constructor_id|name          |nationality  |
+--------------+--------------+-------------+
|adams         |Adams         |American     |
|afm           |AFM           |German       |
|ags           |AGS           |French       |
|alfa          |Alfa Romeo    |Swiss        |
|alphatauri    |AlphaTauri    |Italian      |
|alpine        |Alpine F1 Team|French       |
|alta          |Alta          |British      |
|amon          |Amon          |New Zealander|
|apollon       |Apollon       |Swiss        |
|arrows        |Arrows        |British      |
|arzani-volpini|Arzani-Volpini|Italian      |
|aston_martin  |Aston Martin  |British      |
|ats           |ATS           |Italian      |
|bar           |BAR           |British      |
|behra-porsche |Behra-Porsche |Italian      |
|bellasi       |Bellasi       |Swiss        |
|benetton      |Benetton      |Italian      |
|bmw           |BMW           |German       |
|bmw_sauber    |BMW Sauber    |Ger

In [8]:
# Confirm the final column names and types before the DataFrame is written out.
constructors_final_df.printSchema()

root
 |-- constructor_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- nationality: string (nullable = true)



In [9]:
# Persist the processed constructors as Parquet on GCS, replacing any output
# already present at this location.
constructors_final_df.write.parquet("gs://f1-gcp/processed/constructors/", mode="overwrite")

In [10]:
# Read the Parquet output back from GCS and display it as a check on the write.
processed_constructors_df = spark.read.format("parquet").load("gs://f1-gcp/processed/constructors/")
processed_constructors_df.show()

+--------------+--------------+-------------+
|constructor_id|          name|  nationality|
+--------------+--------------+-------------+
|         adams|         Adams|     American|
|           afm|           AFM|       German|
|           ags|           AGS|       French|
|          alfa|    Alfa Romeo|        Swiss|
|    alphatauri|    AlphaTauri|      Italian|
|        alpine|Alpine F1 Team|       French|
|          alta|          Alta|      British|
|          amon|          Amon|New Zealander|
|       apollon|       Apollon|        Swiss|
|        arrows|        Arrows|      British|
|arzani-volpini|Arzani-Volpini|      Italian|
|  aston_martin|  Aston Martin|      British|
|           ats|           ATS|      Italian|
|           bar|           BAR|      British|
| behra-porsche| Behra-Porsche|      Italian|
|       bellasi|       Bellasi|        Swiss|
|      benetton|      Benetton|      Italian|
|           bmw|           BMW|       German|
|    bmw_sauber|    BMW Sauber|   

In [11]:
# Stop the Spark session created at the top of the notebook.
spark.stop()