In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, current_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# Machine-specific locations. Each one can be overridden through an environment
# variable so the notebook runs on another machine without editing this cell;
# the fallbacks are the original local paths.
GCS_CONNECTOR_JAR = os.environ.get(
    "GCS_CONNECTOR_JAR",
    "file:///C:/spark/spark-3.5.1-bin-hadoop3/jars/gcs-connector-hadoop3-latest.jar",
)
GCS_KEYFILE = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "C:/Users/ACER SWIFT X/Desktop/Projects/F1/f1-de-gcp-7735b3a7392e.json",
)

# Build (or reuse) the Spark session.
# Hadoop properties are passed as "spark.hadoop.<property>" on the builder so
# that Spark copies them into the Hadoop configuration when the context is
# created, instead of being set on the session's runtime conf afterwards.
spark = (
    SparkSession.builder
    .appName("F1 Data Pipeline")
    .config("spark.ui.port", "4040")
    .config("spark.jars", GCS_CONNECTOR_JAR)
    # Filesystem implementations that handle gs:// paths.
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    # Service-account authentication for the GCS connector.
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", GCS_KEYFILE)
    .getOrCreate()
)


In [2]:
# Schema of the nested "Location" object: four nullable string members.
# lat/long are read as strings here and cast to double in a later cell.
location_schema = (
    StructType()
    .add("lat", StringType(), True)
    .add("long", StringType(), True)
    .add("locality", StringType(), True)
    .add("country", StringType(), True)
)

# Top-level schema of one circuit record; "Location" embeds the struct above.
circuits_schema = (
    StructType()
    .add("circuitId", StringType(), False)
    .add("circuitName", StringType(), True)
    .add("Location", location_schema)
    .add("url", StringType(), True)
)

In [4]:
# Load the raw circuits file from GCS with the explicit schema defined above.
# multiLine is enabled, as in the chained-option form of this read.
circuits_df = spark.read.json(
    "gs://f1-gcp/raw/circuits.json",
    schema=circuits_schema,
    multiLine=True,
)
circuits_df.show()

+------------+--------------------+--------------------+--------------------+
|   circuitId|         circuitName|            Location|                 url|
+------------+--------------------+--------------------+--------------------+
|    adelaide|Adelaide Street C...|{-34.9272, 138.61...|https://en.wikipe...|
|    ain-diab|            Ain Diab|{33.5786, -7.6875...|https://en.wikipe...|
|     aintree|             Aintree|{53.4769, -2.9405...|https://en.wikipe...|
| albert_park|Albert Park Grand...|{-37.8497, 144.96...|https://en.wikipe...|
|    americas|Circuit of the Am...|{30.1328, -97.641...|https://en.wikipe...|
|  anderstorp|Scandinavian Raceway|{57.2652999999999...|https://en.wikipe...|
|        avus|                AVUS|{52.4806, 13.2514...|https://en.wikipe...|
|     bahrain|Bahrain Internati...|{26.0325, 50.5106...|https://en.wikipe...|
|        baku|   Baku City Circuit|{40.3725, 49.8533...|https://en.wikipe...|
|    boavista|Circuito da Boavista|{41.1705, -8.6732...|https://

In [5]:
# Flatten the nested Location struct into top-level columns, converting the
# coordinates from string to double, then discard the original struct column.
circuits_cast_df = (
    circuits_df
    .withColumn("latitude", col("Location.lat").cast("double"))
    .withColumn("longitude", col("Location.long").cast("double"))
    .withColumn("locality", col("Location.locality"))
    .withColumn("country", col("Location.country"))
    .drop("Location")
)
circuits_cast_df.show(truncate=False)

+------------+------------------------------+------------------------------------------------------------+------------------+---------+----------------+-----------+
|circuitId   |circuitName                   |url                                                         |latitude          |longitude|locality        |country    |
+------------+------------------------------+------------------------------------------------------------+------------------+---------+----------------+-----------+
|adelaide    |Adelaide Street Circuit       |https://en.wikipedia.org/wiki/Adelaide_Street_Circuit       |-34.9272          |138.617  |Adelaide        |Australia  |
|ain-diab    |Ain Diab                      |https://en.wikipedia.org/wiki/Ain-Diab_Circuit              |33.5786           |-7.6875  |Casablanca      |Morocco    |
|aintree     |Aintree                       |https://en.wikipedia.org/wiki/Aintree_Motor_Racing_Circuit  |53.4769           |-2.94056 |Liverpool       |UK         |
|albert_pa

In [6]:
# Rename the camelCase source columns to snake_case.
# The chain is wrapped in parentheses rather than ended with a trailing
# backslash, so it does not depend on a blank line following it.
circuits_renamed_df = (
    circuits_cast_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitName", "circuit_name")
)
circuits_renamed_df.show(truncate=False)

+------------+------------------------------+------------------------------------------------------------+------------------+---------+----------------+-----------+
|circuit_id  |circuit_name                  |url                                                         |latitude          |longitude|locality        |country    |
+------------+------------------------------+------------------------------------------------------------+------------------+---------+----------------+-----------+
|adelaide    |Adelaide Street Circuit       |https://en.wikipedia.org/wiki/Adelaide_Street_Circuit       |-34.9272          |138.617  |Adelaide        |Australia  |
|ain-diab    |Ain Diab                      |https://en.wikipedia.org/wiki/Ain-Diab_Circuit              |33.5786           |-7.6875  |Casablanca      |Morocco    |
|aintree     |Aintree                       |https://en.wikipedia.org/wiki/Aintree_Motor_Racing_Circuit  |53.4769           |-2.94056 |Liverpool       |UK         |
|albert_pa

In [8]:
# Keep only the columns wanted in the processed dataset, in output order.
# The "url" column is not selected and is therefore dropped here.
circuits_final_df = circuits_renamed_df.select(
    "circuit_id",
    "circuit_name",
    "country",
    "locality",
    "latitude",
    "longitude",
)
circuits_final_df.show(truncate=False)

+------------+------------------------------+-----------+----------------+------------------+---------+
|circuit_id  |circuit_name                  |country    |locality        |latitude          |longitude|
+------------+------------------------------+-----------+----------------+------------------+---------+
|adelaide    |Adelaide Street Circuit       |Australia  |Adelaide        |-34.9272          |138.617  |
|ain-diab    |Ain Diab                      |Morocco    |Casablanca      |33.5786           |-7.6875  |
|aintree     |Aintree                       |UK         |Liverpool       |53.4769           |-2.94056 |
|albert_park |Albert Park Grand Prix Circuit|Australia  |Melbourne       |-37.8497          |144.968  |
|americas    |Circuit of the Americas       |USA        |Austin          |30.1328           |-97.6411 |
|anderstorp  |Scandinavian Raceway          |Sweden     |Anderstorp      |57.265299999999975|13.6042  |
|avus        |AVUS                          |Germany    |Berlin 

In [9]:
# Print each column's name, type and nullability to check the final layout before writing.
circuits_final_df.printSchema()

root
 |-- circuit_id: string (nullable = true)
 |-- circuit_name: string (nullable = true)
 |-- country: string (nullable = true)
 |-- locality: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [10]:
# Write the processed circuits to GCS as Parquet, using "overwrite" save mode.
(
    circuits_final_df.write
    .mode("overwrite")
    .parquet("gs://f1-gcp/processed/circuits/")
)

In [12]:
# Read the Parquet output back from GCS and display it to confirm the write.
(
    spark.read
    .parquet("gs://f1-gcp/processed/circuits/")
    .show()
)

+------------+--------------------+-----------+----------------+------------------+---------+
|  circuit_id|        circuit_name|    country|        locality|          latitude|longitude|
+------------+--------------------+-----------+----------------+------------------+---------+
|    adelaide|Adelaide Street C...|  Australia|        Adelaide|          -34.9272|  138.617|
|    ain-diab|            Ain Diab|    Morocco|      Casablanca|           33.5786|  -7.6875|
|     aintree|             Aintree|         UK|       Liverpool|           53.4769| -2.94056|
| albert_park|Albert Park Grand...|  Australia|       Melbourne|          -37.8497|  144.968|
|    americas|Circuit of the Am...|        USA|          Austin|           30.1328| -97.6411|
|  anderstorp|Scandinavian Raceway|     Sweden|      Anderstorp|57.265299999999975|  13.6042|
|        avus|                AVUS|    Germany|          Berlin|           52.4806|  13.2514|
|     bahrain|Bahrain Internati...|    Bahrain|          Sak

In [13]:
# Stop the Spark session now that the pipeline has finished.
spark.stop()