# Run helper notebooks

In [0]:
%run "../helpers/configuration"

In [0]:
%run "../helpers/functions"

# Ingest and process raw data

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# Explicit schema for circuits.csv, built field by field in file column order.
# Only circuitId is declared non-nullable.
# NOTE(review): confirm whether the CSV reader actually enforces nullable=False.
circuits_schema = (
    StructType()
    .add("circuitId", IntegerType(), False)
    .add("circuitRef", StringType(), True)
    .add("name", StringType(), True)
    .add("location", StringType(), True)
    .add("country", StringType(), True)
    .add("lat", DoubleType(), True)
    .add("lng", DoubleType(), True)
    .add("alt", IntegerType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Load the raw circuits CSV: the first row is treated as a header and column
# types come from circuits_schema rather than being inferred.
circuits_df = (
    spark.read
    .schema(circuits_schema)
    .option("header", True)
    .csv(f"{raw_folder_path}/circuits.csv")
)
# display(circuits_df)  # uncomment to inspect the raw load

In [0]:
# Keep every column except url. drop() is used instead of select() so the
# retained columns do not all have to be listed.
from pyspark.sql.functions import col

url_col = col("url")
circuits_filtered = circuits_df.drop(url_col)
# display(circuits_filtered)  # uncomment to inspect the result

In [0]:
# Rename the camelCase and abbreviated source columns to descriptive
# snake_case names; all other columns keep their names.
circuits_renamed = (
    circuits_filtered
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
)
# display(circuits_renamed)  # uncomment to inspect the result

In [0]:
# Add ingestion date to df.
# add_ingestion_date is not defined in this notebook; presumably it is provided by
# the "../helpers/functions" notebook run above and returns the DataFrame with an
# ingestion timestamp column appended -- TODO confirm against the helper's definition.

circuits_final = add_ingestion_date(circuits_renamed)
# display(circuits_final)

In [0]:
# Persist the processed circuits data as parquet under the processed folder.
# "overwrite" replaces any existing output at that path.
(
    circuits_final.write
    .mode("overwrite")
    .parquet(f"{processed_folder_path}/circuits")
)

In [0]:
# Read the parquet output back to verify the write. The location is built from
# processed_folder_path, the same variable used when writing, so this check
# inspects the data just written instead of a hard-coded mount path that could
# drift from the configuration.
display(spark.read.parquet(f"{processed_folder_path}/circuits"))