# Ingest circuits.csv file

#### Step 1 - Read the CSV file using the spark dataframe reader

In [None]:
# Import the types we want to use
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType



In [None]:
# Explicit schema for circuits.csv: column names and types are declared up
# front. Only circuitId is declared as non-nullable.
circuits_schema = StructType(
    [
        StructField("circuitId", IntegerType(), nullable=False),
        StructField("circuitRef", StringType(), nullable=True),
        StructField("name", StringType(), nullable=True),
        StructField("location", StringType(), nullable=True),
        StructField("country", StringType(), nullable=True),
        StructField("lat", DoubleType(), nullable=True),
        StructField("lng", DoubleType(), nullable=True),
        StructField("alt", IntegerType(), nullable=True),
        StructField("url", StringType(), nullable=True),
    ]
)



In [None]:
# Read circuits.csv from the raw folder, treating the first row as the header
# and applying the explicit circuits_schema.
# NOTE: .option("inferSchema", True) is deliberately not used here; it is only
# suitable for dev and is not best practice in prod.
circuits_df = (
    spark.read
    .option("header", True)
    .schema(circuits_schema)
    .csv('/mnt/formula1lgdl/raw/circuits.csv')
)





In [None]:
# Step 2 - Select only the required columns, by name, using df.select
# (every column of the schema except "url").
circuits_selected_df = circuits_df.select(
    "circuitId",
    "circuitRef",
    "name",
    "location",
    "country",
    "lat",
    "lng",
    "alt",
)




In [None]:
# The same selection can also be written with the col() function, which passes
# Column objects to select() instead of plain column-name strings.
from pyspark.sql.functions import col

circuits_selected_df = circuits_df.select(
    col("circuitId"),
    col("circuitRef"),
    col("name"),
    col("location"),
    col("country"),
    col("lat"),
    col("lng"),
    col("alt"),
)
# With this method, further Column methods can be applied to a column, e.g.:
#     col("lat").alias("latitude")



#### Step 3 - Rename the columns as required

In [None]:
# Rename the camelCase and abbreviated source columns to descriptive
# snake_case names; columns not listed here keep their original names.
circuits_renamed_df = (
    circuits_selected_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
)



#### Step 4 - Add ingestion date to the dataframe

In [None]:
from pyspark.sql.functions import current_timestamp, lit



In [None]:
# Append an "ingestion_date" column, populated with the current timestamp,
# using DataFrame.withColumn().
circuits_final_df = circuits_renamed_df.withColumn(
    "ingestion_date",
    current_timestamp(),
)



#### Step 5 - Write to datalake as parquet

In [None]:
# Write the final dataframe as parquet to the processed folder, replacing any
# existing output ("overwrite" mode).
# The path is absolute (leading "/"), matching the "/mnt/formula1lgdl/raw/..."
# path used for reading; without the leading slash it is a relative path and
# does not point at the /mnt/formula1lgdl location.
circuits_final_df.write.mode("overwrite").parquet("/mnt/formula1lgdl/processed/circuits")

