## Ingest circuits.csv file

#### Step 1 - Read the CSV file using the Spark DataFrame reader

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

#### Define the schema (column data types)

In [0]:
# Explicit schema for circuits.csv so column types are declared up front
# rather than inferred. Only circuitId is declared non-nullable.
circuits_schema = (
    StructType()
    .add("circuitId", IntegerType(), False)
    .add("circuitRef", StringType(), True)
    .add("name", StringType(), True)
    .add("location", StringType(), True)
    .add("country", StringType(), True)
    .add("lat", DoubleType(), True)
    .add("lng", DoubleType(), True)
    .add("alt", IntegerType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Load circuits.csv: the first line is treated as a header row, and column
# types come from circuits_schema.
circuits_df = (
    spark.read
    .option("header", True)
    .schema(circuits_schema)
    .csv("dbfs:/mnt/formula1courseds/raw/circuits.csv")
)

In [0]:
# Preview the DataFrame read from circuits.csv to check the parsed rows.
display(circuits_df)

#### Step 2 - Select only the required columns

In [0]:
# Keep only the columns listed below (the url column is left out).
circuits_selected_df = circuits_df.select(
    "circuitId",
    "circuitRef",
    "name",
    "location",
    "country",
    "lat",
    "lng",
    "alt",
)

In [0]:
# Preview the DataFrame after column selection.
display(circuits_selected_df)

#### Step 3 - Rename columns

In [0]:
# Switch camelCase / abbreviated source column names to descriptive
# snake_case names; columns not listed here keep their original names.
circuits_renamed_df = (
    circuits_selected_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
)

In [0]:
# Preview the DataFrame after the columns have been renamed.
display(circuits_renamed_df)

#### Step 4 - Add ingestion date column

In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# Append an ingestion_date column populated by current_timestamp().
circuits_final_df = circuits_renamed_df.withColumn(
    "ingestion_date",
    current_timestamp(),
)

In [0]:
# Preview the final DataFrame, including the ingestion_date column.
display(circuits_final_df)

#### Step 5 - Write into a Parquet File

In [0]:
# Persist the final DataFrame as Parquet; "overwrite" mode replaces any data
# already present at the target path.
(
    circuits_final_df.write
    .mode("overwrite")
    .parquet("/mnt/formula1courseds/processed/circuits")
)

In [0]:
# Optional sanity check - uncomment to read the written Parquet data back:
# display(spark.read.parquet("/mnt/formula1courseds/processed/circuits"))