## Ingest races.csv file

#### Step 1 - Read the CSV file using the Spark DataFrame reader

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType, TimestampType
from pyspark.sql.functions import lit
from pyspark.sql import functions as F

#### Set the data types (explicit schema)

In [0]:
# Explicit column names and types for the races CSV; handed to the reader
# through .schema(). The third StructField argument is the nullable flag,
# so raceId is the only column declared non-nullable.
race_schema = StructType(
    fields=[
        StructField("raceId", IntegerType(), False),
        StructField("year", IntegerType(), True),
        StructField("round", IntegerType(), True),
        StructField("circuitId", IntegerType(), True),
        StructField("name", StringType(), True),
        StructField("date", DateType(), True),
        StructField("time", StringType(), True),
        StructField("url", StringType(), True),
    ]
)

In [0]:
# Read races.csv from the raw folder, treating the first line as a header
# and applying race_schema instead of inferring column types.
race_df = (
    spark.read
    .option("header", True)
    .schema(race_schema)
    .csv(f"{raw_folder_path}/races.csv")
)

In [0]:
# Show the freshly read DataFrame in the notebook output for a visual check.
display(race_df)

#### Step 2 - Add the ingestion date and race_timestamp to the DataFrame

In [0]:
# add_ingestion_date is not defined in this notebook (it comes from a %run
# include); presumably it adds the ingestion_date column selected later --
# confirm against includes/common_functions.
# race_timestamp is built by concatenating the date and time columns with a
# single space and converting the resulting string to a timestamp.
race_with_time_stamp_df = (
    add_ingestion_date(race_df)
    .withColumn(
        "race_timestamp",
        F.to_timestamp(F.concat(F.col("date"), lit(" "), F.col("time"))),
    )
)

In [0]:
# Show the DataFrame with the added columns in the notebook output.
display(race_with_time_stamp_df)

#### Step 3 - Select the required columns and rename them

In [0]:
# Keep only the columns carried forward; the source date, time and url
# columns are left out (date and time already feed race_timestamp).
race_selected_df = race_with_time_stamp_df.select(
    "raceId",
    "year",
    "round",
    "circuitId",
    "name",
    "ingestion_date",
    "race_timestamp",
)

In [0]:
# Rename the camelCase source columns to snake_case, and year to race_year
# (the column later used to partition the output).
race_renamed_df = (
    race_selected_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
)

In [0]:
# Show the final, renamed DataFrame in the notebook output before writing it.
display(race_renamed_df)

#### Step 4 - Write the output as Parquet files

In [0]:
# Write the result as Parquet under the processed folder, partitioned by
# race_year; "overwrite" mode replaces any output already at that path.
(
    race_renamed_df.write
    .mode("overwrite")
    .partitionBy("race_year")
    .parquet(f"{processed_folder_path}/races")
)

In [0]:
# Optional check: uncomment to read back and display the written Parquet data.
#display(spark.read.parquet(f"{processed_folder_path}/races"))