# Ingest races.csv file

#### Step 1 - Read the CSV file using the spark dataframe reader

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType
from pyspark.sql.functions import col, to_timestamp, concat, lit, current_timestamp

In [None]:
# List the contents of the raw mount folder and render the listing in the notebook.
raw_folder_listing = dbutils.fs.ls("/mnt/formula1lgdl/raw")
display(raw_folder_listing)

In [None]:
# Explicit schema passed to the CSV reader for races.csv.
# Each entry is (column name, Spark data type, nullable).
races_schema = StructType(
    fields=[
        StructField(column_name, column_type, is_nullable)
        for column_name, column_type, is_nullable in [
            ("raceId", IntegerType(), False),
            ("year", IntegerType(), True),
            ("round", IntegerType(), True),
            ("circuitId", IntegerType(), True),
            ("name", StringType(), True),
            ("date", DateType(), True),
            ("time", StringType(), True),
            ("url", StringType(), True),
        ]
    ]
)

In [None]:
# Read races.csv from the raw folder, treating the first row as a header
# and applying the explicit races_schema.
races_df = (
    spark.read
    .option("header", True)
    .schema(races_schema)
    .csv("/mnt/formula1lgdl/raw/races.csv")
)

#### Step 2 - Rename the columns as required

In [None]:
# Rename the camelCase source columns to snake_case.
# The input is races_df, the DataFrame produced by the CSV read in Step 1.
races_df_renamed = (
    races_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("year", "race_year")
    .withColumnRenamed("circuitId", "circuit_id")
)

#### Step 3 - Add ingestion date to the dataframe

In [None]:
# Add an ingestion_date column populated from Spark's current_timestamp().
races_df_renamed = races_df_renamed.withColumn("ingestion_date", current_timestamp())

#### Step 4 - Create the race_timestamp column

In [None]:
# Join the date and time columns with a single space, then parse the
# combined text into a race_timestamp column.
race_datetime_text = concat(col("date"), lit(" "), col("time"))
races_df_renamed = races_df_renamed.withColumn(
    "race_timestamp",
    to_timestamp(race_datetime_text, "yyyy-MM-dd HH:mm:ss"),
)

#### Step 5 - Select the required columns

In [None]:
# Keep only the columns needed downstream; date, time and url are not selected.
races_df_final = races_df_renamed.select(
    "race_id",
    "race_year",
    "round",
    "circuit_id",
    "name",
    "ingestion_date",
    "race_timestamp",
)

#### Step 6 - Write to datalake as parquet

In [None]:
# Write the final DataFrame as parquet, partitioned by race_year, overwriting
# any existing output. The path is absolute (leading "/") so it points at the
# same /mnt/formula1lgdl mount the raw file was read from, rather than being
# resolved as a relative path.
races_df_final.write.mode("overwrite").partitionBy("race_year").parquet(
    "/mnt/formula1lgdl/processed/races"
)