# Ingest the lap_times folder

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import current_timestamp

#### Step 1 - Read the set of csv files

In [None]:
# Schema for the lap-time records: six columns, of which only raceId is
# declared non-nullable.
# NOTE(review): file-based Spark readers may relax nullable=False to nullable
# on load - confirm if non-null enforcement on raceId is actually required.
laptimes_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("driverId", IntegerType(), True)
    .add("lap", IntegerType(), True)
    .add("position", IntegerType(), True)
    .add("time", StringType(), True)
    .add("milliseconds", IntegerType(), True)
)

# The path below is a folder rather than a single file; the reader is pointed
# at the directory and applies laptimes_schema to what it loads.
# A wildcard pattern also works here, e.g. lap_times_split*.csv
laptimes_df = spark.read.csv(
    "/mnt/formula1lgdl/raw/lap_times",
    schema=laptimes_schema,
)

#### Step 2 - Rename columns and add new columns

In [None]:
# Switch the two id columns to snake_case, then append an ingestion_date
# column populated with current_timestamp().
final_df = (
    laptimes_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("ingestion_date", current_timestamp())
)

#### Step 3 - Write the output to the processed container in parquet format

In [None]:
# Save the result as parquet under the processed path; "overwrite" mode
# replaces any data already present at that location.
final_df.write.parquet(
    "/mnt/formula1lgdl/processed/lap_times",
    mode="overwrite",
)