# Ingest qualifying folder

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import current_timestamp

#### Step 1 - Read the set of multi-line JSON files

In [None]:
# Explicit schema for the qualifying JSON records. The third argument to
# ``add`` is the nullable flag: only ``qualifyId`` is declared non-nullable.
# q1/q2/q3 are read as plain strings.
qualifying_schema = (
    StructType()
    .add("qualifyId", IntegerType(), False)
    .add("raceId", IntegerType(), True)
    .add("driverId", IntegerType(), True)
    .add("constructorId", IntegerType(), True)
    .add("number", IntegerType(), True)
    .add("position", IntegerType(), True)
    .add("q1", StringType(), True)
    .add("q2", StringType(), True)
    .add("q3", StringType(), True)
)

# The path points at a folder rather than individual files, so everything
# under it is read into a single DataFrame.
qualifying_source_path = "/mnt/formula1lgdl/raw/qualifying"

# "multiline" lets a single JSON record span several lines in the input.
qualifying_reader = spark.read.schema(qualifying_schema).option("multiline", True)
qualifying_df = qualifying_reader.json(qualifying_source_path)

#### Step 2 - Rename columns and add new columns

In [None]:
# camelCase source column -> snake_case output column. Columns not listed
# here (number, position, q1, q2, q3) keep their original names.
column_renames = {
    "raceId": "race_id",
    "qualifyId": "qualify_id",
    "driverId": "driver_id",
    "constructorId": "constructor_id",
}

final_df = qualifying_df
for source_name, target_name in column_renames.items():
    final_df = final_df.withColumnRenamed(source_name, target_name)

# Add an ingestion_date column populated from current_timestamp().
final_df = final_df.withColumn("ingestion_date", current_timestamp())

#### Step 3 - Write the output to the processed container in parquet format

In [None]:
# Write the result as parquet; "overwrite" mode replaces any existing data
# at the target path.
(
    final_df.write
    .mode("overwrite")
    .parquet("/mnt/formula1lgdl/processed/qualifying")
)