In [0]:
from pyspark.sql.types import StructField, StructType, IntegerType, StringType

In [0]:
# Explicit schema for the lap-times CSV input: six columns, in file order.
# Only `position` and `milliseconds` are declared nullable.
lap_times_schema = StructType([
    StructField('raceId', IntegerType(), nullable=False),
    StructField('driverId', IntegerType(), nullable=False),
    StructField('lap', IntegerType(), nullable=False),
    StructField('position', IntegerType(), nullable=True),
    StructField('time', StringType(), nullable=False),
    StructField('milliseconds', IntegerType(), nullable=True),
])

In [0]:
# Base directory for this notebook's data; the raw input path and the
# processed output path used in later cells are both built from it.
volume_dir = '/Volumes/formula1/default/f1_volume'

In [0]:
# Load the raw lap-times CSV data with the explicit schema rather than
# letting Spark infer one. No header option is set on the reader.
lap_times_raw_df = (
    spark.read
    .schema(lap_times_schema)
    .csv(f'{volume_dir}/raw/lap_times')
)

In [0]:
# Preview the raw DataFrame to check that the columns parsed as expected.
display(lap_times_raw_df)

In [0]:
# Print the raw row count, labelled with the expression that produced it.
print('lap_times_raw_df.count()=' + repr(lap_times_raw_df.count()))

In [0]:
%sql
-- Materialise the raw lap-times CSV files as the `lap_times` table.
-- CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE TABLE fails
-- once the table already exists.
-- The explicit schema mirrors lap_times_schema from the PySpark read, so the
-- table gets named, typed columns instead of inferred ones.
CREATE OR REPLACE TABLE lap_times AS
SELECT *
FROM read_files(
  '/Volumes/formula1/default/f1_volume/raw/lap_times',
  format => 'csv',
  header => 'false',
  schema => 'raceId INT, driverId INT, lap INT, `position` INT, `time` STRING, milliseconds INT'
);

In [0]:
%sql
-- Sanity check: number of rows in the lap_times table.
SELECT COUNT(*)
FROM lap_times;

In [0]:
# %sql
# DROP TABLE lap_times;

In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
# Rename the camelCase id columns to snake_case and add an `ingestion_date`
# column populated from current_timestamp().
lap_times_final_df = (
    lap_times_raw_df
    .withColumnsRenamed({
        'driverId': 'driver_id',
        'raceId': 'race_id',
    })
    .withColumn('ingestion_date', current_timestamp())
)

In [0]:
# Preview the transformed DataFrame (renamed id columns plus ingestion_date).
display(lap_times_final_df)

In [0]:
# Persist the final DataFrame as Parquet under processed/lap_times,
# overwriting whatever a previous run left there.
lap_times_final_df.write.parquet(
    f'{volume_dir}/processed/lap_times',
    mode='overwrite',
)

In [0]:
# List the Parquet output directory. The path is built from volume_dir (the
# same base the write cell uses) so the listing cannot drift from the actual
# output location if the base directory changes.
display(dbutils.fs.ls(f'{volume_dir}/processed/lap_times'))