In [1]:
from pyspark.sql import SparkSession

# Entry point for all DataFrame work in this notebook: build the session with
# the 'local[1]' master and the 'F1-Racing-DE' app name, or pick up the one
# that is already active.
spark = SparkSession.builder.master('local[1]').appName('F1-Racing-DE').getOrCreate()

23/05/09 21:51:32 WARN Utils: Your hostname, MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.6 instead (on interface en0)
23/05/09 21:51:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/09 21:51:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Race Results

In [7]:
from pyspark.sql.functions import col, current_timestamp

In [3]:
# Read the Parquet datasets under data/processed that are combined into the
# race results table below.
races_df = spark.read.parquet('data/processed/races')
circuits_df = spark.read.parquet('data/processed/circuits')
drivers_df = spark.read.parquet('data/processed/drivers')
constructors_df = spark.read.parquet('data/processed/constructors')
results_df = spark.read.parquet('data/processed/results')

                                                                                

In [28]:
# Race columns used downstream; `name` and `race_timestamp` are renamed so they
# stay distinguishable once races are joined with the other tables.
races_final_df = races_df.select(
    'race_id',
    'circuit_id',
    'race_year',
    races_df['name'].alias('race_name'),
    races_df['race_timestamp'].alias('race_date'),
)

# Only the join key and the circuit's location are carried forward.
circuits_final_df = circuits_df.select(
    'circuit_id', circuits_df['location'].alias('circuit_location')
)

# Driver attributes, each prefixed with `driver_` so the generic source names
# (`number`, `name`, `nationality`) do not collide with other tables' columns.
drivers_final_df = drivers_df.select(
    'driver_id',
    drivers_df['number'].alias('driver_number'),
    drivers_df['name'].alias('driver_name'),
    drivers_df['nationality'].alias('driver_nationality'),
)

# The constructor's `name` is exposed as `team` in the presentation tables.
constructors_final_df = constructors_df.select(
    'constructor_id', constructors_df['name'].alias('team')
)

# Per-result facts plus the three foreign keys (race, driver, constructor)
# used by the joins below; `time` is renamed to `race_time`.
results_final_df = results_df.select(
    'result_id', 'race_id', 'driver_id', 'constructor_id',
    'position', 'grid', 'fastest_lap', 'points',
    results_df['time'].alias('race_time'),
)

In [29]:
# Attach each race's circuit location.
# Joining on the column *name* (rather than an equality expression between the
# two frames) performs the same inner equi-join but keeps a single `circuit_id`
# column in the result, so the joined frame never carries two identically
# named columns that would be ambiguous to reference later.
race_circuits_df = races_final_df.join(
    circuits_final_df,
    on='circuit_id',
    how='inner'
)

In [30]:
# Enrich every result row with its race/circuit, driver and constructor.
# Each join is an inner equi-join on a shared key column. Passing the key by
# name keeps only one copy of `race_id`, `driver_id` and `constructor_id` in
# the output (an expression join would leave duplicates that are ambiguous to
# reference), and `how='inner'` is spelled out to match the race/circuit join.
race_results_df = (
    results_final_df
    .join(race_circuits_df, on='race_id', how='inner')
    .join(drivers_final_df, on='driver_id', how='inner')
    .join(constructors_final_df, on='constructor_id', how='inner')
)


In [35]:
# Shape the presentation table: keep the descriptive and result columns (the
# surrogate id columns are dropped) and stamp each row with the load time.
race_results_final_df = (
    race_results_df
    .select([
        'race_year', 'race_name', 'race_date', 'circuit_location',
        'driver_name', 'driver_number', 'driver_nationality', 'team',
        'grid', 'position', 'fastest_lap', 'race_time', 'points',
    ])
    .withColumn('created_date', current_timestamp())
)

In [48]:
# Persist the race results as Parquet, replacing any previous output at this path.
race_results_final_df.write.parquet('data/presentation/race_results', mode='overwrite')

                                                                                

### Driver Standings

In [54]:
from pyspark.sql.functions import sum, when, count, col

In [51]:
# Load the persisted race results. Note that this rebinds `race_results_df`,
# which earlier in the notebook held the joined (pre-selection) frame.
race_results_df = spark.read.parquet('data/presentation/race_results')

In [67]:
# Season totals per driver (and the team they scored for).
# `sum` and `count` are the column aggregates imported from
# pyspark.sql.functions in the cell above, not the Python builtins.
driver_standings_df = (
    race_results_df
    .groupBy(['race_year', 'driver_name', 'driver_nationality', 'team'])
    .agg(
        sum(col('points')).alias('total_points'),
        # `when` without `otherwise` is NULL for non-matching rows and `count`
        # ignores NULLs, so only rows with position == 1 are counted.
        count(when(col('position') == 1, True)).alias('wins'),
    )
)

In [77]:
# Persist the driver standings as Parquet, replacing any previous output.
driver_standings_df.write.parquet('data/presentation/driver_standings', mode='overwrite')

### Constructor Standings

In [78]:
from pyspark.sql.functions import sum, when, count, col

In [79]:
# Load the persisted race results as the input for the constructor standings.
race_results_df = spark.read.parquet('data/presentation/race_results')

In [89]:
# Season totals per team.
# `sum` and `count` are the column aggregates imported from
# pyspark.sql.functions in the cell above, not the Python builtins.
constructor_standings_df = (
    race_results_df
    .groupBy(['race_year', 'team'])
    .agg(
        sum(col('points')).alias('total_points'),
        # `when` without `otherwise` is NULL for non-matching rows and `count`
        # ignores NULLs, so only rows with position == 1 are counted.
        count(when(col('position') == 1, True)).alias('wins'),
    )
)

In [90]:
# Persist the constructor standings as Parquet, replacing any previous output.
constructor_standings_df.write.parquet('data/presentation/constructor_standings', mode='overwrite')