In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.sql import types as T
from datetime import datetime
import json

# Initialize Spark Session (without Delta configurations)
spark = SparkSession.builder \
    .appName("F1SilverLayer") \
    .getOrCreate()

# First, let's read the parquet files and see what we're working with
def process_f1_silver_layer():
    try:
        print("Starting Silver Layer Processing...")

        # Read the parquet files
        bronze_df = spark.read.json(
            "/content/drive/MyDrive/Capstone/bronze/season=*"
        )

        print("Bronze data loaded. Starting transformations...")

        # 1. Explode nested structures
        df = bronze_df.select(
            F.col("season"),
            F.col("round"),
            F.col("raceName"),
            F.col("date"),
            F.col("time"),
            F.col("url"),
            F.col("Circuit").alias("circuit"),
            F.explode("Results").alias("result")
        )

        # 2. Flatten nested structures
        df = df.select(
            "*",
            F.col("circuit.circuitId").alias("circuit_id"),
            F.col("circuit.circuitName").alias("circuit_name"),
            F.col("circuit.Location.lat").alias("circuit_lat"),
            F.col("circuit.Location.long").alias("circuit_long"),
            F.col("circuit.Location.locality").alias("circuit_locality"),
            F.col("circuit.Location.country").alias("circuit_country"),
            F.col("result.Constructor.constructorId").alias("constructor_id"),
            F.col("result.Constructor.name").alias("constructor_name"),
            F.col("result.Driver.driverId").alias("driver_id"),
            F.col("result.Driver.givenName").alias("driver_given_name"),
            F.col("result.Driver.familyName").alias("driver_family_name"),
            F.col("result.position").alias("position"),
            F.col("result.points").alias("points"),
            F.col("result.grid").alias("grid"),
            F.col("result.laps").alias("laps"),
            F.col("result.status").alias("status"),
            F.col("result.Time.time").alias("finish_time")
        ).drop("circuit", "result")

        # 3. Data type conversions and standardization
        df = df.withColumn("race_timestamp",
                          F.to_timestamp(
                              F.concat(F.col("date"), F.lit(" "), F.col("time")),
                              "yyyy-MM-dd HH:mm:ssX"
                          ))

        # 4. Add computed columns
        df = df.withColumn("driver_full_name",
                          F.concat(F.col("driver_given_name"),
                                 F.lit(" "),
                                 F.col("driver_family_name")))

        # 5. Data quality checks
        df = df.withColumn("is_valid_position",
                          (F.col("position").isNotNull() &
                           F.col("position").cast("int").isNotNull() &
                           (F.col("position").cast("int") >= 1)))

        df = df.withColumn("is_valid_points",
                          (F.col("points").isNotNull() &
                           F.col("points").cast("double").isNotNull() &
                           (F.col("points").cast("double") >= 0)))

        df = df.withColumn("is_valid_grid",
                          (F.col("grid").isNotNull() &
                           F.col("grid").cast("int").isNotNull() &
                           (F.col("grid").cast("int") >= 0)))

        df = df.withColumn("is_valid_date",
                          F.col("race_timestamp").isNotNull())

        # Combine all checks
        df = df.withColumn("is_valid_record",
                          F.col("is_valid_position") &
                          F.col("is_valid_points") &
                          F.col("is_valid_grid") &
                          F.col("is_valid_date"))

        # 6. Add metadata
        df = df.withColumn("processed_timestamp", F.current_timestamp())

        # 7. Calculate quality metrics
        total_records = df.count()
        valid_records = df.filter(F.col("is_valid_record")).count()
        null_percentages = {}

        for column in df.columns:
            null_count = df.filter(F.col(column).isNull()).count()
            null_percentages[column] = (null_count / total_records) * 100

        print("\nData Quality Metrics:")
        print(f"Total Records: {total_records}")
        print(f"Valid Records: {valid_records}")
        print(f"Invalid Records: {total_records - valid_records}")
        print("\nNull Percentages:")
        for col, pct in null_percentages.items():
            if pct > 0:
                print(f"{col}: {pct:.2f}%")

        # 8. Write to silver layer
        print("\nWriting to silver layer...")

        # Write main dataset
        silver_path = "/content/drive/MyDrive/Capstone/silver"
        df.write.mode("overwrite") \
            .partitionBy("season") \
            .parquet(silver_path)

        print(f"\nSilver layer data written to: {silver_path}")
        print("Silver Layer Processing Complete!")

        return df

    except Exception as e:
        print(f"Error in Silver Layer Processing: {str(e)}")
        raise

# Execute the processing
silver_df = process_f1_silver_layer()

# Show sample of the processed data
print("\nSample of processed data:")
silver_df.show(5)

Starting Silver Layer Processing...
Bronze data loaded. Starting transformations...

Data Quality Metrics:
Total Records: 25514
Valid Records: 8406
Invalid Records: 17108

Null Percentages:
time: 67.05%
finish_time: 68.62%
race_timestamp: 67.05%

Writing to silver layer...

Silver layer data written to: /content/drive/MyDrive/Capstone/silver
Silver Layer Processing Complete!

Sample of processed data:
+------+-----+------------------+----------+---------+--------------------+----------+--------------------+-----------+------------+----------------+---------------+--------------+----------------+--------------+-----------------+------------------+--------+------+----+----+--------+-----------+-------------------+----------------+-----------------+---------------+-------------+-------------+---------------+--------------------+
|season|round|          raceName|      date|     time|                 url|circuit_id|        circuit_name|circuit_lat|circuit_long|circuit_locality|circuit_count