In [None]:
from pyspark.sql import SparkSession

def fetch_f1_bronze_layer_incremental(spark, bronze_path="/content/drive/MyDrive/Capstone/bronze/",
                                     checkpoint_path="/content/drive/MyDrive/Capstone/checkpoint/bronze"):
    """
    Build the bronze layer for F1 data with incremental loading support.

    Race results are downloaded season by season from the Jolpica (Ergast
    compatible) API and stored as one JSON document per race under
    ``<bronze_path>/season=<year>``. After each season is written, a row is
    appended to the parquet checkpoint at ``checkpoint_path``.

    Args:
        spark: Active SparkSession used for reading/writing the checkpoint
            and for writing the bronze files.
        bronze_path: Root directory of the bronze layer.
        checkpoint_path: Parquet location holding the processing checkpoints.

    Returns:
        The list of seasons that were considered during this run (including
        seasons that were skipped or returned no data).

    Notes:
        * The current season is always refreshed. Its existing directory is
          only removed once replacement data has been fetched successfully,
          so a failed download never destroys previously stored data.
        * A season whose download is incomplete (any page failing after all
          retries) is not written at all rather than persisted partially.
    """
    from datetime import datetime
    import json
    import time
    import requests

    # Upper bound for a single HTTP call so a stalled connection cannot hang the run.
    request_timeout_s = 30

    current_year = datetime.now().year  # Get once at start
    try:
        checkpoint_df = spark.read.parquet(checkpoint_path)
        last_checkpoint = checkpoint_df.orderBy("timestamp", ascending=False).first()
        last_processed_season = last_checkpoint.last_processed_season
        print(f"Last processed season from checkpoint: {last_processed_season}")
        seasons_to_process = list(range(last_processed_season + 1, current_year + 1))
    except Exception:
        # Missing/unreadable/empty checkpoint: fall back to a full load.
        # (`Exception` instead of a bare except so Ctrl-C still interrupts.)
        seasons_to_process = list(range(1950, current_year + 1))
        print("No checkpoint found - processing all seasons from 1950")

    # Force include current season (even if already processed)
    if current_year not in seasons_to_process:
        seasons_to_process.append(current_year)
        seasons_to_process = sorted(seasons_to_process)
        print(f"Added current season {current_year} to processing list")

    print(f"Processing seasons: {seasons_to_process}")

    class RateLimiter:
        """Client-side limiter enforcing a per-second burst and an hourly cap."""

        def __init__(self, burst_limit=4, hourly_limit=500):
            self.burst_limit = burst_limit
            self.hourly_limit = hourly_limit
            self.request_timestamps = []

        def wait_if_needed(self):
            """Block until another request may be sent, then record it."""
            while True:
                now = time.time()
                # Only requests from the last hour count towards the hourly cap.
                self.request_timestamps = [ts for ts in self.request_timestamps if now - ts < 3600]
                if len(self.request_timestamps) < self.hourly_limit:
                    break
                oldest = min(self.request_timestamps)
                sleep = 3600 - (now - oldest) + 1
                print(f"Hourly limit reached. Sleeping {sleep:.1f}s")
                time.sleep(sleep)

            recent = [ts for ts in self.request_timestamps if now - ts < 1]
            if len(recent) >= self.burst_limit:
                time.sleep(1)
                # Re-read the clock: the request is sent after the sleep, and a
                # stale timestamp would under-count requests in later windows.
                now = time.time()

            self.request_timestamps.append(now)

    rate_limiter = RateLimiter()

    def make_api_request(url, params, retries=3):
        """GET ``url`` and return the decoded JSON body, or None after ``retries`` failures."""
        for attempt in range(retries):
            rate_limiter.wait_if_needed()
            try:
                response = requests.get(url, params=params, timeout=request_timeout_s)
                if response.status_code == 200:
                    return response.json()
                elif response.status_code == 429:
                    wait = min(30, (2 ** attempt) + 1)
                    print(f"Rate limited. Waiting {wait}s")
                    time.sleep(wait)
                else:
                    print(f"HTTP {response.status_code}. Retry {attempt+1}/{retries}")
                    time.sleep(1)
            except (requests.RequestException, ValueError) as e:
                # RequestException: network/timeout problems; ValueError: body is not valid JSON.
                print(f"Error: {str(e)}. Retry {attempt+1}/{retries}")
                time.sleep(1)
        return None

    def merge_split_races(races):
        """Combine entries of the same round, concatenating their ``Results``.

        NOTE(review): the results endpoint appears to paginate by result row,
        so one race can be returned on two consecutive pages, each holding a
        part of its ``Results`` list - confirm against the API documentation.
        """
        by_round = {}
        merged = []
        for race in races:
            round_key = race.get("round")
            if round_key is not None and round_key in by_round:
                by_round[round_key].setdefault("Results", []).extend(race.get("Results", []))
                continue
            if round_key is not None:
                by_round[round_key] = race
            merged.append(race)
        return merged

    def fetch_season_races(season):
        """Fetch and enrich races for a season.

        Returns a list of race dicts (possibly empty), or None when the
        season could not be downloaded completely.
        """
        print(f"Fetching season {season}")
        url = f"https://api.jolpi.ca/ergast/f1/{season}/results/"

        # Get initial count
        initial = make_api_request(url, {"limit": 1, "offset": 0})
        if not initial or "MRData" not in initial:
            return None

        total = int(initial["MRData"].get("total", 0))
        limit = 100
        all_races = []

        for offset in range(0, total, limit):
            result = make_api_request(url, {"limit": limit, "offset": offset})
            if not result or "MRData" not in result or "RaceTable" not in result["MRData"]:
                # Persisting a partial season would make it look complete on
                # later runs, so give up on the whole season instead.
                print(f"Failed to fetch offset {offset} for season {season}; discarding partial data")
                return None
            all_races.extend(result["MRData"]["RaceTable"].get("Races", []))

        # Add metadata to each race
        ingestion_ts = datetime.now().isoformat()
        enriched = merge_split_races(all_races)
        for race in enriched:
            race["season"] = season
            race["source"] = "jolpica_api"
            race["ingestion_timestamp"] = ingestion_ts

        return enriched

    def resolve_path(path):
        """Return the Hadoop (FileSystem, Path) pair for ``path``."""
        hadoop_path = spark._jvm.org.apache.hadoop.fs.Path(path)
        fs = hadoop_path.getFileSystem(spark._jsc.hadoopConfiguration())
        return fs, hadoop_path

    # Avoid a double slash when bronze_path already ends with "/".
    bronze_root = bronze_path.rstrip("/")

    # Process seasons
    for season in seasons_to_process:
        print(f"\nProcessing season {season}")
        season_dir = f"{bronze_root}/season={season}"
        is_current_season = season == current_year

        if not is_current_season:
            # Skip existing non-current seasons
            try:
                fs, hadoop_path = resolve_path(season_dir)
                if fs.exists(hadoop_path):
                    print(f"Skipping existing non-current season {season}")
                    continue
            except Exception as e:
                # Best effort: if existence cannot be determined, try to process the season.
                print(f"Could not check for existing data for season {season}: {str(e)}")

        # Fetch data
        races = fetch_season_races(season)
        if not races:
            # NOTE(review): a season that fails here is not retried once a later
            # season advances the checkpoint - confirm whether that is acceptable.
            print(f"No data for season {season}")
            continue

        if is_current_season:
            print(f"Force refreshing current season {season}")
            # Delete the old data only now that the replacement is in hand.
            try:
                fs, hadoop_path = resolve_path(season_dir)
                if fs.exists(hadoop_path):
                    print(f"Deleting existing data for season {season}")
                    fs.delete(hadoop_path, True)  # recursive delete
            except Exception as e:
                print(f"Error deleting directory: {str(e)}")

        # Write using Spark: one JSON document per line, single output file
        sc = spark.sparkContext
        rdd = sc.parallelize([json.dumps(race) for race in races])
        rdd.coalesce(1).saveAsTextFile(season_dir)

        # Update checkpoint
        checkpoint_data = [{
            "timestamp": datetime.now().isoformat(),
            "last_processed_season": season,
            "records_processed": len(races)
        }]
        spark.createDataFrame(checkpoint_data).write.mode("append").parquet(checkpoint_path)

        print(f"Saved {len(races)} races for season {season}")

    print("Bronze layer update complete")
    return seasons_to_process

# Obtain (or reuse) the notebook's Spark session, then run the bronze load
# with its default bronze and checkpoint locations.
spark = (
    SparkSession.builder
    .appName("F1MedallionPipeline")
    .getOrCreate()
)
fetch_f1_bronze_layer_incremental(spark)

Last processed season from checkpoint: 2025
Added current season 2025 to processing list
Processing seasons: [2025]

Processing season 2025
Force refreshing current season 2025
Deleting existing data for season 2025
Fetching season 2025
Saved 4 races for season 2025
Bronze layer update complete


[2025]

In [None]:
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.sql import types as T
from datetime import datetime
import json
import hashlib
import uuid

class F1SilverLayer:
    """Incremental bronze -> silver processor for F1 race results.

    Reads the race-per-row bronze JSON, flattens it to one row per race
    result, adds data-quality flags, and appends the outcome to a parquet
    silver table partitioned by season. Per-batch quality metrics and a
    checkpoint (highest ingestion timestamp processed) are written as well.
    """

    def __init__(self,
                 spark,
                 bronze_path="/content/drive/MyDrive/Capstone/bronze/",
                 silver_path="/content/drive/MyDrive/Capstone/silver/",
                 silver_checkpoint_path="/content/drive/MyDrive/Capstone/checkpoint/silver"):
        """
        Initialize the F1 Silver Layer processor

        Args:
            spark: Active SparkSession.
            bronze_path: Root directory containing ``season=<year>`` bronze folders.
            silver_path: Output location of the silver parquet table.
            silver_checkpoint_path: Parquet location of the silver checkpoints.
        """
        self.spark = spark
        self.bronze_path = bronze_path
        self.silver_path = silver_path
        self.silver_checkpoint_path = silver_checkpoint_path

        # Configure Spark for optimal performance
        self.spark.conf.set("spark.sql.parquet.compression.codec", "zstd")
        self.spark.conf.set("spark.sql.parquet.enableDictionaryEncoding", "true")
        self.spark.conf.set("spark.sql.parquet.block.size", 256*1024*1024)

        # Define schema for better control and performance
        self.define_schemas()

    def define_schemas(self):
        """Define explicit schemas for the silver layer"""
        self.circuit_schema = T.StructType([
            T.StructField("circuitId", T.StringType(), True),
            T.StructField("circuitName", T.StringType(), True),
            T.StructField("lat", T.DoubleType(), True),
            T.StructField("long", T.DoubleType(), True),
            T.StructField("locality", T.StringType(), True),
            T.StructField("country", T.StringType(), True)
        ])

        self.result_schema = T.StructType([
            T.StructField("constructorId", T.StringType(), True),
            T.StructField("constructorName", T.StringType(), True),
            T.StructField("driverId", T.StringType(), True),
            T.StructField("driverName", T.StringType(), True),
            T.StructField("position", T.IntegerType(), True),
            T.StructField("points", T.DoubleType(), True),
            T.StructField("grid", T.IntegerType(), True),
            T.StructField("laps", T.IntegerType(), True),
            T.StructField("status", T.StringType(), True),
            T.StructField("time", T.StringType(), True)
        ])

    def get_last_processed_timestamp(self):
        """Get the last processed timestamp from checkpoint

        Returns:
            The ``processed_timestamp`` of the newest checkpoint row, or the
            string ``"1900-01-01T00:00:00"`` when no usable checkpoint exists
            (so that every bronze record counts as new).
        """
        default_timestamp = "1900-01-01T00:00:00"
        try:
            checkpoint_df = self.spark.read.parquet(self.silver_checkpoint_path)
            last_checkpoint = checkpoint_df.orderBy(F.col("processed_timestamp").desc()).first()
        except Exception as exc:
            # Deliberately best-effort (first run, empty folder, ...), but say
            # so instead of failing silently; Ctrl-C is no longer swallowed.
            print(f"No usable silver checkpoint ({type(exc).__name__}); processing all bronze data")
            return default_timestamp

        if last_checkpoint is None or last_checkpoint.processed_timestamp is None:
            return default_timestamp
        return last_checkpoint.processed_timestamp

    def process_bronze_data(self):
        """Process bronze data incrementally

        Returns:
            The transformed silver DataFrame, or None when there is no bronze
            record newer than the last checkpoint.
        """
        # Read bronze data
        bronze_df = self.spark.read.json(f"{self.bronze_path}/season=*")

        # Get last processed timestamp
        last_processed = self.get_last_processed_timestamp()

        # Filter for new records
        incremental_df = bronze_df.filter(
            F.col("ingestion_timestamp") > last_processed
        )

        # Fetching a single row is enough to know whether anything is new.
        if not incremental_df.head(1):
            print("No new data to process")
            return None

        return self.transform_bronze_to_silver(incremental_df)

    def transform_bronze_to_silver(self, df):
        """Transform bronze data into silver format with quality checks

        Args:
            df: Bronze DataFrame with one row per race and a ``Results`` array.

        Returns:
            DataFrame with one row per race result, flattened circuit /
            constructor / driver fields, quality flags and batch metadata.
        """
        # 1. Explode nested structures
        df = df.select(
            F.col("season"),
            F.col("round"),
            F.col("raceName"),
            F.col("date"),
            F.col("time"),
            F.to_timestamp(F.col("ingestion_timestamp")).alias("ingestion_timestamp"),
            F.col("url"),
            F.col("Circuit").alias("circuit"),
            F.explode("Results").alias("result")
        )

        # 2. Flatten nested structures with explicit casting
        df = df.select(
            "*",
            F.col("circuit.circuitId").alias("circuit_id"),
            F.col("circuit.circuitName").alias("circuit_name"),
            F.col("circuit.Location.lat").cast("double").alias("circuit_lat"),
            F.col("circuit.Location.long").cast("double").alias("circuit_long"),
            F.col("circuit.Location.locality").alias("circuit_locality"),
            F.col("circuit.Location.country").alias("circuit_country"),
            F.col("result.Constructor.constructorId").alias("constructor_id"),
            F.col("result.Constructor.name").alias("constructor_name"),
            F.col("result.Driver.driverId").alias("driver_id"),
            F.col("result.Driver.givenName").alias("driver_given_name"),
            F.col("result.Driver.familyName").alias("driver_family_name"),
            F.col("result.position").cast("integer").alias("position"),
            F.col("result.points").cast("double").alias("points"),
            F.col("result.grid").cast("integer").alias("grid"),
            F.col("result.laps").cast("integer").alias("laps"),
            F.col("result.status").alias("status"),
            F.col("result.Time.time").alias("finish_time")
        ).drop("circuit", "result")

        # 3. Data type conversions and standardization
        # NOTE(review): races without a `time` value presumably end up with a
        # null race_timestamp (and thus is_valid_date = false) - confirm intent.
        df = df.withColumn("race_timestamp",
                          F.to_timestamp(
                              F.concat(F.col("date"), F.lit(" "), F.col("time")),
                              "yyyy-MM-dd HH:mm:ssX"
                          ))

        # 4. Add computed columns
        df = df.withColumn("driver_full_name",
                          F.concat(F.col("driver_given_name"), F.lit(" "), F.col("driver_family_name")))

        # 5. Generate hash key for change detection
        columns_for_hash = ["season", "round", "driver_id", "constructor_id", "position"]
        df = df.withColumn("row_hash",
                          F.sha2(F.concat_ws("|", *[F.col(c) for c in columns_for_hash]), 256))

        # 6. Add data quality columns
        df = self.add_data_quality_checks(df)

        # 7. Add metadata columns
        df = df.withColumn("processed_timestamp", F.current_timestamp())
        df = df.withColumn("silver_batch_id", F.lit(str(uuid.uuid4())))  # Using Python UUID instead of Spark UUID

        return df

    def add_data_quality_checks(self, df):
        """Add data quality check columns

        Adds ``is_valid_position``, ``is_valid_points``, ``is_valid_grid``,
        ``is_valid_date`` and the combined ``is_valid_record`` flag, followed
        by an ``is_null_<column>`` 0/1 indicator for every column present at
        that point (including the validity flags themselves).
        """
        # Define quality checks
        df = df.withColumn("is_valid_position",
                          (F.col("position").isNotNull() & (F.col("position") >= 1)))

        df = df.withColumn("is_valid_points",
                          (F.col("points").isNotNull() & (F.col("points") >= 0)))

        df = df.withColumn("is_valid_grid",
                          (F.col("grid").isNotNull() & (F.col("grid") >= 0)))

        df = df.withColumn("is_valid_date",
                          F.col("race_timestamp").isNotNull())

        # Combine all checks
        df = df.withColumn("is_valid_record",
                          F.col("is_valid_position") &
                          F.col("is_valid_points") &
                          F.col("is_valid_grid") &
                          F.col("is_valid_date"))

        # Null indicators: one select instead of one withColumn per column,
        # which keeps the logical plan small.
        null_flags = [
            F.when(F.col(column).isNull(), 1).otherwise(0).alias(f"is_null_{column}")
            for column in df.columns
        ]
        return df.select("*", *null_flags)

    def write_to_silver(self, df):
        """Write processed data to silver layer

        Appends ``df`` to the silver table, appends the batch quality metrics
        and finally records a checkpoint. Does nothing for None / empty input.
        """
        if df is None or not df.head(1):
            return

        # Write main silver table
        (df.write
         .mode("append")
         .partitionBy("season")
         .format("parquet")
         .option("compression", "zstd")
         .save(self.silver_path))

        # Write quality metrics
        # NOTE(review): with a silver_path ending in "/" this resolves to
        # "<silver_path>/_metrics", i.e. a folder inside the silver table
        # directory - confirm that this location is intended.
        quality_metrics = self.calculate_quality_metrics(df)
        (quality_metrics.write
         .mode("append")
         .format("parquet")
         .save(f"{self.silver_path}_metrics"))

        # Update checkpoint
        self.update_checkpoint(df)

    @staticmethod
    def _build_metric_rows(null_counts, total_count, invalid_count, batch_id, calculated_at):
        """Turn raw batch counts into metric records.

        Every ``metric_value`` is a float: mixing ints and floats in this
        column is what made schema inference fail with CANNOT_MERGE_TYPE.
        Percentages are 0.0 for an empty batch.
        """
        def percentage(part):
            return (part / total_count) * 100.0 if total_count > 0 else 0.0

        def record(name, value):
            return {
                "metric_name": name,
                "metric_value": float(value),
                "batch_id": batch_id,
                "calculated_at": calculated_at,
            }

        rows = [record(f"null_percentage_{column}", percentage(nulls))
                for column, nulls in null_counts.items()]
        rows.append(record("total_records", total_count))
        rows.append(record("invalid_records_percentage", percentage(invalid_count)))
        return rows

    def calculate_quality_metrics(self, df):
        """Calculate quality metrics for the batch

        Produces one ``null_percentage_<column>`` metric per column plus
        ``total_records`` and ``invalid_records_percentage``.

        Args:
            df: Silver DataFrame containing ``is_valid_record`` and
                ``silver_batch_id``.

        Returns:
            DataFrame with columns batch_id, calculated_at, metric_name and
            metric_value (double).
        """
        columns = df.columns

        # All counts are gathered in ONE aggregation instead of several Spark
        # jobs per column. Index-based aliases avoid clashes with real names.
        aggregates = [
            F.count(F.lit(1)).alias("__total"),
            F.first("silver_batch_id").alias("__batch_id"),
            F.sum((~F.col("is_valid_record")).cast("long")).alias("__invalid"),
        ]
        aggregates.extend(
            F.sum(F.col(column).isNull().cast("long")).alias(f"__nulls_{index}")
            for index, column in enumerate(columns)
        )
        stats = df.agg(*aggregates).collect()[0]

        # SUM over zero rows yields NULL, hence the `or 0` fallbacks.
        null_counts = {
            column: stats[f"__nulls_{index}"] or 0
            for index, column in enumerate(columns)
        }
        rows = self._build_metric_rows(
            null_counts,
            stats["__total"],
            stats["__invalid"] or 0,
            stats["__batch_id"],
            datetime.now().isoformat(),
        )

        # Explicit schema: no type inference, so the result does not depend on
        # which Python number type a metric happens to have.
        metrics_schema = T.StructType([
            T.StructField("batch_id", T.StringType(), True),
            T.StructField("calculated_at", T.StringType(), True),
            T.StructField("metric_name", T.StringType(), True),
            T.StructField("metric_value", T.DoubleType(), True),
        ])
        data = [
            (row["batch_id"], row["calculated_at"], row["metric_name"], row["metric_value"])
            for row in rows
        ]
        return self.spark.createDataFrame(data, metrics_schema)

    def update_checkpoint(self, df):
        """Update the checkpoint with latest processed timestamp

        Appends one row holding the maximum ``ingestion_timestamp`` of the
        batch, its batch id, its record count and the wall-clock time.
        """
        # One aggregation instead of three separate Spark actions.
        stats = df.agg(
            F.max("ingestion_timestamp").alias("max_ingestion"),
            F.first("silver_batch_id").alias("batch_id"),
            F.count(F.lit(1)).alias("record_count"),
        ).collect()[0]

        checkpoint_data = [{
            "processed_timestamp": stats["max_ingestion"],
            "silver_batch_id": stats["batch_id"],
            "record_count": stats["record_count"],
            "checkpoint_timestamp": datetime.now().isoformat()
        }]

        (self.spark.createDataFrame(checkpoint_data)
         .write
         .mode("append")
         .parquet(self.silver_checkpoint_path))

    def process(self):
        """Main processing method

        Runs the incremental bronze -> silver step end to end. Any error is
        printed and re-raised.
        """
        try:
            print("Starting Silver Layer Processing...")

            # Process bronze to silver
            silver_df = self.process_bronze_data()

            if silver_df is None:
                print("No new data to process")
                return

            # The batch is consumed by several actions (count, table write,
            # metrics, checkpoint); cache it so bronze is not re-read and
            # re-transformed for each of them.
            silver_df = silver_df.cache()
            try:
                print(f"Writing {silver_df.count()} records to silver layer...")
                self.write_to_silver(silver_df)
                print("Silver Layer Processing Complete!")
            finally:
                silver_df.unpersist()

        except Exception as e:
            print(f"Error in Silver Layer Processing: {str(e)}")
            raise

# Usage
if __name__ == "__main__":
    # Reuse the active Spark session if there is one, otherwise create it.
    spark = (
        SparkSession.builder
        .appName("F1SilverLayer")
        .getOrCreate()
    )

    # On Colab with the data stored on Google Drive, mount the drive first:
    # from google.colab import drive
    # drive.mount('/content/drive')

    # Run one incremental bronze -> silver pass with the default paths.
    silver_layer = F1SilverLayer(spark)
    silver_layer.process()

Starting Silver Layer Processing...
Writing 25474 records to silver layer...
Error in Silver Layer Processing: [CANNOT_MERGE_TYPE] Can not merge type `DoubleType` and `LongType`.


PySparkTypeError: [CANNOT_MERGE_TYPE] Can not merge type `DoubleType` and `LongType`.

In [None]:
# Load every season folder of the sample bronze data, preview a few rows,
# then replace the sample silver folder with the same data as parquet.
df = spark.read.json('/content/sample_data/bronze/season=*')
df.show(5)
(
    df.write
    .mode("overwrite")
    .parquet('/content/sample_data/silver/')
)

In [None]:
# Check if the bronze data exists: list the sample bronze directory
# (IPython shell escape; runs `ls` on the notebook host).
!ls -l /content/sample_data/bronze/

In [None]:
# Create the sample silver and silver-checkpoint directories
# (`-p`: no error if they already exist).
!mkdir -p /content/sample_data/silver/
!mkdir -p /content/sample_data/silver_checkpoint/

In [None]:
# Read all sample bronze season folders as JSON (schema is inferred).
bronze_df = (
    spark.read
    .json('/content/sample_data/bronze/season=*')
)


In [None]:
# Create the Spark session for the pipeline, or reuse the active one.
spark = (
    SparkSession.builder
    .appName("F1MedallionPipeline")
    .getOrCreate()
)


In [None]:
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.sql import types as T
from datetime import datetime
import json

# Plain Spark session (no Delta Lake configuration); an already running
# session is reused by getOrCreate().
spark = (
    SparkSession.builder
    .appName("F1SilverLayer")
    .getOrCreate()
)

# Build the silver layer in one pass from the bronze JSON files
def process_f1_silver_layer():
    """Rebuild the silver layer from all bronze data in one pass.

    Reads every ``season=*`` folder of the bronze JSON, flattens it to one
    row per race result, adds validity flags, prints data-quality metrics
    and overwrites the parquet output (partitioned by season).

    Uses the module-level ``spark`` session.

    Returns:
        The transformed silver DataFrame.

    Raises:
        Exception: any processing error is printed and re-raised.
    """
    try:
        print("Starting Silver Layer Processing...")

        # Read the bronze JSON files (one race per line)
        bronze_df = spark.read.json(
            "/content/drive/MyDrive/Capstone/bronze/season=*"
        )

        print("Bronze data loaded. Starting transformations...")

        # 1. Explode nested structures
        df = bronze_df.select(
            F.col("season"),
            F.col("round"),
            F.col("raceName"),
            F.col("date"),
            F.col("time"),
            F.col("url"),
            F.col("Circuit").alias("circuit"),
            F.explode("Results").alias("result")
        )

        # 2. Flatten nested structures. Numeric fields are cast explicitly,
        #    matching F1SilverLayer, so that silver files always carry the
        #    same column types regardless of which code path wrote them.
        df = df.select(
            "*",
            F.col("circuit.circuitId").alias("circuit_id"),
            F.col("circuit.circuitName").alias("circuit_name"),
            F.col("circuit.Location.lat").cast("double").alias("circuit_lat"),
            F.col("circuit.Location.long").cast("double").alias("circuit_long"),
            F.col("circuit.Location.locality").alias("circuit_locality"),
            F.col("circuit.Location.country").alias("circuit_country"),
            F.col("result.Constructor.constructorId").alias("constructor_id"),
            F.col("result.Constructor.name").alias("constructor_name"),
            F.col("result.Driver.driverId").alias("driver_id"),
            F.col("result.Driver.givenName").alias("driver_given_name"),
            F.col("result.Driver.familyName").alias("driver_family_name"),
            F.col("result.position").cast("integer").alias("position"),
            F.col("result.points").cast("double").alias("points"),
            F.col("result.grid").cast("integer").alias("grid"),
            F.col("result.laps").cast("integer").alias("laps"),
            F.col("result.status").alias("status"),
            F.col("result.Time.time").alias("finish_time")
        ).drop("circuit", "result")

        # 3. Data type conversions and standardization
        # NOTE(review): rows without a `time` value presumably get a null
        # race_timestamp and are therefore flagged invalid - confirm intent.
        df = df.withColumn("race_timestamp",
                          F.to_timestamp(
                              F.concat(F.col("date"), F.lit(" "), F.col("time")),
                              "yyyy-MM-dd HH:mm:ssX"
                          ))

        # 4. Add computed columns
        df = df.withColumn("driver_full_name",
                          F.concat(F.col("driver_given_name"),
                                 F.lit(" "),
                                 F.col("driver_family_name")))

        # 5. Data quality checks (the columns are already typed; a value that
        #    could not be cast is null and therefore fails its check)
        df = df.withColumn("is_valid_position",
                          (F.col("position").isNotNull() &
                           (F.col("position") >= 1)))

        df = df.withColumn("is_valid_points",
                          (F.col("points").isNotNull() &
                           (F.col("points") >= 0)))

        df = df.withColumn("is_valid_grid",
                          (F.col("grid").isNotNull() &
                           (F.col("grid") >= 0)))

        df = df.withColumn("is_valid_date",
                          F.col("race_timestamp").isNotNull())

        # Combine all checks
        df = df.withColumn("is_valid_record",
                          F.col("is_valid_position") &
                          F.col("is_valid_points") &
                          F.col("is_valid_grid") &
                          F.col("is_valid_date"))

        # 6. Add metadata
        df = df.withColumn("processed_timestamp", F.current_timestamp())

        # 7. Calculate quality metrics in a single aggregation instead of one
        #    Spark job per column. Index-based aliases avoid name clashes.
        columns = df.columns
        aggregates = [
            F.count(F.lit(1)).alias("__total"),
            F.sum(F.col("is_valid_record").cast("long")).alias("__valid"),
        ]
        aggregates.extend(
            F.sum(F.col(column).isNull().cast("long")).alias(f"__nulls_{index}")
            for index, column in enumerate(columns)
        )
        stats = df.agg(*aggregates).collect()[0]

        total_records = stats["__total"]
        valid_records = stats["__valid"] or 0  # SUM over zero rows is NULL
        null_percentages = {}
        for index, column in enumerate(columns):
            null_count = stats[f"__nulls_{index}"] or 0
            # Guard against an empty input instead of dividing by zero.
            null_percentages[column] = (null_count / total_records) * 100 if total_records else 0.0

        print("\nData Quality Metrics:")
        print(f"Total Records: {total_records}")
        print(f"Valid Records: {valid_records}")
        print(f"Invalid Records: {total_records - valid_records}")
        print("\nNull Percentages:")
        for col, pct in null_percentages.items():
            if pct > 0:
                print(f"{col}: {pct:.2f}%")

        # 8. Write to silver layer
        print("\nWriting to silver layer...")

        # Write main dataset (full overwrite, partitioned by season)
        silver_path = "/content/drive/MyDrive/Capstone/silver1"
        df.write.mode("overwrite") \
            .partitionBy("season") \
            .parquet(silver_path)

        print(f"\nSilver layer data written to: {silver_path}")
        print("Silver Layer Processing Complete!")

        return df

    except Exception as e:
        print(f"Error in Silver Layer Processing: {str(e)}")
        raise

# Run the full bronze -> silver rebuild and keep the resulting DataFrame
silver_df = process_f1_silver_layer()

# Preview the first rows of the result
print("\nSample of processed data:")
silver_df.show(n=5)

Starting Silver Layer Processing...
Bronze data loaded. Starting transformations...

Data Quality Metrics:
Total Records: 25474
Valid Records: 8366
Invalid Records: 17108

Null Percentages:
time: 67.16%
finish_time: 68.71%
race_timestamp: 67.16%

Writing to silver layer...

Silver layer data written to: /content/drive/MyDrive/Capstone/silver1
Silver Layer Processing Complete!

Sample of processed data:
+------+-----+------------------+----------+---------+--------------------+----------+--------------------+-----------+------------+----------------+---------------+--------------+----------------+--------------+-----------------+------------------+--------+------+----+----+--------+-----------+-------------------+----------------+-----------------+---------------+-------------+-------------+---------------+--------------------+
|season|round|          raceName|      date|     time|                 url|circuit_id|        circuit_name|circuit_lat|circuit_long|circuit_locality|circuit_coun

In [None]:
# Read all season partitions of the silver table and preview a few rows.
# NOTE(review): the recorded run of this cell failed with "Parquet column
# cannot be converted ... Column: [circuit_lat], Expected: string, Found:
# DOUBLE". Presumably files under silver/ were written with different types
# for circuit_lat; they would need to be rewritten with one consistent
# schema before this read can succeed - TODO confirm.
df = spark.read.parquet('/content/drive/MyDrive/Capstone/silver/season=*')
df.show(n=5)

Py4JJavaError: An error occurred while calling o275.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 109.0 failed 1 times, most recent failure: Lost task 0.0 in stage 109.0 (TID 517) (9d9453dfbdfb executor driver): org.apache.spark.SparkException: Parquet column cannot be converted in file file:///content/drive/MyDrive/Capstone/silver/season=2024/part-00000-b8276d09-4dc2-4133-872b-4ba68e15b3eb.c000.zstd.parquet. Column: [circuit_lat], Expected: string, Found: DOUBLE.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.unsupportedSchemaColumnConvertError(QueryExecutionErrors.scala:855)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:290)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:131)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:593)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException: column: [circuit_lat], physicalType: DOUBLE, logicalType: string
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.constructConvertNotSupportedException(ParquetVectorUpdaterFactory.java:1136)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.getUpdater(ParquetVectorUpdaterFactory.java:199)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:175)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:342)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:233)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:131)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:286)
	... 23 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4333)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3316)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4323)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4321)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4321)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3316)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3539)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Parquet column cannot be converted in file file:///content/drive/MyDrive/Capstone/silver/season=2024/part-00000-b8276d09-4dc2-4133-872b-4ba68e15b3eb.c000.zstd.parquet. Column: [circuit_lat], Expected: string, Found: DOUBLE.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.unsupportedSchemaColumnConvertError(QueryExecutionErrors.scala:855)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:290)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:131)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:593)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException: column: [circuit_lat], physicalType: DOUBLE, logicalType: string
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.constructConvertNotSupportedException(ParquetVectorUpdaterFactory.java:1136)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.getUpdater(ParquetVectorUpdaterFactory.java:199)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:175)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:342)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:233)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:131)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:286)
	... 23 more


In [None]:
# Collapse `df` to a single partition so the export lands in one CSV part
# file, replacing whatever is already in the target directory.
(
    df.repartition(1)
    .write
    .mode("overwrite")
    .csv('/content/sample_data/gold/')
)

In [None]:
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F
from pyspark.sql import types as T
from datetime import datetime
import os



# Process F1 Silver Layer
def process_f1_silver_layer(bronze_path="/content/drive/MyDrive/Capstone/bronze/season=*",
                            output_dir="/content/drive/MyDrive/Capstone/silver1"):
    """Build the F1 silver layer from the bronze parquet data.

    Reads the bronze race records with the module-level ``spark`` session,
    explodes ``Results`` into one row per result, flattens the circuit,
    constructor, driver and result structs into top-level columns, casts the
    numeric result fields, adds validity flags and a processing timestamp,
    prints data-quality metrics, and overwrites ``output_dir`` with the result
    partitioned by ``season``.

    Args:
        bronze_path: Path (globs allowed) of the bronze parquet data to read.
        output_dir: Directory the silver parquet dataset is written to.

    Returns:
        The transformed silver-layer DataFrame.

    Raises:
        Exception: Any error raised while reading, transforming or writing is
            printed together with its traceback and then re-raised.
    """
    try:
        print("Starting Silver Layer Processing...")

        # NOTE(review): the input is a glob over partition directories; confirm
        # whether a `basePath` read option is needed for `season` to be
        # discovered as a partition column.
        bronze_df = spark.read.parquet(bronze_path)

        print(f"Bronze data loaded with {bronze_df.count()} rows. Starting transformations...")

        # Display schema to understand the data
        print("Bronze data schema:")
        bronze_df.printSchema()

        # 1. Explode nested structures: one output row per entry in `Results`
        df = bronze_df.select(
            F.col("season"),
            F.col("round"),
            F.col("raceName"),
            F.col("date"),
            F.col("time"),
            F.col("url"),
            F.col("Circuit").alias("circuit"),
            F.explode("Results").alias("result")
        )

        # 2. Flatten nested structures, then drop the source structs
        df = df.select(
            "*",
            F.col("circuit.circuitId").alias("circuit_id"),
            F.col("circuit.circuitName").alias("circuit_name"),
            F.col("circuit.Location.lat").alias("circuit_lat"),
            F.col("circuit.Location.long").alias("circuit_long"),
            F.col("circuit.Location.locality").alias("circuit_locality"),
            F.col("circuit.Location.country").alias("circuit_country"),
            F.col("result.Constructor.constructorId").alias("constructor_id"),
            F.col("result.Constructor.name").alias("constructor_name"),
            F.col("result.Constructor.nationality").alias("constructor_nationality"),
            F.col("result.Driver.driverId").alias("driver_id"),
            F.col("result.Driver.givenName").alias("driver_given_name"),
            F.col("result.Driver.familyName").alias("driver_family_name"),
            F.col("result.Driver.nationality").alias("driver_nationality"),
            F.col("result.Driver.dateOfBirth").alias("driver_dob"),
            F.col("result.position").alias("position"),
            F.col("result.points").alias("points"),
            F.col("result.grid").alias("grid"),
            F.col("result.laps").alias("laps"),
            F.col("result.status").alias("status"),
            F.col("result.Time.time").alias("finish_time")
        ).drop("circuit", "result")

        # 3. Data type conversions and standardization
        df = df.withColumn("race_date", F.to_date(F.col("date"), "yyyy-MM-dd"))

        # `time` may be null, in which case the timestamp is built from the
        # date alone.
        df = df.withColumn("race_timestamp",
                          F.when(F.col("time").isNotNull(),
                                F.to_timestamp(F.concat(F.col("date"), F.lit(" "), F.col("time")),
                                              "yyyy-MM-dd HH:mm:ssX"))
                           .otherwise(F.to_timestamp(F.col("date"), "yyyy-MM-dd")))

        # Convert numeric columns to appropriate types
        df = df.withColumn("position", F.col("position").cast("integer"))
        df = df.withColumn("points", F.col("points").cast("double"))
        df = df.withColumn("grid", F.col("grid").cast("integer"))
        df = df.withColumn("laps", F.col("laps").cast("integer"))

        # 4. Add computed columns
        df = df.withColumn("driver_full_name",
                          F.concat(F.col("driver_given_name"),
                                 F.lit(" "),
                                 F.col("driver_family_name")))

        # 5. Data quality checks
        df = df.withColumn("is_valid_position",
                          (F.col("position").isNotNull() &
                           (F.col("position") >= 1)))

        df = df.withColumn("is_valid_points",
                          (F.col("points").isNotNull() &
                           (F.col("points") >= 0)))

        df = df.withColumn("is_valid_grid",
                          (F.col("grid").isNotNull() &
                           (F.col("grid") >= 0)))

        df = df.withColumn("is_valid_date",
                          F.col("race_date").isNotNull())

        # Combine all checks
        df = df.withColumn("is_valid_record",
                          F.col("is_valid_position") &
                          F.col("is_valid_points") &
                          F.col("is_valid_grid") &
                          F.col("is_valid_date"))

        # 6. Add metadata
        df = df.withColumn("processed_timestamp", F.current_timestamp())

        # 7. Calculate quality metrics.
        # Every count is gathered in ONE aggregation instead of one Spark job
        # per column. `count(when(cond, 1))` counts only rows where `cond` is
        # true. Results are read back by position, so the aliases only need to
        # be unique.
        column_names = df.columns
        metric_exprs = [
            F.count(F.lit(1)).alias("m_total"),
            F.count(F.when(F.col("is_valid_record"), 1)).alias("m_valid"),
        ]
        metric_exprs.extend(
            F.count(F.when(F.col(name).isNull(), 1)).alias(f"m_null_{idx}")
            for idx, name in enumerate(column_names)
        )
        metrics = df.agg(*metric_exprs).first()

        total_records = metrics[0]
        valid_records = metrics[1]
        # Guard against an empty dataset so the percentage never divides by 0.
        null_percentages = {
            name: (null_count / total_records) * 100 if total_records else 0.0
            for name, null_count in zip(column_names, metrics[2:])
        }

        print("\nData Quality Metrics:")
        print(f"Total Records: {total_records}")
        print(f"Valid Records: {valid_records}")
        print(f"Invalid Records: {total_records - valid_records}")
        print("\nNull Percentages:")
        for column_name, pct in null_percentages.items():
            if pct > 0:
                print(f"{column_name}: {pct:.2f}%")

        # 8. Write to silver layer
        print("\nWriting to silver layer...")

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Write main dataset
        silver_path = output_dir
        df.write.mode("overwrite") \
            .partitionBy("season") \
            .parquet(silver_path)

        print(f"\nSilver layer data written to: {silver_path}")
        print("Silver Layer Processing Complete!")

        return df

    except Exception as e:
        print(f"Error in Silver Layer Processing: {str(e)}")
        import traceback
        traceback.print_exc()
        raise

# Execute the processing
# Run the silver pipeline, preview the result, and rank drivers by points.
# Failures are reported with a traceback but deliberately not re-raised, so
# the notebook cell itself does not abort.
try:
    silver_df = process_f1_silver_layer()

    # Preview a few processed rows without truncating wide columns
    print("\nSample of processed data:")
    silver_df.show(5, truncate=False)

    # Total points per driver, highest first
    print("\nTop 5 drivers by points:")
    (
        silver_df.groupBy("driver_full_name")
        .agg(F.sum("points").alias("total_points"))
        .orderBy(F.desc("total_points"))
        .show(5)
    )
except Exception as exc:
    print(f"Error in execution: {exc}")
    import traceback
    traceback.print_exc()

Starting Silver Layer Processing...
Error in Silver Layer Processing: An error occurred while calling o25.parquet.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 76) (9d9453dfbdfb executor driver): org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:387)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:443)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInParallel$1(ParquetFileFormat.scala:493)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInParallel$1$adapted(ParquetFileFo

Traceback (most recent call last):
  File "<ipython-input-3-09f84070b798>", line 23, in process_f1_silver_layer
    bronze_df = spark.read.parquet(file_path)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pyspark/sql/readwriter.py", line 544, in parquet
    return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths)))
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1322, in __call__
    return_value = get_return_value(
                   ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pyspark/errors/exceptions/captured.py", line 179, in deco
    return f(*a, **kw)
           ^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o25.parquet.
: org.apache.spark.SparkExceptio