In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from pyspark.sql import SparkSession

def fetch_f1_bronze_layer_incremental(spark, bronze_path="/content/drive/MyDrive/Capstone/bronze/",
                                     checkpoint_path="/content/drive/MyDrive/Capstone/checkpoint/bronze"):
    """
    Build the bronze layer for F1 data with incremental loading support.

    Race results are pulled season by season from the Jolpica (Ergast-compatible)
    API and written as one JSON document per race under
    ``<bronze_path>/season=<year>``. A parquet checkpoint records the last
    season that was written so later runs only fetch what is missing; the
    current calendar year is always re-fetched.

    Args:
        spark: Active SparkSession used for reading/writing the checkpoint and
            for writing the bronze files.
        bronze_path: Root directory of the bronze layer.
        checkpoint_path: Directory of the parquet checkpoint table.

    Returns:
        The sorted list of seasons that were considered in this run
        (including seasons that were skipped or failed).
    """
    from datetime import datetime
    import json
    import time
    import requests

    current_year = datetime.now().year  # Get once at start

    # Read the most recent checkpoint row. Any failure (missing path, empty
    # table, unexpected schema) falls back to a full load, but the reason is
    # reported instead of being silently swallowed.
    last_processed_season = None
    try:
        checkpoint_df = spark.read.parquet(checkpoint_path)
        last_checkpoint = checkpoint_df.orderBy("timestamp", ascending=False).first()
        if last_checkpoint is not None:
            last_processed_season = int(last_checkpoint.last_processed_season)
    except Exception as e:
        print(f"Could not read checkpoint: {str(e)}")

    if last_processed_season is None:
        seasons_to_process = list(range(1950, current_year + 1))
        print("No checkpoint found - processing all seasons from 1950")
    else:
        print(f"Last processed season from checkpoint: {last_processed_season}")
        seasons_to_process = list(range(last_processed_season + 1, current_year + 1))

    # Force include current season (even if already processed)
    if current_year not in seasons_to_process:
        seasons_to_process.append(current_year)
        seasons_to_process = sorted(seasons_to_process)
        print(f"Added current season {current_year} to processing list")

    print(f"Processing seasons: {seasons_to_process}")

    class RateLimiter:
        """Client-side throttle: at most `burst_limit` requests per second
        and `hourly_limit` requests per rolling hour."""

        def __init__(self, burst_limit=4, hourly_limit=500):
            self.burst_limit = burst_limit
            self.hourly_limit = hourly_limit
            self.request_timestamps = []

        def wait_if_needed(self):
            """Block until a request may be sent, then record its timestamp."""
            while True:
                now = time.time()
                # Only the last hour matters for either limit.
                self.request_timestamps = [ts for ts in self.request_timestamps if now - ts < 3600]

                if len(self.request_timestamps) >= self.hourly_limit:
                    oldest = min(self.request_timestamps)
                    sleep = 3600 - (now - oldest) + 1
                    print(f"Hourly limit reached. Sleeping {sleep:.1f}s")
                    time.sleep(sleep)
                    continue

                recent = [ts for ts in self.request_timestamps if now - ts < 1]
                if len(recent) >= self.burst_limit:
                    time.sleep(1)
                    # Re-evaluate with a fresh clock so the timestamp recorded
                    # below is the real send time, not the pre-sleep time.
                    continue

                self.request_timestamps.append(now)
                return

    rate_limiter = RateLimiter()

    def make_api_request(url, params, retries=3, timeout=30):
        """GET `url` and return the decoded JSON body, or None after `retries` failed attempts."""
        for attempt in range(retries):
            rate_limiter.wait_if_needed()
            try:
                # A timeout keeps a stalled connection from hanging the whole run.
                response = requests.get(url, params=params, timeout=timeout)
                if response.status_code == 200:
                    return response.json()
                elif response.status_code == 429:
                    wait = min(30, (2 ** attempt) + 1)
                    print(f"Rate limited. Waiting {wait}s")
                    time.sleep(wait)
                else:
                    print(f"HTTP {response.status_code}. Retry {attempt+1}/{retries}")
                    time.sleep(1)
            except (requests.RequestException, ValueError) as e:
                # RequestException: network/timeout errors; ValueError: invalid JSON body.
                print(f"Error: {str(e)}. Retry {attempt+1}/{retries}")
                time.sleep(1)
        return None

    def fetch_season_races(season):
        """Fetch all races of `season` with their complete result lists.

        Returns a list with one dict per race (possibly empty), or None when
        any page could not be retrieved, so that a partially downloaded
        season is never written.
        """
        print(f"Fetching season {season}")
        url = f"https://api.jolpi.ca/ergast/f1/{season}/results/"

        # Get initial count
        initial = make_api_request(url, {"limit": 1, "offset": 0})
        if not initial or "MRData" not in initial:
            return None

        total = int(initial["MRData"].get("total", 0))
        limit = 100

        # The endpoint pages over individual result rows, not races, so a race
        # that straddles a page boundary comes back on both pages with a
        # partial "Results" list. Merge those fragments by round to keep one
        # entry per race.
        merged_races = []
        race_by_round = {}

        for offset in range(0, total, limit):
            result = make_api_request(url, {"limit": limit, "offset": offset})
            if not (result and "MRData" in result and "RaceTable" in result["MRData"]):
                print(f"Failed to fetch results at offset {offset} for season {season}")
                return None

            for race in result["MRData"]["RaceTable"].get("Races", []):
                round_id = race.get("round")
                existing = race_by_round.get(round_id) if round_id is not None else None
                if existing is None:
                    merged_races.append(race)
                    if round_id is not None:
                        race_by_round[round_id] = race
                else:
                    existing.setdefault("Results", []).extend(race.get("Results", []))

        # Add metadata to each race
        ingestion_ts = datetime.now().isoformat()
        for race in merged_races:
            race["season"] = season
            race["source"] = "jolpica_api"
            race["ingestion_timestamp"] = ingestion_ts

        return merged_races

    def resolve_hadoop_path(path):
        """Return (FileSystem, Path) handles for `path` via Spark's JVM gateway."""
        hadoop_path = spark._jvm.org.apache.hadoop.fs.Path(path)
        fs = hadoop_path.getFileSystem(spark._jsc.hadoopConfiguration())
        return fs, hadoop_path

    base_path = bronze_path.rstrip("/")  # avoid "bronze//season=..." in the output path

    # Once a season fails, later seasons are still fetched but the checkpoint
    # is frozen: otherwise the next run would start after the failed season
    # and never retry it. Seasons written meanwhile are skipped on the next
    # run because their directory already exists.
    checkpoint_ok = True

    # Process seasons
    for season in seasons_to_process:
        print(f"\nProcessing season {season}")
        season_dir = f"{base_path}/season={season}"
        is_current_season = season == current_year

        if is_current_season:
            print(f"Force refreshing current season {season}")
        else:
            # Skip existing non-current seasons
            try:
                fs, hadoop_path = resolve_hadoop_path(season_dir)
                if fs.exists(hadoop_path):
                    print(f"Skipping existing non-current season {season}")
                    continue
            except Exception as e:
                # Best effort: if the check itself fails, try to (re)load the season.
                print(f"Error checking directory: {str(e)}")

        # Fetch data
        races = fetch_season_races(season)
        if races is None:
            checkpoint_ok = False
        if not races:
            print(f"No data for season {season}")
            continue

        # Only drop the previous copy of the current season once fresh data is
        # in hand, so a failed fetch does not wipe existing data.
        if is_current_season:
            try:
                fs, hadoop_path = resolve_hadoop_path(season_dir)
                if fs.exists(hadoop_path):
                    print(f"Deleting existing data for season {season}")
                    fs.delete(hadoop_path, True)  # recursive delete
            except Exception as e:
                # If the delete failed, saveAsTextFile below raises because
                # the target directory still exists.
                print(f"Error deleting directory: {str(e)}")

        # Write using Spark
        sc = spark.sparkContext
        rdd = sc.parallelize([json.dumps(race) for race in races])
        rdd.coalesce(1).saveAsTextFile(season_dir)

        # Update checkpoint
        if checkpoint_ok:
            checkpoint_data = [{
                "timestamp": datetime.now().isoformat(),
                "last_processed_season": season,
                "records_processed": len(races)
            }]
            spark.createDataFrame(checkpoint_data).write.mode("append").parquet(checkpoint_path)
        else:
            print(f"Checkpoint not advanced for season {season}: an earlier season failed in this run")

        print(f"Saved {len(races)} races for season {season}")

    print("Bronze layer update complete")
    return seasons_to_process

# Create (or reuse) the Spark session, then run the incremental bronze load.
# The call is left as the final expression so the notebook displays its result.
spark = (
    SparkSession.builder
    .appName("F1MedallionPipeline")
    .getOrCreate()
)
fetch_f1_bronze_layer_incremental(spark)

Last processed season from checkpoint: 2011
Processing seasons: [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]

Processing season 2012
Fetching season 2012
Saved 24 races for season 2012

Processing season 2013
Fetching season 2013
Saved 23 races for season 2013

Processing season 2014
Fetching season 2014
Rate limited. Waiting 2s
Rate limited. Waiting 2s
Saved 23 races for season 2014

Processing season 2015
Fetching season 2015
Rate limited. Waiting 2s
Rate limited. Waiting 2s
Rate limited. Waiting 2s
Rate limited. Waiting 2s
Rate limited. Waiting 2s
Saved 22 races for season 2015

Processing season 2016
Fetching season 2016
Rate limited. Waiting 2s
Rate limited. Waiting 2s
Rate limited. Waiting 2s
Rate limited. Waiting 2s
Rate limited. Waiting 2s
Rate limited. Waiting 2s
Saved 25 races for season 2016

Processing season 2017
Fetching season 2017
Rate limited. Waiting 2s
Rate limited. Waiting 2s
Rate limited. Waiting 2s
Rate limited. Waiting 2s
R

[2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020,
 2021,
 2022,
 2023,
 2024,
 2025]