1. Handled the formatting of the TIME column.
2. Imputed missing values in non-numeric columns with 'unknown'.
3. Imputed missing values in numeric columns with -999, as it is an extreme value that is easy to notice and filter out.
4. Imputed missing values in DATE with '9999-12-31'.

In [None]:
-- Builds a session-scoped table that joins each fastest_laps row to its matching
-- fastestlaps_detailed, qualifyings and race_summaries rows, then replaces every
-- NULL with an imputed or sentinel value so downstream queries see no NULLs.
--
-- Imputation order per output column:
--   CAR, DRIVER, DRIVERCODE, GRAND_PRIX : most frequent non-NULL value, then 'unknown'
--   LAPS                                : b.LAP, then c.LAPS, then median of qualifyings.LAPS, then -999
--   POS, TIME, WINNER, WINNERCODE       : 'unknown'
--   YEAR                                : -999
--   DATE                                : '9999-12-31' (column is stored as text)
--
-- TIME is passed through as text from qualifyings; no median imputation is applied to it.
CREATE OR REPLACE TEMPORARY TABLE temp_race_table AS
WITH most_frequent_values AS (
    -- Single-row CTE (aggregate without GROUP BY) holding the fallback values.
    -- Each mode subquery skips NULLs so the fallback itself can never be NULL
    -- while the column has at least one value, and breaks count ties on the
    -- value itself so repeated runs pick the same fallback.
    SELECT
        (
            SELECT CAR
            FROM fastest_laps
            WHERE CAR IS NOT NULL
            GROUP BY CAR
            ORDER BY COUNT(*) DESC, CAR
            LIMIT 1
        ) AS most_frequent_car,
        (
            SELECT DRIVER
            FROM fastest_laps
            WHERE DRIVER IS NOT NULL
            GROUP BY DRIVER
            ORDER BY COUNT(*) DESC, DRIVER
            LIMIT 1
        ) AS most_frequent_driver,
        (
            SELECT DRIVERCODE
            FROM fastestlaps_detailed
            WHERE DRIVERCODE IS NOT NULL
            GROUP BY DRIVERCODE
            ORDER BY COUNT(*) DESC, DRIVERCODE
            LIMIT 1
        ) AS most_frequent_drivercode,
        (
            SELECT GRAND_PRIX
            FROM fastest_laps
            WHERE GRAND_PRIX IS NOT NULL
            GROUP BY GRAND_PRIX
            ORDER BY COUNT(*) DESC, GRAND_PRIX
            LIMIT 1
        ) AS most_frequent_grand_prix,
        MEDIAN(LAPS) AS median_laps  -- MEDIAN ignores NULL inputs
    FROM qualifyings
)
-- NOTE(review): DISTINCT hides any row multiplication caused by the joins below;
-- confirm the join keys are unique in b, c and d.
SELECT DISTINCT
    COALESCE(a.CAR, most_frequent_values.most_frequent_car, 'unknown') AS CAR,
    COALESCE(a.DRIVER, most_frequent_values.most_frequent_driver, 'unknown') AS DRIVER,
    COALESCE(b.DRIVERCODE, most_frequent_values.most_frequent_drivercode, 'unknown') AS DRIVERCODE,
    COALESCE(a.GRAND_PRIX, most_frequent_values.most_frequent_grand_prix, 'unknown') AS GRAND_PRIX,
    COALESCE(b.LAP, c.LAPS, most_frequent_values.median_laps, -999) AS LAPS,
    COALESCE(c.POS, 'unknown') AS POS,
    COALESCE(c.TIME, 'unknown') AS TIME,
    COALESCE(a.YEAR, -999) AS YEAR,
    COALESCE(CAST(d.DATE AS STRING), '9999-12-31') AS DATE,
    COALESCE(d.WINNER, 'unknown') AS WINNER,
    COALESCE(d.WINNERCODE, 'unknown') AS WINNERCODE
FROM fastest_laps AS a
LEFT JOIN fastestlaps_detailed AS b
    ON a.DRIVER = b.DRIVER
    AND a.YEAR = b.YEAR
    AND a.CAR = b.CAR
    AND a.GRAND_PRIX = b.GRAND_PRIX
LEFT JOIN qualifyings AS c
    ON a.DRIVER = c.DRIVER
    AND a.YEAR = c.YEAR
    AND a.CAR = c.CAR
    AND a.GRAND_PRIX = c.GRAND_PRIX
LEFT JOIN race_summaries AS d
    ON a.YEAR = d.YEAR
    AND a.GRAND_PRIX = d.GRAND_PRIX
-- The fallback CTE has exactly one row, so this attaches it to every row
-- without changing the row count.
CROSS JOIN most_frequent_values;

-- Inspect the imputed table to check the POS column imputation
SELECT
    t.CAR,
    t.DRIVER,
    t.DRIVERCODE,
    t.GRAND_PRIX,
    t.LAPS,
    t.POS,
    t.TIME,
    t.YEAR,
    t.DATE,
    t.WINNER,
    t.WINNERCODE
FROM temp_race_table AS t