In [1]:
import pandas as pd 

# load all datasets
# Every file is read from the shared ../../data/processed/ directory.

# Case counts, population estimates and weather data for Arizona and California
# (year ranges, where given, are taken from the file names).
az_cocci = pd.read_csv("../../data/processed/arizona_cm_combined_data.csv")
az_pop = pd.read_csv("../../data/processed/arizona_pop_est_1980-2023.csv")
az_met = pd.read_csv("../../data/processed/Arizona_Weather_Data_Daily_Updates_1994_to_2023.csv")
ca_cocci = pd.read_csv("../../data/processed/Cali_Monthly_Cases.csv")
ca_pop = pd.read_csv("../../data/processed/California_Population_2000-2023.csv")
ca_met = pd.read_csv("../../data/processed/California_Weather_Data_Daily_Updates_2001_to_2022.csv")

# Arizona air-quality files, one per pollutant.
# low_memory=False makes pandas parse each file in a single pass instead of in
# chunks, which avoids mixed-type inference for a column.
az_co = pd.read_csv("../../data/processed/arizona_CO_1993_2023.csv", low_memory=False)
az_no2 = pd.read_csv("../../data/processed/arizona_NO2_1993_2023.csv", low_memory=False)
az_oz = pd.read_csv("../../data/processed/arizona_Ozone_1993_2023.csv", low_memory=False)
az_pm2_5 = pd.read_csv("../../data/processed/arizona_PM2.5_1993_2023.csv", low_memory=False)
az_pm2_5_nonref = pd.read_csv("../../data/processed/arizona_PM2.5_nonref_1993_2023.csv", low_memory=False)
az_pm10 = pd.read_csv("../../data/processed/arizona_PM10_1993_2023.csv", low_memory=False)
az_so2 = pd.read_csv("../../data/processed/arizona_SO2_1993_2023.csv", low_memory=False)
az_tsp = pd.read_csv("../../data/processed/arizona_TSP_1993_2023.csv", low_memory=False)

# California air-quality files, one per pollutant (same pollutant set as Arizona).
ca_co = pd.read_csv("../../data/processed/california_CO_2000_2022.csv", low_memory=False)
ca_no2 = pd.read_csv("../../data/processed/california_NO2_2000_2022.csv", low_memory=False)
ca_oz = pd.read_csv("../../data/processed/california_Ozone_2000_2022.csv", low_memory=False)
ca_pm2_5 = pd.read_csv("../../data/processed/california_PM2.5_2000_2022.csv", low_memory=False)
ca_pm2_5_nonref = pd.read_csv("../../data/processed/california_PM2.5_nonref_2000_2022.csv", low_memory=False)
ca_pm10 = pd.read_csv("../../data/processed/california_PM10_2000_2022.csv", low_memory=False)
ca_so2 = pd.read_csv("../../data/processed/california_SO2_2000_2022.csv", low_memory=False)
ca_tsp = pd.read_csv("../../data/processed/california_TSP_2000_2022.csv", low_memory=False)

# Air-pollutant files covering both states together and each state separately
# (presumably combined from the per-pollutant files above; verify against the
# processing step that produced them).
ca_az_air_pollutants = pd.read_csv("../../data/processed/air_pollutants_ca_and_az.csv", low_memory=False)
ca_air_pollutants = pd.read_csv("../../data/processed/air_pollutants_ca.csv", low_memory=False)
az_air_pollutants = pd.read_csv("../../data/processed/air_pollutants_az.csv", low_memory=False)

# Dataframes and their display titles. Each (frame, title) pair is written on
# one line so the two parallel lists cannot drift out of alignment; zip()
# regroups the pairs into one sequence of frames and one of titles, and
# map(list, ...) turns both into the plain lists `dfs` and `dfs_titles`.
dfs, dfs_titles = map(list, zip(
    (az_cocci, 'Arizona Cocci Case Count'),
    (az_pop, 'Arizona Population Estimates'),
    (az_met, 'Arizona Weather Data'),
    (ca_cocci, 'California Cocci Case Count'),
    (ca_pop, 'California Population Estimates'),
    (ca_met, 'California Weather Data'),
    (az_co, 'Arizona CO Data 1993-2023'),
    (az_no2, 'Arizona NO2 Data 1993-2023'),
    (az_oz, 'Arizona Ozone Data 1993-2023'),
    (az_pm2_5, 'Arizona PM2.5 Data 1993-2023'),
    (az_pm2_5_nonref, 'Arizona PM2.5 Non-Reference Data 1993-2023'),
    (az_pm10, 'Arizona PM10 Data 1993-2023'),
    (az_so2, 'Arizona SO2 Data 1993-2023'),
    (az_tsp, 'Arizona TSP Data 1993-2023'),
    (ca_co, 'California CO Data 2000-2022'),
    (ca_no2, 'California NO2 Data 2000-2022'),
    (ca_oz, 'California Ozone Data 2000-2022'),
    (ca_pm2_5, 'California PM2.5 Data 2000-2022'),
    (ca_pm2_5_nonref, 'California PM2.5 Non-Reference Data 2000-2022'),
    (ca_pm10, 'California PM10 Data 2000-2022'),
    (ca_so2, 'California SO2 Data 2000-2022'),
    (ca_tsp, 'California TSP Data 2000-2022'),
    (ca_az_air_pollutants, 'California and Arizona Air Pollutants'),
    (ca_air_pollutants, 'California Air Pollutants'),
    (az_air_pollutants, 'Arizona Air Pollutants'),
))

def _longest_run(positions):
    """Return (first, last) of the longest run of consecutive integers.

    `positions` must be a non-empty, ascending list of ints. On ties the
    earliest run wins.
    """
    best_first = best_last = run_first = prev = positions[0]
    for pos in positions[1:]:
        if pos != prev + 1:
            # The current run just ended at `prev`; keep it if it is longer.
            if prev - run_first > best_last - best_first:
                best_first, best_last = run_first, prev
            run_first = pos
        prev = pos
    # Close the final run.
    if prev - run_first > best_last - best_first:
        best_first, best_last = run_first, prev
    return best_first, best_last


def analyze_temporal_gaps(df, date_column, title, freq='D'):
    """
    Analyze temporal gaps in a dataset

    Parameters:
    - df: pandas DataFrame (not modified by this function)
    - date_column: name of the column containing dates
    - title: name of the dataset
    - freq: pandas offset alias of the expected sampling grid
      ('D' for daily, 'MS' for month start, etc.). Observations are matched
      against the grid produced by pd.date_range, so timestamps that do not
      fall on that grid are not counted as covering a period.

    Returns a single-row DataFrame with gap analysis. 'Total_Days' and
    'Missing_Days' count periods of `freq`; 'Longest_Gap' is the number of
    consecutive missing periods in the longest gap (0 when nothing is
    missing). 'Days_With_Data' is the number of distinct non-null dates.

    Raises KeyError if `date_column` is not a column of `df`.
    """
    if date_column not in df.columns:
        raise KeyError(
            f"date column {date_column!r} not found in dataset {title!r}; "
            f"available columns: {list(df.columns)}"
        )

    # Parse into a separate Series so the caller's DataFrame is left untouched;
    # unparseable/missing entries (NaT) carry no date information and are dropped.
    dates = pd.to_datetime(df[date_column]).dropna()

    stats = {
        'Dataset': title,
        'Start_Date': pd.NaT,
        'End_Date': pd.NaT,
        'Total_Days': 0,
        'Days_With_Data': 0,
        'Missing_Days': 0,
        'Gaps_Percentage': 0.0,
        'Longest_Gap': 0,
        'Gap_Start_Date': None,
        'Gap_End_Date': None
    }

    # Nothing to analyze: report an all-zero row instead of failing in date_range
    if dates.empty:
        return pd.DataFrame([stats])

    first_date, last_date = dates.min(), dates.max()
    observed = pd.DatetimeIndex(dates).unique()

    # Create complete date range and locate the grid points without data
    full_range = pd.date_range(start=first_date, end=last_date, freq=freq)
    is_present = full_range.isin(observed)
    missing_positions = [i for i, present in enumerate(is_present) if not present]

    stats['Start_Date'] = first_date
    stats['End_Date'] = last_date
    stats['Total_Days'] = len(full_range)
    stats['Days_With_Data'] = len(observed)
    stats['Missing_Days'] = len(missing_positions)
    # full_range can be empty for coarse frequencies (e.g. all data inside one
    # month with a month-end grid), so guard the division.
    if len(full_range):
        stats['Gaps_Percentage'] = (len(missing_positions) / len(full_range)) * 100

    # Find longest gap. Runs are detected by position in full_range, so
    # adjacent missing periods are grouped correctly for any frequency.
    if missing_positions:
        gap_first, gap_last = _longest_run(missing_positions)
        stats['Longest_Gap'] = gap_last - gap_first + 1
        stats['Gap_Start_Date'] = full_range[gap_first]
        stats['Gap_End_Date'] = full_range[gap_last]

    return pd.DataFrame([stats])

# Create list of datasets and their date columns
# Each entry is (DataFrame, name of its date column, display title).
# NOTE(review): 'date_column_name' is a placeholder, not a verified column of
# these DataFrames. analyze_temporal_gaps looks the column up by name, so a
# name that does not exist raises KeyError. Replace each placeholder with the
# dataset's actual date column before running.
dataset_info = [
    (az_cocci, 'date_column_name', 'Arizona Cocci Case Count'),
    (az_pop, 'date_column_name', 'Arizona Population Estimates'),
    # Add all your datasets here with their respective date column names
]

# Analyze all datasets
# Each call returns a single-row summary; the rows are stacked into one table
# with a fresh 0..n-1 index. freq is left at its default ('D', daily).
temporal_gaps = pd.concat([
    analyze_temporal_gaps(df, date_col, title)
    for df, date_col, title in dataset_info
], ignore_index=True)

# Sort by gap percentage
# (descending, so the datasets with the largest share of missing dates come first)
temporal_gaps = temporal_gaps.sort_values('Gaps_Percentage', ascending=False)

KeyError: 'date_column_name'