In [1]:
import os
from pathlib import Path
cwd = Path.cwd()
if cwd.name == 'jupyter_notebooks':
    os.chdir(cwd.parent)
print('Working dir:', Path.cwd())

Working dir: c:\Projects\us-air-quality-dashboard


In [2]:
import pandas as pd
from pathlib import Path
src = Path('Data/pollution_us_2000_2016.csv')
fallback = Path('outputs/pollution_us_200_2016_clean.csv')
if src.exists():
    df = pd.read_csv(src, nrows=10000)
    print(f'Loaded {len(df)} rows from {src}')
elif fallback.exists():
    df = pd.read_csv(fallback, nrows=10000)
    print(f'Fallback: loaded {len(df)} rows from {fallback}')
else:
    print('No data available; creating empty DataFrame')
    df = pd.DataFrame()
display(df.head())

Loaded 10000 rows from Data\pollution_us_2000_2016.csv


Unnamed: 0.1,Unnamed: 0,State Code,County Code,Site Num,Address,State,County,City,Date Local,NO2 Units,...,SO2 Units,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Units,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI
0,0,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,3.0,9.0,21,13.0,Parts per million,1.145833,4.2,21,
1,1,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,3.0,9.0,21,13.0,Parts per million,0.878947,2.2,23,25.0
2,2,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,2.975,6.6,23,,Parts per million,1.145833,4.2,21,
3,3,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,2.975,6.6,23,,Parts per million,0.878947,2.2,23,25.0
4,4,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-02,Parts per billion,...,Parts per billion,1.958333,3.0,22,4.0,Parts per million,0.85,1.6,23,


In [3]:
# Chunk-processing (only runs if raw exists)
from pathlib import Path
file_path = Path('Data/pollution_us_2000_2016.csv')
out_path = Path('outputs/pollution_us_200_2016_clean.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
if file_path.exists():
    if out_path.exists():
        out_path.unlink()
    chunks = pd.read_csv(file_path, chunksize=200_000)
    first = True
    for i, chunk in enumerate(chunks):
        chunk['Date Local'] = pd.to_datetime(chunk.get('Date Local'), errors='coerce')
        chunk['Year'] = chunk['Date Local'].dt.year
        chunk['Month'] = chunk['Date Local'].dt.month
        chunk['Quarter'] = chunk['Date Local'].dt.quarter
        chunk.to_csv(out_path, mode='a', index=False, header=first)
        first = False
    print('Wrote cleaned CSV to', out_path)
else:
    print('Raw file missing; chunk-processing skipped')

Wrote cleaned CSV to outputs\pollution_us_200_2016_clean.csv


In [4]:
# Preview outputs (supports multiple historical filenames)
from pathlib import Path
candidates = [Path('outputs/pollution_us_200_2016_clean.csv'), Path('outputs/pollution_us_2000_2016_clean.csv'), Path('outputs/pollution_us_200_2016_clean.csv')]
preview = next((p for p in candidates if p.exists()), None)
print('Preview path:', preview)
if preview is not None:
    # read a small sample to avoid memory pressure
    df_clean = pd.read_csv(preview, low_memory=False, nrows=20)
    print('Sampled cleaned CSV rows:', len(df_clean))
    df_clean.columns = (df_clean.columns.str.strip().str.lower().str.replace(r'\s+', '_', regex=True).str.replace('-', '').str.replace('/', '_'))
    display(df_clean.head())
else:
    print('No cleaned CSV present')

Preview path: outputs\pollution_us_200_2016_clean.csv
Sampled cleaned CSV rows: 20


Unnamed: 0,unnamed:_0,state_code,county_code,site_num,address,state,county,city,date_local,no2_units,...,so2_1st_max_hour,so2_aqi,co_units,co_mean,co_1st_max_value,co_1st_max_hour,co_aqi,year,month,quarter
0,0,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,21,13.0,Parts per million,1.145833,4.2,21,,2000,1,1
1,1,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,21,13.0,Parts per million,0.878947,2.2,23,25.0,2000,1,1
2,2,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,23,,Parts per million,1.145833,4.2,21,,2000,1,1
3,3,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,23,,Parts per million,0.878947,2.2,23,25.0,2000,1,1
4,4,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-02,Parts per billion,...,22,4.0,Parts per million,0.85,1.6,23,,2000,1,1


In [5]:
# Step 3: Ensure numeric columns are detected and cleaned, then report summary and save numeric-fixed CSV
from pathlib import Path
import pandas as pd

# locate the cleaned CSV (supports historical names)
candidates = [Path('outputs/pollution_us_200_2016_clean.csv'), Path('outputs/pollution_us_2000_2016_clean.csv')]
clean_path = next((p for p in candidates if p.exists()), None)
if clean_path is None:
    print('No cleaned CSV found; skipping numeric-clean step')
else:
    print('Cleaning numeric columns in:', clean_path)
    # read a modest sample to infer dtypes and compute medians for filling
    sample = pd.read_csv(clean_path, nrows=2000)

    # Try to detect numeric columns from the sample first
    numeric_cols = [c for c in sample.columns if pd.api.types.is_numeric_dtype(sample[c])]

    # If none detected, attempt coercion heuristic: column is numeric if >=50% of non-null sample values coerce to numeric
    if not numeric_cols:
        numeric_cols = []
        for c in sample.columns:
            non_null = sample[c].notna().sum()
            if non_null == 0:
                continue
            coerced = pd.to_numeric(sample[c], errors='coerce')
            frac = coerced.notna().sum() / float(non_null)
            if frac >= 0.5:
                numeric_cols.append(c)

    print('Numeric columns:', numeric_cols)

    # compute medians from the sample for filling later (only for the detected numeric cols)
    medians = {}
    if numeric_cols:
        medians = sample[numeric_cols].median().to_dict()
        print('Using medians for fill (sample):')
        for k, v in medians.items():
            print(f'  {k}: {v}')

    # prepare output path for numeric-fixed CSV (do not commit file by default)
    out_numeric = Path('outputs/pollution_us_200_2016_clean_numeric_fixed.csv')
    out_numeric.parent.mkdir(parents=True, exist_ok=True)
    if out_numeric.exists():
        print('Removing existing numeric-fixed file:', out_numeric)
        out_numeric.unlink()

    # chunk through the full cleaned CSV, coerce numeric cols, fill missing with medians, write out, and compute null-change stats
    chunksize = 200_000
    total_rows = 0
    null_before = {c: 0 for c in sample.columns}
    null_after = {c: 0 for c in sample.columns}
    first_write = True
    for chunk in pd.read_csv(clean_path, chunksize=chunksize):
        total_rows += len(chunk)
        # accumulate before-null counts for all columns (coarse)
        for c in sample.columns:
            null_before[c] += int(chunk[c].isna().sum())
        if numeric_cols:
            for c in numeric_cols:
                chunk[c] = pd.to_numeric(chunk[c], errors='coerce')
                # fill with median from sample if available
                if c in medians and pd.notna(medians[c]):
                    chunk[c] = chunk[c].fillna(medians[c])
        # accumulate after-null counts
        for c in sample.columns:
            null_after[c] += int(chunk[c].isna().sum())
        # write chunk to numeric-fixed CSV
        chunk.to_csv(out_numeric, mode='a', index=False, header=first_write)
        first_write = False

    # summarize
    print('\nNumeric-clean summary:')
    print('  total rows scanned:', total_rows)
    # report top columns by percent-null before
    percent_before = {c: (null_before[c] / total_rows * 100) if total_rows>0 else 0 for c in sample.columns}
    percent_after = {c: (null_after[c] / total_rows * 100) if total_rows>0 else 0 for c in sample.columns}
    # show top 8 columns with highest null before
    top_before = sorted(percent_before.items(), key=lambda x: x[1], reverse=True)[:8]
    print('\nTop columns by percent-null (before -> after):')
    for c, p in top_before:
        print(f'  {c}: {p:.2f}% -> {percent_after[c]:.2f}% (nulls {null_before[c]} -> {null_after[c]})')

    # present small sample with numeric coercion applied for user inspection
    sample2 = sample.copy()
    if numeric_cols:
        for c in numeric_cols:
            sample2[c] = pd.to_numeric(sample2[c], errors='coerce')
            if c in medians and pd.notna(medians[c]):
                sample2[c] = sample2[c].fillna(medians[c])
    print('\nSample after numeric coercion/fill (first 5 rows):')
    display(sample2.head())
    print('\nWrote numeric-fixed CSV to:', out_numeric)

Cleaning numeric columns in: outputs\pollution_us_200_2016_clean.csv
Numeric columns: ['Unnamed: 0', 'State Code', 'County Code', 'Site Num', 'NO2 Mean', 'NO2 1st Max Value', 'NO2 1st Max Hour', 'NO2 AQI', 'O3 Mean', 'O3 1st Max Value', 'O3 1st Max Hour', 'O3 AQI', 'SO2 Mean', 'SO2 1st Max Value', 'SO2 1st Max Hour', 'SO2 AQI', 'CO Mean', 'CO 1st Max Value', 'CO 1st Max Hour', 'CO AQI', 'Year', 'Month', 'Quarter']
Using medians for fill (sample):
  Unnamed: 0: 999.5
  State Code: 4.0
  County Code: 13.0
  Site Num: 3002.0
  NO2 Mean: 29.6742425
  NO2 1st Max Value: 55.0
  NO2 1st Max Hour: 19.0
  NO2 AQI: 52.0
  O3 Mean: 0.021125
  O3 1st Max Value: 0.043
  O3 1st Max Hour: 10.0
  O3 AQI: 36.0
  SO2 Mean: 1.4666665
  SO2 1st Max Value: 4.0
  SO2 1st Max Hour: 8.0
  SO2 AQI: 7.0
  CO Mean: 0.7625
  CO 1st Max Value: 1.7
  CO 1st Max Hour: 7.0
  CO AQI: 16.0
  Year: 2000.0
  Month: 6.0
  Quarter: 2.0



Numeric-clean summary:
  total rows scanned: 1746661

Top columns by percent-null (before -> after):
  CO AQI: 50.00% -> 0.00% (nulls 873323 -> 0)
  SO2 AQI: 49.98% -> 0.00% (nulls 872907 -> 0)
  Unnamed: 0: 0.00% -> 0.00% (nulls 0 -> 0)
  State Code: 0.00% -> 0.00% (nulls 0 -> 0)
  County Code: 0.00% -> 0.00% (nulls 0 -> 0)
  Site Num: 0.00% -> 0.00% (nulls 0 -> 0)
  Address: 0.00% -> 0.00% (nulls 0 -> 0)
  State: 0.00% -> 0.00% (nulls 0 -> 0)

Sample after numeric coercion/fill (first 5 rows):


Unnamed: 0.1,Unnamed: 0,State Code,County Code,Site Num,Address,State,County,City,Date Local,NO2 Units,...,SO2 1st Max Hour,SO2 AQI,CO Units,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI,Year,Month,Quarter
0,0,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,21,13.0,Parts per million,1.145833,4.2,21,16.0,2000,1,1
1,1,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,21,13.0,Parts per million,0.878947,2.2,23,25.0,2000,1,1
2,2,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,23,7.0,Parts per million,1.145833,4.2,21,16.0,2000,1,1
3,3,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,23,7.0,Parts per million,0.878947,2.2,23,25.0,2000,1,1
4,4,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-02,Parts per billion,...,22,4.0,Parts per million,0.85,1.6,23,16.0,2000,1,1



Wrote numeric-fixed CSV to: outputs\pollution_us_200_2016_clean_numeric_fixed.csv


In [None]:
# Step 4: Normalize headers to snake_case, standardize decimals, and impute AQI; write standardized CSV
import re
from pathlib import Path
import pandas as pd

# input is the numeric-fixed CSV if present, otherwise fall back to cleaned CSVs
in_path = Path('outputs/pollution_us_200_2016_clean_numeric_fixed.csv')
fallbacks = [Path('outputs/pollution_us_200_2016_clean.csv'), Path('outputs/pollution_us_2000_2016_clean.csv')]
if not in_path.exists():
    in_path = next((p for p in fallbacks if p.exists()), None)
if in_path is None:
    print('No input file found for standardization; skipping Step 4')
else:
    out_path = Path('outputs/pollution_us_200_2016_clean_numeric_standardized.csv')
    out_path.parent.mkdir(parents=True, exist_ok=True)
    if out_path.exists():
        out_path.unlink()

    def to_snake(s):
        s = str(s).strip().lower()
        s = re.sub(r'[\\/]', '_', s)
        s = re.sub(r"\s+", '_', s)
        s = re.sub(r'[^0-9a-z_]', '', s)
        s = re.sub(r'_+', '_', s)
        return s

    # read a sample to infer columns and candidate numeric columns
    sample = pd.read_csv(in_path, nrows=2000)
    col_map = {c: to_snake(c) for c in sample.columns}
    sample = sample.rename(columns=col_map)

    # detect numeric columns in sample (dtype based + coercion heuristic)
    numeric_cols = [c for c in sample.columns if pd.api.types.is_numeric_dtype(sample[c])]
    if not numeric_cols:
        numeric_cols = []
        for c in sample.columns:
            non_null = sample[c].notna().sum()
            if non_null == 0:
                continue
            coerced = pd.to_numeric(sample[c].astype(str).str.replace(',', '.'), errors='coerce')
            if coerced.notna().sum() / float(non_null) >= 0.5:
                numeric_cols.append(c)

    print('Detected numeric columns:', numeric_cols)

    # prepare AQI medians from sample if available
    if 'aqi' in sample.columns and sample['aqi'].notna().any():
        overall_aqi_med = sample['aqi'].dropna().median()
        if 'year' in sample.columns:
            aqi_by_year = sample.groupby('year')['aqi'].median().to_dict()
        else:
            aqi_by_year = {}
    else:
        overall_aqi_med = None
        aqi_by_year = {}

    chunksize = 200_000
    total_rows = 0
    aqi_before = 0
    aqi_after = 0
    first = True
    for chunk in pd.read_csv(in_path, chunksize=chunksize):
        # rename headers to snake_case consistently
        chunk.rename(columns=col_map, inplace=True)
        total_rows += len(chunk)

        # standardize decimal separator on object columns (comma -> dot)
        for col in chunk.select_dtypes(include=['object']).columns:
            # only replace where commas appear to avoid unnecessary work
            if chunk[col].astype(str).str.contains(',', na=False).any():
                chunk[col] = chunk[col].astype(str).str.replace(',', '.', regex=False)

        # coerce numeric columns we detected
        for c in numeric_cols:
            if c in chunk.columns:
                chunk[c] = pd.to_numeric(chunk[c], errors='coerce')

        # AQI imputation if present
        if 'aqi' in chunk.columns:
            aqi_before += int(chunk['aqi'].isna().sum())
            if aqi_by_year:
                # fill by year where possible, else overall median
                def _fill(row):
                    if pd.notna(row.get('aqi')):
                        return row.get('aqi')
                    y = row.get('year') if 'year' in row.index else None
                    if pd.notna(y) and y in aqi_by_year and pd.notna(aqi_by_year[y]):
                        return aqi_by_year[y]
                    return overall_aqi_med
                chunk['aqi'] = chunk.apply(_fill, axis=1)
            else:
                if overall_aqi_med is not None:
                    chunk['aqi'] = chunk['aqi'].fillna(overall_aqi_med)
            aqi_after += int(chunk['aqi'].isna().sum())

        # write standardized chunk
        chunk.to_csv(out_path, mode='a', index=False, header=first)
        first = False

    print('\nWrote standardized CSV to:', out_path)
    print('Total rows:', total_rows)
    if overall_aqi_med is not None:
        print('AQI missing before:', aqi_before, 'after:', aqi_after)
    else:
        print('AQI not present in data; no imputation performed')

In [None]:
# Step 5: Calculate AQI per pollutant and compute overall AQI; write augmented CSV
from pathlib import Path
import pandas as pd
import math

# Input is the standardized CSV from Step 4 if present
std_path = Path('outputs/pollution_us_200_2016_clean_numeric_standardized.csv')
if not std_path.exists():
    print('Standardized input not found; skipping AQI calculation (expected at', std_path, ')')
else:
    out_aqi = Path('outputs/pollution_us_200_2016_with_aqi.csv')
    out_aqi.parent.mkdir(parents=True, exist_ok=True)
    if out_aqi.exists():
        print('Removing existing AQI file:', out_aqi)
        out_aqi.unlink()

    # EPA-like breakpoints for AQI calculation (assumes concentrations in common units)
    # Each entry is list of tuples: (C_low, C_high, I_low, I_high)
    breakpoints = {
        'pm25': [
            (0.0, 12.0, 0, 50),
            (12.1, 35.4, 51, 100),
            (35.5, 55.4, 101, 150),
            (55.5, 150.4, 151, 200),
            (150.5, 250.4, 201, 300),
            (250.5, 350.4, 301, 400),
            (350.5, 500.4, 401, 500)
        ],
        'pm10': [
            (0, 54, 0, 50),
            (55, 154, 51, 100),
            (155, 254, 101, 150),
            (255, 354, 151, 200),
            (355, 424, 201, 300),
            (425, 504, 301, 400),
            (505, 604, 401, 500)
        ],
        # SO2, NO2 are in ppb here; CO in ppm; O3 in ppb (8-hr)
        'so2': [
            (0, 35, 0, 50),
            (36, 75, 51, 100),
            (76, 185, 101, 150),
            (186, 304, 151, 200),
            (305, 604, 201, 300),
            (605, 804, 301, 400),
            (805, 1004, 401, 500)
        ],
        'no2': [
            (0, 53, 0, 50),
            (54, 100, 51, 100),
            (101, 360, 101, 150),
            (361, 649, 151, 200),
            (650, 1249, 201, 300),
            (1250, 1649, 301, 400),
            (1650, 2049, 401, 500)
        ],
        'co': [
            (0.0, 4.4, 0, 50),
            (4.5, 9.4, 51, 100),
            (9.5, 12.4, 101, 150),
            (12.5, 15.4, 151, 200),
            (15.5, 30.4, 201, 300),
            (30.5, 40.4, 301, 400),
            (40.5, 50.4, 401, 500)
        ],
        'o3': [
            (0, 54, 0, 50),
            (55, 70, 51, 100),
            (71, 85, 101, 150),
            (86, 105, 151, 200),
            (106, 200, 201, 300)
            # higher ranges omitted for brevity
        ]
    }

    def aqi_calc_from_breakpoints(C, bps):
        """Calculate AQI for a concentration C given breakpoint list bps"""
        if C is None or (isinstance(C, float) and math.isnan(C)):
            return None
        try:
            C = float(C)
        except Exception:
            return None
        for (Cl, Ch, Il, Ih) in bps:
            if Cl <= C <= Ch:
                # linear interpolation
                aqi = (Ih - Il) / (Ch - Cl) * (C - Cl) + Il
                return int(round(aqi))
        # if outside defined range, clip to 500 or return None
        if C > bps[-1][1]:
            return 500
        return None

    # find candidate column names in the standardized file that map to pollutants
    sample = pd.read_csv(std_path, nrows=5)
    cols = sample.columns.tolist()
    pollutant_column_map = {}
    # heuristics for column name detection
    mapping_hints = {
        'pm25': ['pm2_5', 'pm25', 'pm_2_5', 'pm2.5'],
        'pm10': ['pm10', 'pm_10'],
        'so2': ['so2', 's02', 'sulfur_dioxide'],
        'no2': ['no2', 'nitrogen_dioxide'],
        'co': ['co', 'carbon_monoxide'],
        'o3': ['o3', 'ozone']
    }
    for pol, hints in mapping_hints.items():
        for h in hints:
            matches = [c for c in cols if h in c]
            if matches:
                pollutant_column_map[pol] = matches[0]
                break

    print('Detected pollutant column mapping:', pollutant_column_map)

    # Now iterate chunks, compute pollutant AQIs and overall AQI
    chunksize = 200_000
    first = True
    total_rows = 0
    aqi_computed_count = 0
    # we'll write the augmented CSV with added columns: _aqi_<pollutant> and aqi, aqi_main_pollutant
    for chunk in pd.read_csv(std_path, chunksize=chunksize):
        total_rows += len(chunk)
        # compute AQI per pollutant
        pollutant_aqis = {}
        for pol, colname in pollutant_column_map.items():
            if colname in chunk.columns:
                pollutant_aqis[pol] = chunk[colname].map(lambda v: aqi_calc_from_breakpoints(v, breakpoints[pol]))
                # name the column
                chunk[f'aqi_{pol}'] = pollutant_aqis[pol]
        # compute overall AQI and primary pollutant
        if pollutant_aqis:
            # DataFrame of pollutant AQIs
            aqi_df = pd.DataFrame({pol: pollutant_aqis[pol] for pol in pollutant_aqis})
            # overall AQI is row-wise max (ignoring NaN)
            chunk['aqi_computed'] = aqi_df.max(axis=1)
            # primary pollutant: pollutant with max aqi per row
            def top_pol(row):
                row = row.to_dict()
                # choose pollutant with max AQI
                best = None
                best_val = -1
                for k, v in row.items():
                    if pd.isna(v):
                        continue
                    if v > best_val:
                        best_val = v
                        best = k
                return best
            chunk['aqi_main_pollutant'] = aqi_df.apply(top_pol, axis=1)
            # if existing 'aqi' column present, prefer computed values where available
            if 'aqi' in chunk.columns:
                # count how many computed will replace or fill
                will_fill = int(chunk['aqi_computed'].notna().sum())
                aqi_computed_count += will_fill
                # fill existing aqi where missing, otherwise keep existing
                chunk['aqi'] = chunk['aqi'].fillna(chunk['aqi_computed'])
            else:
                chunk['aqi'] = chunk['aqi_computed']
                aqi_computed_count += int(chunk['aqi'].notna().sum())
            # drop the helper column 'aqi_computed'
            chunk.drop(columns=['aqi_computed'], inplace=True)
        else:
            print('No pollutant columns detected for AQI computation in this dataset; skipping per-chunk AQI calc')

        # write augmented chunk
        chunk.to_csv(out_aqi, mode='a', index=False, header=first)
        first = False

    print('\nWrote AQI-augmented CSV to:', out_aqi)
    print('Total rows processed:', total_rows)
    print('AQI values computed/fill count:', aqi_computed_count)

In [1]:
from pathlib import Path
import shutil

# Source PDF (relative to project root) and destination path (absolute)
output_pdf = Path("jupyter_notebooks/outputs/cleaned_datasets_summary.pdf")
export_copy = Path("C:/Users/ifrah/Documents/cleaned_datasets_summary.pdf")

if not output_pdf.exists():
    print(f"Source PDF not found: {output_pdf.resolve()}")
else:
    try:
        # Ensure destination directory exists
        export_copy.parent.mkdir(parents=True, exist_ok=True)
        # use copy2 to preserve metadata where possible
        shutil.copy2(str(output_pdf), str(export_copy))
        print("Exported a copy to:", export_copy.resolve())
    except Exception as e:
        print("Failed to copy PDF:", e)

Source PDF not found: C:\Projects\us-air-quality-dashboard\jupyter_notebooks\jupyter_notebooks\outputs\cleaned_datasets_summary.pdf
