In [1]:
import os
from pathlib import Path
# The later cells use relative `Data/` and `outputs/` paths. When the kernel is
# started from inside a `jupyter_notebooks` folder, step up to its parent so
# those paths resolve from there; otherwise stay where we are.
cwd = Path.cwd()
started_in_notebook_dir = (cwd.name == 'jupyter_notebooks')
if started_in_notebook_dir:
    os.chdir(cwd.parent)
print('Working dir:', Path.cwd())

Working dir: c:\Projects\us-air-quality-dashboard


In [2]:
import pandas as pd
from pathlib import Path
# Load a 10,000-row sample for quick inspection. The raw extract is preferred;
# the cleaned copy under outputs/ is used only when the raw file is absent, and
# an empty frame is created when neither file exists.
src = Path('Data/pollution_us_2000_2016.csv')
fallback = Path('outputs/pollution_us_200_2016_clean.csv')

raw_available = src.exists()
if not raw_available and not fallback.exists():
    print('No data available; creating empty DataFrame')
    df = pd.DataFrame()
elif raw_available:
    df = pd.read_csv(src, nrows=10000)
    print(f'Loaded {len(df)} rows from {src}')
else:
    df = pd.read_csv(fallback, nrows=10000)
    print(f'Fallback: loaded {len(df)} rows from {fallback}')

display(df.head())

Loaded 10000 rows from Data\pollution_us_2000_2016.csv


Unnamed: 0.1,Unnamed: 0,State Code,County Code,Site Num,Address,State,County,City,Date Local,NO2 Units,...,SO2 Units,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Units,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI
0,0,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,3.0,9.0,21,13.0,Parts per million,1.145833,4.2,21,
1,1,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,3.0,9.0,21,13.0,Parts per million,0.878947,2.2,23,25.0
2,2,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,2.975,6.6,23,,Parts per million,1.145833,4.2,21,
3,3,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,2.975,6.6,23,,Parts per million,0.878947,2.2,23,25.0
4,4,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-02,Parts per billion,...,Parts per billion,1.958333,3.0,22,4.0,Parts per million,0.85,1.6,23,


In [3]:
# Chunk-processing (only runs if raw exists)
from pathlib import Path
def write_clean_csv(raw_path, clean_path, chunksize=200_000):
    """Stream a raw CSV in chunks, add date parts, and write the cleaned CSV.

    Each chunk's 'Date Local' column is parsed with ``pd.to_datetime``
    (unparseable values become NaT) and 'Year', 'Month' and 'Quarter' columns
    are derived from it. Chunks are appended to a temporary ``.part`` file next
    to ``clean_path``; that file replaces ``clean_path`` only after every chunk
    has been written, so a failure part-way through leaves any previous output
    untouched instead of a truncated file.

    Parameters
    ----------
    raw_path : str or Path
        CSV to read. Must contain a 'Date Local' column.
    clean_path : str or Path
        Destination CSV. Its parent directory is created if needed.
    chunksize : int, default 200_000
        Number of rows read per chunk.

    Returns
    -------
    int
        Total number of data rows written.

    Raises
    ------
    KeyError
        If the input has no 'Date Local' column.
    """
    raw_path, clean_path = Path(raw_path), Path(clean_path)
    clean_path.parent.mkdir(parents=True, exist_ok=True)

    part_path = clean_path.with_name(clean_path.name + '.part')
    if part_path.exists():
        # Left over from an interrupted run; appending to it would corrupt the result.
        part_path.unlink()

    rows_written = 0
    header_written = False
    chunks = pd.read_csv(raw_path, chunksize=chunksize)
    try:
        for chunk in chunks:
            # Index directly (not .get) so a missing column fails with a clear KeyError.
            chunk['Date Local'] = pd.to_datetime(chunk['Date Local'], errors='coerce')
            chunk['Year'] = chunk['Date Local'].dt.year
            chunk['Month'] = chunk['Date Local'].dt.month
            chunk['Quarter'] = chunk['Date Local'].dt.quarter
            # Only the first chunk carries the header row.
            chunk.to_csv(part_path, mode='a', index=False, header=not header_written)
            header_written = True
            rows_written += len(chunk)
        if header_written:
            # Swap the finished file into place in a single step.
            part_path.replace(clean_path)
    finally:
        chunks.close()
        if part_path.exists():
            part_path.unlink()
    return rows_written


file_path = Path('Data/pollution_us_2000_2016.csv')
out_path = Path('outputs/pollution_us_200_2016_clean.csv')
out_path.parent.mkdir(parents=True, exist_ok=True)
if file_path.exists():
    write_clean_csv(file_path, out_path)
    print('Wrote cleaned CSV to', out_path)
else:
    print('Raw file missing; chunk-processing skipped')

Wrote cleaned CSV to outputs\pollution_us_200_2016_clean.csv


In [4]:
# Preview outputs (supports multiple historical filenames)
from pathlib import Path
def snake_case_columns(columns):
    """Return column labels stripped, lower-cased and snake_cased.

    Runs of whitespace become a single underscore, '-' is removed and '/' is
    replaced by '_'. The two literal replacements pass ``regex=False`` so the
    result does not depend on the pandas version's default for ``regex``.

    Parameters
    ----------
    columns : pandas.Index
        Labels to normalise (string-valued).

    Returns
    -------
    pandas.Index
        The normalised labels, in the same order.
    """
    return (
        columns.str.strip()
        .str.lower()
        .str.replace(r'\s+', '_', regex=True)
        .str.replace('-', '', regex=False)
        .str.replace('/', '_', regex=False)
    )


# File names the cleaned export may have on disk, tried in this order.
# (The list previously repeated the first path; the duplicate had no effect.)
candidates = [
    Path('outputs/pollution_us_200_2016_clean.csv'),
    Path('outputs/pollution_us_2000_2016_clean.csv'),
]
preview = next((p for p in candidates if p.exists()), None)
print('Preview path:', preview)
if preview is not None:
    # read a small sample to avoid memory pressure
    df_clean = pd.read_csv(preview, low_memory=False, nrows=20)
    print('Sampled cleaned CSV rows:', len(df_clean))
    df_clean.columns = snake_case_columns(df_clean.columns)
    display(df_clean.head())
else:
    print('No cleaned CSV present')

Preview path: outputs\pollution_us_200_2016_clean.csv
Sampled cleaned CSV rows: 20


Unnamed: 0,unnamed:_0,state_code,county_code,site_num,address,state,county,city,date_local,no2_units,...,so2_1st_max_hour,so2_aqi,co_units,co_mean,co_1st_max_value,co_1st_max_hour,co_aqi,year,month,quarter
0,0,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,21,13.0,Parts per million,1.145833,4.2,21,,2000,1,1
1,1,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,21,13.0,Parts per million,0.878947,2.2,23,25.0,2000,1,1
2,2,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,23,,Parts per million,1.145833,4.2,21,,2000,1,1
3,3,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,23,,Parts per million,0.878947,2.2,23,25.0,2000,1,1
4,4,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-02,Parts per billion,...,22,4.0,Parts per million,0.85,1.6,23,,2000,1,1
