In [None]:
import sqlite3
import pandas as pd
import os
from datetime import datetime

In [None]:
# the database file lives under data/; make sure that directory exists first
db_path = 'data/analysis.db'
os.makedirs(os.path.dirname(db_path), exist_ok=True)

# open the sqlite database at db_path
conn = sqlite3.connect(db_path)
print(f"Connected to database: {db_path}")

In [None]:
# schema for the air_quality table; only created when it does not exist yet
air_quality_ddl = '''
    CREATE TABLE IF NOT EXISTS air_quality (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        date_local TEXT,
        state_code TEXT,
        county_code TEXT,
        site_number TEXT,
        parameter_code TEXT,
        parameter TEXT,
        sample_duration TEXT,
        arithmetic_mean REAL,
        max_value REAL,
        units_of_measure TEXT,
        latitude REAL,
        longitude REAL,
        local_site_name TEXT,
        city TEXT,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
'''
conn.execute(air_quality_ddl)

print("Created air_quality table")

In [None]:
# schema for the hospital_visits table; only created when it does not exist yet
hospital_visits_ddl = '''
    CREATE TABLE IF NOT EXISTS hospital_visits (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        mmwr_week TEXT,
        week_start TEXT,
        week_end TEXT,
        season TEXT,
        respiratory_category TEXT,
        visit_type TEXT,
        demographic_category TEXT,
        demographic_group TEXT,
        percent REAL,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
'''
conn.execute(hospital_visits_ddl)

print("Created hospital_visits table")

In [None]:
# schema for the monthly_analysis table; the UNIQUE clause allows at most one
# row per (month, respiratory_category, parameter) combination
monthly_analysis_ddl = '''
    CREATE TABLE IF NOT EXISTS monthly_analysis (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        month TEXT,
        respiratory_category TEXT,
        parameter TEXT,
        avg_pollution REAL,
        avg_visit_percent REAL,
        record_count INTEGER,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        UNIQUE(month, respiratory_category, parameter)
    )
'''
conn.execute(monthly_analysis_ddl)

print("Created monthly_analysis table")

In [None]:
# create metadata table
# table_name is declared UNIQUE so the table holds at most one row per tracked
# table: an INSERT OR REPLACE with an existing table_name then overwrites the
# old row instead of adding a duplicate on every run.
# NOTE: CREATE TABLE IF NOT EXISTS does not alter a table that already exists,
# so a database file created before this constraint keeps its old schema.
conn.execute('''
    CREATE TABLE IF NOT EXISTS metadata (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        table_name TEXT UNIQUE,
        record_count INTEGER,
        last_updated TIMESTAMP,
        data_source TEXT,
        notes TEXT
    )
''')

# persist all CREATE TABLE statements issued so far on this connection
conn.commit()
print("Created metadata table")

In [None]:
# load air quality data from csv
print("Loading air quality data...")
air_csv_path = 'data/air.csv'

# These identifier columns are declared TEXT in the air_quality schema. Left to
# type inference, pandas would parse all-digit codes as integers and drop any
# leading zeros, so force them to str. Only columns actually present in the
# file header are listed, keeping the load tolerant of missing columns.
air_code_columns = ['state_code', 'county_code', 'site_number', 'parameter_code']
air_header = pd.read_csv(air_csv_path, nrows=0).columns
air_code_dtypes = {col: str for col in air_code_columns if col in air_header}

air_df = pd.read_csv(air_csv_path, dtype=air_code_dtypes, low_memory=False)

print(f"Loaded {len(air_df):,} raw air quality records")

In [None]:
# columns to carry over from the raw air quality frame
air_columns = [
    'date_local', 'state_code', 'county_code', 'site_number',
    'parameter_code', 'parameter', 'sample_duration',
    'arithmetic_mean', 'first_max_value', 'units_of_measure',
    'latitude', 'longitude', 'local_site_name', 'city'
]

# keep only the wanted columns that the CSV actually provided, in list order
available_air_cols = set(air_df.columns)
existing_cols = [name for name in air_columns if name in available_air_cols]
air_clean = air_df.loc[:, existing_cols].copy()

# expose first_max_value under the name max_value (appended as the last column)
if 'first_max_value' in air_clean.columns:
    air_clean = (
        air_clean
        .assign(max_value=air_clean['first_max_value'])
        .drop(columns='first_max_value')
    )

print(f"Prepared {len(air_clean):,} records with {len(air_clean.columns)} columns")
air_clean.head()

In [None]:
# load into database
# Clear the table and append rather than using if_exists='replace': 'replace'
# drops the table and lets pandas recreate it from the DataFrame, which
# discards the id and created_at columns declared in the CREATE TABLE statement.
# Deleting first keeps the cell safe to re-run without duplicating rows.
conn.execute('DELETE FROM air_quality')
air_clean.to_sql('air_quality', conn, if_exists='append', index=False)
print(f"Loaded {len(air_clean):,} air quality records into database")

# update metadata
# Remove any earlier row for this table before inserting, so there is exactly
# one metadata row per table whether or not the metadata table enforces
# uniqueness on table_name.
conn.execute('DELETE FROM metadata WHERE table_name = ?', ('air_quality',))
conn.execute('''
    INSERT INTO metadata (table_name, record_count, last_updated, data_source, notes)
    VALUES (?, ?, ?, ?, ?)
''', ('air_quality', len(air_clean), datetime.now().isoformat(),
      'EPA AQS API (dailyData/byCounty)', 'Chicago PM2.5 daily measurements 2015-2024'))
conn.commit()

In [None]:
# read the raw hospital visit CSV into a DataFrame
print("Loading hospital visit data...")
hosp_df = pd.read_csv(filepath_or_buffer='data/hosp_data.csv')

print(f"Loaded {len(hosp_df):,} hospital visit records")
# preview the first rows as the cell output
hosp_df.head()

In [None]:
# columns to carry over from the raw hospital visit frame
hosp_columns = [
    'mmwr_week', 'week_start', 'week_end', 'season',
    'respiratory_category', 'visit_type', 'demographic_category',
    'demographic_group', 'percent'
]

# keep only the wanted columns that the CSV actually provided, in list order
available_hosp_cols = set(hosp_df.columns)
existing_hosp_cols = [name for name in hosp_columns if name in available_hosp_cols]
hosp_clean = hosp_df.loc[:, existing_hosp_cols].copy()

print(f"Prepared {len(hosp_clean):,} records with {len(hosp_clean.columns)} columns")

In [None]:
# load into database
# Clear the table and append rather than using if_exists='replace': 'replace'
# drops the table and lets pandas recreate it from the DataFrame, which
# discards the id and created_at columns declared in the CREATE TABLE statement.
# Deleting first keeps the cell safe to re-run without duplicating rows.
conn.execute('DELETE FROM hospital_visits')
hosp_clean.to_sql('hospital_visits', conn, if_exists='append', index=False)
print(f"Loaded {len(hosp_clean):,} hospital visit records into database")

# update metadata
# Remove any earlier row for this table before inserting, so there is exactly
# one metadata row per table whether or not the metadata table enforces
# uniqueness on table_name.
conn.execute('DELETE FROM metadata WHERE table_name = ?', ('hospital_visits',))
conn.execute('''
    INSERT INTO metadata (table_name, record_count, last_updated, data_source, notes)
    VALUES (?, ?, ?, ?, ?)
''', ('hospital_visits', len(hosp_clean), datetime.now().isoformat(),
      'City of Chicago Open Data Portal', 'Weekly respiratory illness ED visits by category and demographics'))
conn.commit()

In [None]:
# read both source tables back from the database for the monthly aggregation
print("Creating monthly aggregation...")
readback_queries = (
    'SELECT * FROM air_quality',
    'SELECT * FROM hospital_visits',
)
air_db, hosp_db = (pd.read_sql(sql, conn) for sql in readback_queries)

print(f"Air quality records: {len(air_db):,}")
print(f"Hospital records: {len(hosp_db):,}")

In [None]:
# convert dates to datetime; values that cannot be parsed become NaT
air_db['date_local'] = pd.to_datetime(air_db['date_local'], errors='coerce')
hosp_db['week_start'] = pd.to_datetime(hosp_db['week_start'], errors='coerce')

# Drop rows whose date failed to parse. Otherwise the period-to-string step
# below labels them with the literal month 'NaT', which would form its own
# group and be inner-joined across the two datasets as if it were a real month.
air_undated = int(air_db['date_local'].isna().sum())
hosp_undated = int(hosp_db['week_start'].isna().sum())
if air_undated or hosp_undated:
    print(f"Dropping rows with unparseable dates: {air_undated:,} air quality, {hosp_undated:,} hospital")
air_db = air_db.dropna(subset=['date_local']).copy()
hosp_db = hosp_db.dropna(subset=['week_start']).copy()

# create month columns as 'YYYY-MM' strings
air_db['month'] = air_db['date_local'].dt.to_period('M').astype(str)
hosp_db['month'] = hosp_db['week_start'].dt.to_period('M').astype(str)

print("Converted dates and created month columns")

In [None]:
# mean of arithmetic_mean for every (month, parameter) pair
air_monthly = (
    air_db
    .groupby(['month', 'parameter'], as_index=False)['arithmetic_mean']
    .mean()
    .rename(columns={'arithmetic_mean': 'avg_pollution'})
)

print(f"Air quality monthly records: {len(air_monthly):,}")
air_monthly.head()

In [None]:
# mean of percent for every (month, respiratory_category) pair
hosp_monthly = (
    hosp_db
    .groupby(['month', 'respiratory_category'])
    .agg(avg_visit_percent=('percent', 'mean'))
    .reset_index()
)

print(f"Hospital monthly records: {len(hosp_monthly):,}")
hosp_monthly.head()

In [None]:
# inner-join the two monthly frames on month: each hospital row is paired with
# every air quality row of the same month; record_count is set to 1 on all rows
merged_monthly = (
    pd.merge(hosp_monthly, air_monthly, on='month', how='inner')
    .assign(record_count=1)
)

print(f"Merged monthly records: {len(merged_monthly):,}")
print(f"Date range: {merged_monthly['month'].min()} to {merged_monthly['month'].max()}")
print(f"Respiratory categories: {merged_monthly['respiratory_category'].nunique()}")
print(f"Pollutant parameters: {merged_monthly['parameter'].nunique()}")

merged_monthly.head(10)

In [None]:
# load merged data into database
# Clear the table and append rather than using if_exists='replace': 'replace'
# drops the table and lets pandas recreate it from the DataFrame, which
# discards the id and created_at columns and the
# UNIQUE(month, respiratory_category, parameter) constraint declared in the
# CREATE TABLE statement. Deleting first keeps the cell safe to re-run.
conn.execute('DELETE FROM monthly_analysis')
merged_monthly.to_sql('monthly_analysis', conn, if_exists='append', index=False)
print(f"Loaded {len(merged_monthly):,} monthly analysis records into database")

# update metadata
# Remove any earlier row for this table before inserting, so there is exactly
# one metadata row per table whether or not the metadata table enforces
# uniqueness on table_name.
conn.execute('DELETE FROM metadata WHERE table_name = ?', ('monthly_analysis',))
conn.execute('''
    INSERT INTO metadata (table_name, record_count, last_updated, data_source, notes)
    VALUES (?, ?, ?, ?, ?)
''', ('monthly_analysis', len(merged_monthly), datetime.now().isoformat(),
      'Derived from air_quality and hospital_visits',
      'Monthly aggregated data for correlation and regression analysis'))
conn.commit()

In [None]:
# database summary
# List user tables only: names starting with "sqlite_" are SQLite's own
# bookkeeping tables (e.g. sqlite_sequence, created because AUTOINCREMENT is
# used) and would otherwise show up as data tables in the report.
cursor = conn.cursor()
cursor.execute(
    "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'"
)
tables = cursor.fetchall()

print("="*60)
print("DATABASE SUMMARY")
print("="*60)

for (table_name,) in tables:
    # Table names cannot be bound as SQL parameters, so quote the identifier
    # (doubling any embedded double quotes) before interpolating it.
    quoted_name = '"' + table_name.replace('"', '""') + '"'
    cursor.execute(f"SELECT COUNT(*) FROM {quoted_name}")
    count = cursor.fetchone()[0]
    print(f"{table_name:30} {count:>10,} records")

print("="*60)

In [None]:
# show every metadata row, most recently updated first
metadata_query = 'SELECT * FROM metadata ORDER BY last_updated DESC'
metadata_df = pd.read_sql(sql=metadata_query, con=conn)
metadata_df

In [None]:
# per (respiratory_category, parameter) pair: row count, rounded means of
# avg_pollution and avg_visit_percent, and rounded min/max of avg_pollution
query = '''
    SELECT 
        respiratory_category,
        parameter,
        COUNT(*) as month_count,
        ROUND(AVG(avg_pollution), 2) as mean_pollution,
        ROUND(AVG(avg_visit_percent), 4) as mean_visit_pct,
        ROUND(MIN(avg_pollution), 2) as min_pollution,
        ROUND(MAX(avg_pollution), 2) as max_pollution
    FROM monthly_analysis
    GROUP BY respiratory_category, parameter
    ORDER BY respiratory_category, parameter
'''

# run the query on the open connection and display the resulting frame
summary = pd.read_sql(sql=query, con=conn)
summary

In [None]:
# release the sqlite connection, then report where the database file lives
conn.close()
print(
    "Database connection closed",
    f"Database location: {os.path.abspath(db_path)}",
    sep="\n",
)