# Project 1: Air Quality Analysis for UNMC

The UNMC Water, Climate and Health working group is interested in understanding potential air quality in Nebraska, if air quality is meeting National Ambient Air Quality Standards (NAAQS: https://www.epa.gov/criteria-air-pollutants/naaqs-table ) and where potential “hotspots” exist that may be related to public health concerns. The UNMC group wants to understand how temperature, humidity, elevation and geographic location all have an impact on air quality in Nebraska.

In [3]:
import pandas as pd

In [4]:
air_data = pd.read_csv('AirQuality_Daily_StudentVersion.csv')

air_data = pd.DataFrame(air_data)

air_data

Unnamed: 0,date,monitor_index,humidity,pressure,temperature,voc,analog_input,pm2.5_alt,pm1.0_atm,pm2.5_atm,pm10.0_atm,sensor.latitude,sensor.longitude,sensor.altitude,sensor.name
0,02/23/24,195089,14.377667,912.884333,62.266667,51.998667,0.051333,0.100000,0.000000,0.002500,0.039667,40.050922,-101.533570,3005,Swnphd-Benklemen
1,02/23/24,195365,12.223600,926.403000,71.193400,64.920800,0.000000,0.180000,0.004800,0.020000,0.176000,40.200330,-100.639885,2576,Swnphd-mccook
2,02/23/24,195541,20.095750,905.670750,61.008250,68.307000,0.020000,0.162500,0.004125,0.014812,0.063937,41.128284,-101.720220,3220,Swnphd-ogallala
3,02/24/24,195089,25.368000,911.708833,51.462458,91.176750,0.052667,0.437500,0.099542,0.170667,0.355208,40.050922,-101.533570,3005,Swnphd-Benklemen
4,02/24/24,195365,23.703083,925.282125,56.818208,107.863708,0.000000,0.475000,0.099208,0.231687,0.548583,40.200330,-100.639885,2576,Swnphd-mccook
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8296,03/24/25,195373,23.636208,915.998292,61.489083,421.493750,0.020000,0.762500,0.486583,0.830021,1.416125,41.148170,-100.761800,2800,WCDHD City Building
8297,03/24/25,195379,28.043545,954.294546,66.009182,301.074800,0.000000,0.800000,5.625545,5.873364,6.056773,40.877007,-97.579445,1639,FCHD-YPS
8298,03/24/25,195383,31.451375,936.556583,55.752792,353.478000,0.050000,0.575000,0.267500,0.478375,0.818146,41.779380,-99.138040,2174,Loup Basin Public Health Department
8299,03/24/25,195385,,,,,0.010000,0.970833,0.739500,1.088604,1.340563,40.019714,-98.071840,1597,South Heartland District Health Dept. Superior...


# Task 1

Top 5 locations by mean & median (VOC, PM2.5, PM10)

top 5 locations by PM10

In [5]:
PM10_stats = air_data.groupby('sensor.name')['pm10.0_atm'].agg(['mean', 'median'])


top5_PM10 = PM10_stats.sort_values(by='median', ascending=False).head(5)

print(top5_PM10)

                                                          mean     median
sensor.name                                                              
Broken Bow                                          929.678512  43.179094
#16 - Richardson County Courthouse                  701.632446  13.305615
ELVPHD Norfolk HD 4                                  15.969341  11.474708
#18 - Southeast District Health Department- Tec...  614.227248  11.433729
ELVPHD Wisner HD 5                                   14.462116  10.935958


Top 5 locations by PM2.5

In [6]:
PM25_stats = air_data.groupby('sensor.name')['pm2.5_atm'].agg(['mean', 'median'])


top5_PM25 = PM25_stats.sort_values(by='median', ascending=False).head(5)

print(top5_PM25)

                                                          mean     median
sensor.name                                                              
Broken Bow                                          928.710593  36.050240
#16 - Richardson County Courthouse                  700.127342  11.977344
#18 - Southeast District Health Department- Tec...  613.175352  10.322875
ELVPHD Norfolk HD 4                                  13.369492   9.706229
ELVPHD Wisner HD 5                                   11.154420   8.464583


top 5 locations by voc

In [7]:

voc_stats = air_data.groupby('sensor.name')['voc'].agg(['mean', 'median'])


top5_voc = voc_stats.sort_values(by='median', ascending=False).head(5)

print(top5_voc)

                                             mean      median
sensor.name                                                  
Swnphd-ogallala                        399.434240  423.082292
Swnphd-mccook                          353.941581  381.468479
Three Rivers Public Health Department  370.216208  376.810167
FCHD-YPS                               372.462720  375.383500
ELVPHD Norfolk HD 4                    360.833744  368.580500


# Task 2

The days the maximum values occured and where they occured

In [12]:

air_data_date = air_data.groupby(['date']).agg({
    'pm2.5_atm': 'max',
    'pm10.0_atm': 'max',
    'voc': 'max'
})


air_data_date.columns = ['pm2.5_max', 'pm10_max', 'voc_max']


top5_voc_days = air_data_date.sort_values(by='voc_max', ascending=False).head(5)
print("Top 5 VOC days:\n", top5_voc_days[['voc_max']])

print("\n" + "-"*30 + "\n")


top5_pm25_days = air_data_date.sort_values(by='pm2.5_max', ascending=False).head(5)
print("Top 5 PM2.5 days:\n", top5_pm25_days[['pm2.5_max']])

print("\n" + "-"*30 + "\n")


top5_pm10_days = air_data_date.sort_values(by='pm10_max', ascending=False).head(5)
print("Top 5 PM10 days:\n", top5_pm10_days[['pm10_max']])

Top 5 VOC days:
               voc_max
date                 
06/24/24  1209.931571
10/05/24  1135.473000
11/23/24  1041.722000
10/12/24   884.649000
10/16/24   871.182000

------------------------------

Top 5 PM2.5 days:
             pm2.5_max
date                 
02/18/25  3782.823313
02/19/25  3401.999333
02/20/25  3243.746646
01/21/25  3209.817146
01/20/25  3087.149167

------------------------------

Top 5 PM10 days:
              pm10_max
date                 
02/18/25  3784.682542
02/19/25  3403.435958
02/20/25  3245.613354
01/21/25  3211.046813
01/20/25  3088.053896


In [13]:
# Find the specific row for the highest recorded PM2.5, PM10, and VOC
max_pm25 = air_data.loc[air_data['pm2.5_atm'].idxmax()]
max_pm10 = air_data.loc[air_data['pm10.0_atm'].idxmax()]
max_voc = air_data.loc[air_data['voc'].idxmax()]

print(f"Max PM2.5 occurred at {max_pm25['sensor.name']} on {max_pm25['date']}")
print(f"Max PM10 occurred at {max_pm10['sensor.name']} on {max_pm10['date']}")
print(f"Max VOC occurred at {max_voc['sensor.name']} on {max_voc['date']}")

Max PM2.5 occurred at #16 - Richardson County Courthouse on 02/18/25
Max PM10 occurred at #16 - Richardson County Courthouse on 02/18/25
Max VOC occurred at Swnphd-ogallala on 06/24/24


# Task 3

Humidity and temperature affect on Air Quality 

In [14]:

def humidity_bins(h):
    if h < 50:
        return 'Low (<50%)'
    elif 50 <= h <= 80:
        return 'High (50-80%)'
    else:
        return 'Very High (>80%)'


def temp_bins(t):
    if t < 32:
        return 'Below Freezing (<32F)'
    elif 32 <= t <= 50:
        return 'Cool (32-50F)'
    elif 51 <= t <= 70:
        return 'Warm (51-70F)'
    else:
        return 'Hot (>70F)'


air_data['humidity_cat'] = air_data['humidity'].apply(humidity_bins)
air_data['temp_cat'] = air_data['temperature'].apply(temp_bins)


print("--- Effect of Humidity on PM2.5 ---")
print(air_data.groupby('humidity_cat')['pm2.5_atm'].mean())

print("\n--- Effect of Temperature on PM2.5 ---")
print(air_data.groupby('temp_cat')['pm2.5_atm'].mean())

--- Effect of Humidity on PM2.5 ---
humidity_cat
High (50-80%)        80.874444
Low (<50%)           76.513935
Very High (>80%)    533.786399
Name: pm2.5_atm, dtype: float64

--- Effect of Temperature on PM2.5 ---
temp_cat
Below Freezing (<32F)    273.698560
Cool (32-50F)            141.658843
Hot (>70F)                72.309380
Warm (51-70F)             83.102238
Name: pm2.5_atm, dtype: float64


# Task 4 

Has there been any AQI health risks 

In [17]:

geo_cols = ['sensor.latitude', 'sensor.longitude', 'sensor.altitude']
pollutant_cols = ['pm2.5_atm', 'pm10.0_atm', 'voc']

print("--- Correlation: Geography vs. Air Quality ---")
geo_correlation = air_data[geo_cols + pollutant_cols].corr()


print(geo_correlation.loc[geo_cols, pollutant_cols])


air_data['elevation_group'] = pd.qcut(air_data['sensor.altitude'], q=3, labels=['Low Elevation', 'Mid Elevation', 'High Elevation'])

print("\n--- Average PM2.5 by Elevation Group ---")
print(air_data.groupby('elevation_group')[['pm2.5_atm', 'pm10.0_atm']].mean())

--- Correlation: Geography vs. Air Quality ---
                  pm2.5_atm  pm10.0_atm       voc
sensor.latitude   -0.067958   -0.067540 -0.062086
sensor.longitude   0.099645    0.100230 -0.142937
sensor.altitude   -0.058228   -0.058783  0.124631

--- Average PM2.5 by Elevation Group ---
                  pm2.5_atm  pm10.0_atm
elevation_group                        
Low Elevation    116.922252  118.744655
Mid Elevation     24.152354   25.482386
High Elevation   147.867249  149.067477


  print(air_data.groupby('elevation_group')[['pm2.5_atm', 'pm10.0_atm']].mean())
