In [21]:
# Import relevant Python libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm


## Load data

In [22]:
# Read the air-quality CSV, using the file's first column as the row index.
data = pd.read_csv(
    filepath_or_buffer='modified_c4_epa_air_quality.csv',
    index_col=0,
)
# Preview the first five rows to confirm the load worked.
data.head(5)

Unnamed: 0_level_0,state_name,county_name,city_name,local_site_name,parameter_name,units_of_measure,aqi_log
date_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-01,Arizona,Maricopa,Buckeye,BUCKEYE,Carbon monoxide,Parts per million,2.079442
2018-01-01,Ohio,Belmont,Shadyside,Shadyside,Carbon monoxide,Parts per million,1.791759
2018-01-01,Wyoming,Teton,Not in a city,Yellowstone National Park - Old Faithful Snow ...,Carbon monoxide,Parts per million,1.098612
2018-01-01,Pennsylvania,Philadelphia,Philadelphia,North East Waste (NEW),Carbon monoxide,Parts per million,1.386294
2018-01-01,Iowa,Polk,Des Moines,CARPENTER,Carbon monoxide,Parts per million,1.386294


## Data exploration

In [23]:
# Print a concise summary of the frame: index range, per-column non-null counts and dtypes.
# Useful here for spotting columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 260 entries, 2018-01-01 to 2018-01-01
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   state_name        260 non-null    object 
 1   county_name       260 non-null    object 
 2   city_name         260 non-null    object 
 3   local_site_name   257 non-null    object 
 4   parameter_name    260 non-null    object 
 5   units_of_measure  260 non-null    object 
 6   aqi_log           260 non-null    float64
dtypes: float64(1), object(6)
memory usage: 16.2+ KB


In [33]:
# Report the frame's dimensions as (rows, columns).
# NOTE(review): this cell's execution count (In [33]) is later than the cells that follow it,
# and its recorded output shows 8 columns while data.info() above reports 7 -- it was
# presumably run after the `Z_score` column was added further down. Re-run the notebook
# top to bottom (Restart & Run All) to confirm the value at this point in the narrative.
data.shape

(260, 8)

In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric column(s).
data.describe()

Unnamed: 0,aqi_log
count,260.0
mean,1.766921
std,0.714716
min,0.0
25%,1.098612
50%,1.791759
75%,2.302585
max,3.931826


- The count value for the `aqi_log` column is 260. This means there are 260 `aqi_log` measurements represented in this dataset.

- The 25th percentile for the `aqi_log` column is about 1.10. This means that 25% of the `aqi_log` values in the data are below 1.10.

- The 75th percentile for the `aqi_log` column is about 2.30. This means that 75% of the `aqi_log` values in the data are below 2.30.

In [25]:
# Summarize the `state_name` column: total count, number of distinct values,
# the most frequent value and how often it occurs.
state_names = data['state_name']
state_names.describe()

count            260
unique            52
top       California
freq              66
Name: state_name, dtype: object

- There are 260 state values, and 52 of them are unique. California is the most commonly occurring state in the data, with a frequency of 66.

In [26]:
# Mean of the `aqi_log` column; kept in a variable so later cells can reuse it.
aqi_log_mean = data.loc[:, 'aqi_log'].mean()
aqi_log_mean

1.7669210929985582

In [27]:
# Standard deviation of the `aqi_log` column.
# ddof=1 is the pandas default (sample standard deviation); spelled out for clarity.
aqi_log_std = data.loc[:, 'aqi_log'].std(ddof=1)
aqi_log_std

0.7147155520223721

In [28]:
# Fraction of `aqi_log` values lying within 1 standard deviation of the mean
# (bounds inclusive). For normally distributed data roughly 68% would be expected.
half_width = 1 * aqi_log_std
lower_limit = aqi_log_mean - half_width
upper_limit = aqi_log_mean + half_width
# Series.between is inclusive on both ends by default; the mean of the boolean
# mask is the proportion of True values.
data['aqi_log'].between(lower_limit, upper_limit).mean()

0.7615384615384615

In [29]:
# Fraction of `aqi_log` values lying within 2 standard deviations of the mean
# (bounds inclusive). For normally distributed data roughly 95% would be expected.
half_width = 2 * aqi_log_std
lower_limit = aqi_log_mean - half_width
upper_limit = aqi_log_mean + half_width
# Series.between is inclusive on both ends by default; the mean of the boolean
# mask is the proportion of True values.
data['aqi_log'].between(lower_limit, upper_limit).mean()

0.9576923076923077

In [30]:
# Fraction of `aqi_log` values lying within 3 standard deviations of the mean
# (bounds inclusive). For normally distributed data roughly 99.7% would be expected.
half_width = 3 * aqi_log_std
lower_limit = aqi_log_mean - half_width
upper_limit = aqi_log_mean + half_width
# Series.between is inclusive on both ends by default; the mean of the boolean
# mask is the proportion of True values.
data['aqi_log'].between(lower_limit, upper_limit).mean()

0.9961538461538462

In [31]:
# Add a `Z_score` column holding the standardized `aqi_log` value for each row.
# ddof=0 is scipy's default (population standard deviation); spelled out for clarity.
# NOTE(review): `aqi_log_std` is computed with pandas `.std()`, which defaults to
# ddof=1, so these z-scores are not exactly (aqi_log - aqi_log_mean) / aqi_log_std.
# Confirm which convention is intended.
aqi_log_values = data['aqi_log']
data['Z_score'] = stats.zscore(aqi_log_values, ddof=0)
# Display the frame with the new column (this mutates `data` in place; later
# cells read `data['Z_score']`).
data

Unnamed: 0_level_0,state_name,county_name,city_name,local_site_name,parameter_name,units_of_measure,aqi_log,Z_score
date_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-01,Arizona,Maricopa,Buckeye,BUCKEYE,Carbon monoxide,Parts per million,2.079442,0.438109
2018-01-01,Ohio,Belmont,Shadyside,Shadyside,Carbon monoxide,Parts per million,1.791759,0.034820
2018-01-01,Wyoming,Teton,Not in a city,Yellowstone National Park - Old Faithful Snow ...,Carbon monoxide,Parts per million,1.098612,-0.936873
2018-01-01,Pennsylvania,Philadelphia,Philadelphia,North East Waste (NEW),Carbon monoxide,Parts per million,1.386294,-0.533584
2018-01-01,Iowa,Polk,Des Moines,CARPENTER,Carbon monoxide,Parts per million,1.386294,-0.533584
...,...,...,...,...,...,...,...,...
2018-01-01,District Of Columbia,District of Columbia,Washington,Near Road,Carbon monoxide,Parts per million,1.386294,-0.533584
2018-01-01,Wisconsin,Dodge,Kekoskee,HORICON WILDLIFE AREA,Carbon monoxide,Parts per million,1.098612,-0.936873
2018-01-01,Kentucky,Jefferson,Louisville,CANNONS LANE,Carbon monoxide,Parts per million,1.098612,-0.936873
2018-01-01,Nebraska,Douglas,Omaha,,Carbon monoxide,Parts per million,2.302585,0.750924


In [32]:
# Show rows whose z-score lies more than 3 standard deviations from the mean
# in either direction (strict inequality on both sides).
z_scores = data['Z_score']
is_outlier = (z_scores < -3) | (z_scores > 3)
data[is_outlier]

Unnamed: 0_level_0,state_name,county_name,city_name,local_site_name,parameter_name,units_of_measure,aqi_log,Z_score
date_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-01,Arizona,Maricopa,Phoenix,WEST PHOENIX,Carbon monoxide,Parts per million,3.931826,3.034886
