In [1]:
# Import relevant packages

import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# Use read_csv() to import data

aqi = pd.read_csv('c4_epa_air_quality.csv')

In [3]:
# Explore dataframe `aqi`

print("Use head() to show a sample of data")
print(aqi.head())

print("Use describe() to summarize AQI")
print(aqi.describe(include='all'))

print("For a more thorough examination of observations by state use values_counts()")
print(aqi['state_name'].value_counts())

print('for a more')

Use head() to show a sample of data
   Unnamed: 0  date_local    state_name   county_name      city_name  \
0           0  2018-01-01       Arizona      Maricopa        Buckeye   
1           1  2018-01-01          Ohio       Belmont      Shadyside   
2           2  2018-01-01       Wyoming         Teton  Not in a city   
3           3  2018-01-01  Pennsylvania  Philadelphia   Philadelphia   
4           4  2018-01-01          Iowa          Polk     Des Moines   

                                     local_site_name   parameter_name  \
0                                            BUCKEYE  Carbon monoxide   
1                                          Shadyside  Carbon monoxide   
2  Yellowstone National Park - Old Faithful Snow ...  Carbon monoxide   
3                             North East Waste (NEW)  Carbon monoxide   
4                                          CARPENTER  Carbon monoxide   

    units_of_measure  arithmetic_mean  aqi  
0  Parts per million         0.473684    7  
1 

In [4]:
# Hypothesis 1: mean AQI in Los Angeles County is statistically different from the rest of California.

# Create dataframes for each sample being compared in the test

ca_la = aqi[aqi['county_name']=='Los Angeles']
ca_other = aqi[(aqi['state_name']=='California') & (aqi['county_name']!='Los Angeles')]

In [None]:
# H0 There is no difference in the mean AQI between Los Angeles County and the rest of California.
# H1 There is a difference in the mean AQI between Los Angeles County and the rest of California.

In [5]:
# For this analysis, the significance level is 5%

significance_level = 0.05
significance_level

0.05

In [6]:
# Comparing Means between two independant samples. Therefore two-sample t-test.

# Compute p-value here

stats.ttest_ind(a=ca_la['aqi'], b=ca_other['aqi'], equal_var=False)

Ttest_indResult(statistic=2.1107010796372014, pvalue=0.049839056842410995)

In [None]:
# With a p-value (0.049) being less than 0.05 (as significance level is 5%), 
# reject the null hypothesis in favor of the alternative hypothesis.

In [7]:
# Hypothesis 2: Does New York have a lower AQI than Ohio?

# Create dataframes for each sample being compared in the test

ny = aqi[aqi['state_name']=='New York']
ohio = aqi[aqi['state_name']=='Ohio']

In [None]:
# H0: The mean AQI of New York is greater than or equal to that of Ohio.
# H1: The mean AQI of New York is below that of Ohio.

In [8]:
# Compute p-value here

tstat, pvalue = stats.ttest_ind(a=ny['aqi'], b=ohio['aqi'], alternative='less', equal_var=False)
print(tstat)
print(pvalue)

-2.025951038880333
0.030446502691934697


In [None]:
# With a p-value of (0.03) less than 0.05 (as significance level is 5%) and a t-statistic < 0 (-2.036)
# Reject the null hypothesus in favour of the alternative hypothesis

In [9]:
# Hypothesis 3: If a new policy will affect those states with a mean AQI of 10 or greater. 
# Can we rule out Michigan from being affected by this new policy?

# Create dataframes for each sample being compared in the test


michigan = aqi[aqi['state_name']=='Michigan']

In [None]:
# H0: The mean AQI of Michigan is less than or equal to 10.
# H1: the mean AQI of Michigan is greater than 10.

In [10]:
# Compute p-value here


tstat, pvalue = stats.ttest_1samp(michigan['aqi'], 10, alternative='greater')
print(tstat)
print(pvalue)

-1.7395913343286131
0.9399405193140109


In [None]:
# With a p-value (0.940) being greater than 0.05 (as significance level is 5%) and a t-statistic < 0 (-1.74)
# fail to reject the null hypothesis