In [1]:
import os
import pandas as pd
import csv
import sys

In [2]:
# Path to the daily air-quality CSV (an EPA AirData export for LA, judging by
# the file name).
file_to_load = "data/daily_la_airdata_epa.csv"

# Load the whole CSV into a DataFrame using pandas' default parsing options.
pollution_data = pd.read_csv(filepath_or_buffer=file_to_load)

# Sanity check: preview the first 10 rows to confirm the file was read and to
# see which columns it provides.
pollution_data.head(n=10)

Unnamed: 0,Date,AQI Value,Main Pollutant,Site Name,Site ID,Source
0,01/01/2020,102,PM2.5,Long Beach-Route 710 Near Road,06-037-4008,AQS
1,01/02/2020,72,PM2.5,Compton,06-037-1302,AQS
2,01/03/2020,87,PM2.5,Long Beach-Route 710 Near Road,06-037-4008,AQS
3,01/04/2020,104,PM2.5,Compton,06-037-1302,AQS
4,01/05/2020,103,PM2.5,Compton,06-037-1302,AQS
5,01/06/2020,72,PM2.5,Long Beach-Route 710 Near Road,06-037-4008,AQS
6,01/07/2020,49,NO2,Long Beach-Route 710 Near Road,06-037-4008,AQS
7,01/08/2020,59,PM2.5,Long Beach-Route 710 Near Road,06-037-4008,AQS
8,01/09/2020,49,PM2.5,Long Beach-Route 710 Near Road,06-037-4008,AQS
9,01/10/2020,66,PM2.5,Compton,06-037-1302,AQS


In [60]:
# Path to the Los Angeles COVID-19 case-count CSV.
anrgusc = "data/anrgusc-lacounty-convid19/Covid-19.csv"

# Load the COVID case file into a DataFrame; the file is decoded as UTF-16.
la_covid_cases_data = pd.read_csv(filepath_or_buffer=anrgusc, encoding="utf-16")

# Sanity check: preview the first 5 rows to confirm the file was read and to
# see which columns it provides.
la_covid_cases_data.head(n=5)


Unnamed: 0,Time Stamp,Region,Latitude,Longitude,Number of cases
0,01-1-2021,Acton,34.480742,-118.186838,271
1,01-1-2021,Adams-Normandie,34.031788,-118.300247,766
2,01-1-2021,Agoura Hills,34.14791,-118.765704,593
3,01-1-2021,Alhambra,34.093042,-118.12706,4241
4,01-1-2021,Alsace,33.988,-118.34762,1016


In [37]:
# Rename the "Time Stamp" column to "Date" so the COVID frame shares a column
# name with the pollution frame ahead of merging.
# NOTE(review): only the label changes here. The values shown are still text
# such as "01-1-2021", while the pollution frame shows "01/01/2020", so the two
# Date columns will need a common format before a merge can match rows.
covid_premerge = la_covid_cases_data.rename({"Time Stamp": "Date"}, axis="columns")
covid_premerge.head(n=10)


Unnamed: 0,Date,Region,Latitude,Longitude,Number of cases
0,01-1-2021,Acton,34.480742,-118.186838,271
1,01-1-2021,Adams-Normandie,34.031788,-118.300247,766
2,01-1-2021,Agoura Hills,34.14791,-118.765704,593
3,01-1-2021,Alhambra,34.093042,-118.12706,4241
4,01-1-2021,Alsace,33.988,-118.34762,1016
5,01-1-2021,Altadena,34.186316,-118.135233,2080
6,01-1-2021,Angeles National Forest,34.321655,-118.019201,4
7,01-1-2021,Angelino Heights,34.070289,-118.254796,178
8,01-1-2021,Arcadia,34.136208,-118.04015,1658
9,01-1-2021,Arleta,34.241327,-118.432205,4492


In [38]:
# Narrow the COVID frame to the two columns used for the per-date aggregation:
# the date and the case count.
date_and_case_columns = ["Date", "Number of cases"]
covid_premerge1 = covid_premerge[date_and_case_columns]
covid_premerge1

Unnamed: 0,Date,Number of cases
0,01-1-2021,271
1,01-1-2021,766
2,01-1-2021,593
3,01-1-2021,4241
4,01-1-2021,1016
...,...,...
86206,12-9-2020,432
86207,12-9-2020,2984
86208,12-9-2020,1879
86209,12-9-2020,2546


In [61]:
# Total case count per date: add up "Number of cases" over every row that
# shares the same Date value.
# The Date values are text, so the groups come out in lexicographic order
# ("01-1-2021", "01-10-2021", "01-11-2021", ...), not chronological order.
sum_cases = covid_premerge1.groupby(by='Date')['Number of cases'].sum()

# Same totals as a one-column DataFrame indexed by Date.
sum_cases_df = sum_cases.to_frame()
sum_cases_df

Unnamed: 0_level_0,Number of cases
Date,Unnamed: 1_level_1
01-1-2021,704475
01-10-2021,818534
01-11-2021,829517
01-12-2021,840292
01-13-2021,851686
...,...
12-5-2020,394627
12-6-2020,403418
12-7-2020,410960
12-8-2020,418614


In [50]:
# Group the COVID rows by Date. Despite the name, this holds the GroupBy
# object itself, not aggregated totals.
daily_totals_covid_premerge = covid_premerge1.groupby(by='Date')

# Show how many non-null "Number of cases" entries each date has. These are
# row counts per date, not case totals.
daily_totals_covid_premerge.count()


Unnamed: 0_level_0,Number of cases
Date,Unnamed: 1_level_1
01-1-2021,238
01-10-2021,238
01-11-2021,237
01-12-2021,237
01-13-2021,238
...,...
12-5-2020,238
12-6-2020,238
12-7-2020,238
12-8-2020,238


In [4]:
# Tally how many rows of the pollution frame come from each monitoring site;
# value_counts lists the most frequent site first.
site_names = pollution_data["Site Name"]
sites = site_names.value_counts()
sites

Long Beach-Route 710 Near Road    101
Glendora                           75
Santa Clarita                      50
Reseda                             37
Pasadena                           35
North Hollywood (NOHO)             34
Compton                            25
Pomona                             21
Los Angeles-North Main Street      20
Long Beach (South)                 15
Lancaster-Division Street          10
Mission Viejo                       9
Anaheim                             6
West Los Angeles                    2
Signal Hill (LBSH)                  2
Pico Rivera #2                      2
Azusa                               1
Long Beach (Hudson)                 1
Name: Site Name, dtype: int64

In [5]:
# Inspect the dtype pandas inferred for each column of the pollution frame.
# Per the output below: Date is `object` (it was not parsed as a datetime), and
# the AQI column's label carries a leading space (' AQI Value'), which is why
# the sort cell further down selects it with that exact label.
pollution_data.dtypes

Date              object
 AQI Value         int64
Main Pollutant    object
Site Name         object
Site ID           object
Source            object
dtype: object

In [24]:
# Order the readings from the highest AQI to the lowest.
# The label ' AQI Value' keeps its leading space because that is the exact
# column name in the loaded frame.
descending_aqi = pollution_data.sort_values(' AQI Value', ascending=False)
descending_aqi

Unnamed: 0,Date,AQI Value,Main Pollutant,Site Name,Site ID,Source
248,09/05/2020,235,Ozone,Glendora,06-037-0016,AQS
186,07/05/2020,225,PM2.5,Los Angeles-North Main Street,06-037-1103,AQS
230,08/18/2020,222,Ozone,Glendora,06-037-0016,AQS
247,09/04/2020,220,Ozone,Glendora,06-037-0016,AQS
227,08/15/2020,220,Ozone,Glendora,06-037-0016,AQS
...,...,...,...,...,...,...
385,01/20/2021,38,PM2.5,Los Angeles-North Main Street,06-037-1103,AirNow
157,06/06/2020,38,PM2.5,North Hollywood (NOHO),06-037-4010,AQS
95,04/05/2020,37,Ozone,Pomona,06-037-1701,AQS
99,04/09/2020,35,Ozone,Reseda,06-037-1201,AQS


In [23]:
# Keep only the readings whose AQI is greater than or equal to 150.
# The AQI column is selected by its label instead of by position (it was
# `iloc[:, 1]`), so the filter cannot silently pick a different column if the
# frame's column order changes, and it matches how the sort cell refers to the
# column. The label really does start with a space (see `pollution_data.dtypes`).
# NOTE(review): the name says "unhealthy" but the cutoff includes 150 itself;
# confirm the intended boundary against the EPA AQI category ranges.
unhealthy_aqi = descending_aqi[descending_aqi[' AQI Value'] >= 150]
unhealthy_aqi

Unnamed: 0,Date,AQI Value,Main Pollutant,Site Name,Site ID,Source
248,09/05/2020,235,Ozone,Glendora,06-037-0016,AQS
186,07/05/2020,225,PM2.5,Los Angeles-North Main Street,06-037-1103,AQS
230,08/18/2020,222,Ozone,Glendora,06-037-0016,AQS
247,09/04/2020,220,Ozone,Glendora,06-037-0016,AQS
227,08/15/2020,220,Ozone,Glendora,06-037-0016,AQS
...,...,...,...,...,...,...
270,09/27/2020,154,Ozone,Santa Clarita,06-037-6012,AQS
173,06/22/2020,154,Ozone,Santa Clarita,06-037-6012,AQS
269,09/26/2020,151,Ozone,Reseda,06-037-1201,AQS
278,10/05/2020,151,Ozone,Reseda,06-037-1201,AQS
