# Curation Estaciones Calidad del Aire

Estudiar para qué contaminantes no hay datos suficientes en cada estación. Se ha de comprobar que hay datos suficientes ($\geq 90\%$) durante el periodo de interés (del `1-Enero-2020` al `30-Junio-2020`). También se comprueba si hay intervalos largos de tiempo sin datos, utilizando una resolución mínima semanal, mensual y anual.

|   site   |  Pollutant   |   start_yr   |    end_yr    | hv.min  | missing.wk | missing.mnth | missing.yr |
|----------|--------------|--------------|--------------|---------|------------|--------------|------------|
| es0001a  |     no2      |  01-01-2015  |  02-01-2015  |  TRUE   |     34     |      2       |     0      |
| es0001a  |     no       |  01-01-2015  |  02-01-2015  |  TRUE   |     40     |     12       |     1      |
| es0001a  |      o3      |  01-01-2015  |  02-01-2015  |  FALSE  |      4     |      0       |     0      |

Puesto que el estudio completo incluye 67 estaciones y procesarlas todas supondría demasiado tiempo de computación, en este notebook solo se trabajará con 3 estaciones. El cálculo completo de todas las estaciones se realiza en un script de `R`.

In [1]:
# Run from the repository root so that the relative paths used below
# ("src/..." and "data/...") resolve.
setwd("~/Repositories/AirQualityCOVID/")

In [2]:
# Load the project's curation helpers. get.AQdata() and group.by.date() are
# used below but not defined in this notebook; presumably they come from this
# file (TODO confirm).
source("src/Curation/general.R")

## Funciones para filtrar datos

In [3]:
suppressMessages(library(lubridate))
suppressMessages(library(tidyverse))
suppressMessages(library(plyr))

In [4]:
# Check whether a series has enough data inside the study window.
#
# Returns TRUE only when the observations reach from `start_dt` (or earlier)
# to `end_dt` (or later) and at least 90% of the `value` entries dated inside
# the window are non-missing. Returns FALSE in every other case, including an
# empty series, a series whose dates are all NA, and a series with no rows
# inside the window.
#
# dataFrame : data.frame with a `date` column and a `value` column.
# start_dt  : first day of the window (inclusive).
# end_dt    : last day of the window (inclusive).
have.2020 <- function(dataFrame, start_dt = ymd("2020-01-01"),
                                 end_dt = ymd("2020-06-30")) {

    # An empty series cannot cover the window.
    if (nrow(dataFrame) == 0) {
        return(FALSE)
    }

    # Compare calendar dates throughout, so the coverage checks and the
    # window filter agree even when `date` is stored as a date-time.
    start_dt <- as_date(start_dt)
    end_dt <- as_date(end_dt)
    dates <- as_date(dataFrame$date)

    if (all(is.na(dates))) {
        return(FALSE)
    }

    # The series must start on/before the window and end on/after it.
    if (min(dates, na.rm = TRUE) > start_dt ||
        max(dates, na.rm = TRUE) < end_dt) {
        return(FALSE)
    }

    in.window <- !is.na(dates) & dates >= start_dt & dates <= end_dt
    values <- dataFrame$value[in.window]

    # No rows inside the window would give 0/0; report it as "not enough".
    if (length(values) == 0) {
        return(FALSE)
    }

    # Share of non-missing values inside the window.
    mean(!is.na(values)) >= 0.9
}

In [5]:
# Count how many `unit`-long periods of the window have no data.
#
# The series is aggregated to one value per period with group.by.date()
# (a helper not defined in this notebook). The number of periods with a
# non-NA aggregated value is then subtracted from the nominal number of
# periods between the rounded window limits.
#
# dataFrame : data.frame with a `date` column and a `value` column.
# unit      : "hour", "day", "week", "month" or "year".
# start_dt  : start of the window, rounded to `unit` before use.
# end_dt    : end of the window, rounded to `unit` before use.
#
# Returns the number of periods without data.
get.missing <- function(dataFrame, unit="week", 
                                   start_dt=ymd("2013-01-01"), 
                                   end_dt=ymd("2020-12-31")) {

    # Nominal length of each unit in seconds; months and years are based on
    # a 365-day year.
    day.secs <- 3600*24
    conversion <- list("hour" = 3600,
                       "day" = day.secs,
                       "week" = day.secs*7,
                       "month" = (day.secs*365)/12,
                       "year" = day.secs*365)

    grouped <- group.by.date(list("value"=dataFrame$value),
                             list("date"=dataFrame$date),
                             dataFrame, unit=unit, FUN=mean)

    window <- interval(round_date(start_dt, unit=unit),
                       round_date(end_dt, unit=unit))
    n.expected <- as.integer(window / conversion[[unit]])

    n.missing <- n.expected - sum(!is.na(grouped$value))

    # One period more than expected is treated as "nothing missing".
    if (n.missing == -1) {
        return(n.missing + 1)
    }
    n.missing
}

In [6]:
# Summarise data availability for one pollutant at one station.
#
# Loads the series with get.AQdata() (a helper not defined in this notebook)
# and builds a one-row data.frame with the first and last date with data,
# whether the study period is sufficiently covered (have.2020), and the number
# of weeks, months and years without data inside [start_dt, end_dt].
#
# pollut   : pollutant name, e.g. "no2".
# st       : station code.
# start_dt : start of the period to load and to count gaps in.
# end_dt   : end of the period to load and to count gaps in.
#
# Returns a one-row data.frame, or NULL when there is no data for the
# station/pollutant pair.
info_sitesAQ <- function(pollut, st, start_dt, end_dt) {
    dataAQPLL <- get.AQdata(site=st, pollutant=pollut,
                            start_dt=start_dt, end_dt=end_dt,
                            data.by.file=FALSE)

    # Nothing to report; NULL rows are dropped when the results are bound.
    if (is.null(dataAQPLL) || nrow(dataAQPLL) == 0) {
        return(NULL)
    }

    # Count gaps over the same window the data was requested for, instead of
    # falling back to get.missing()'s default dates.
    count.missing <- function(unit) {
        get.missing(dataAQPLL, unit=unit,
                    start_dt=start_dt, end_dt=end_dt)
    }

    data.frame(site=st, Pollutant=pollut,
               start_yr=as_date(min(dataAQPLL$date)),
               end_yr=as_date(max(dataAQPLL$date)),
               hv.min=have.2020(dataAQPLL),
               mss.wk=count.missing("week"),
               mss.mnth=count.missing("month"),
               mss.yr=count.missing("year"))
}

## Estaciones de Calidad del Aire para el estudio

In [7]:
# Station catalogue. Strings are read as factors so that the levels of `site`
# enumerate the stations.
all.sites <- read.csv("data/Curation/AirQuality/sitesAQ.csv",
                      stringsAsFactors=TRUE)

# Only the first three stations are processed in this notebook; head() does
# not pad with NA when the catalogue has fewer than three.
sites.lv <- head(levels(all.sites$site), 3)
pollutants.lv <- c("no", "no2", "o3", "pm10", "pm2.5")

## Información Relevante

```R
# Explicit-loop version of the summary table: one row per station/pollutant
# pair that has data, in the order the pairs are visited.
# Rows are collected in a preallocated list and bound once at the end,
# instead of growing the data frame on every iteration.
rows <- vector("list", length(sites.lv) * length(pollutants.lv))
k <- 0L

for (st in sites.lv) {
    for (pll in pollutants.lv) {
        k <- k + 1L
        new.row <- info_sitesAQ(pll, st,
                                ymd("2013-01-01"), ymd("2020-12-31"))
        # Assigning NULL would delete the list slot, so skip empty results.
        if (!is.null(new.row)) {
            rows[[k]] <- new.row
        }
    }
}

# Starting from an empty data frame keeps the result a data frame even when
# no pair has data.
all.info <- do.call(rbind, c(list(data.frame()), rows))
```

In [8]:
# Build the summary table: one row per station/pollutant pair with data.
# Rows are bound per station first and then across stations; rbind.fill
# drops the NULLs returned for pairs without data.
all.info <- local({
    # Rows for every pollutant of a single station.
    info.for.site <- function(st, polluts) {
        site.rows <- lapply(polluts, function(pll) {
            info_sitesAQ(pll, st, ymd("2013-01-01"), ymd("2020-12-31"))
        })
        do.call(rbind.fill, site.rows)
    }

    per.site <- lapply(sites.lv, info.for.site, pollutants.lv)
    do.call(rbind.fill, per.site)
})

In [9]:
# Preview the first rows of the summary table.
head(all.info)

Unnamed: 0_level_0,site,Pollutant,start_yr,end_yr,hv.min,mss.wk,mss.mnth,mss.yr
Unnamed: 0_level_1,<fct>,<fct>,<date>,<date>,<lgl>,<int>,<int>,<int>
1,es0041a,no2,2017-01-01,2020-12-30,True,214,48,4
2,es0041a,pm10,2017-01-01,2020-12-30,True,211,48,4
3,es0041a,pm2.5,2020-02-27,2020-12-31,False,375,85,7
4,es0110a,no,2014-01-01,2020-12-31,True,66,15,1
5,es0110a,no2,2014-01-01,2020-12-31,True,66,15,1
6,es0110a,pm10,2014-04-30,2020-12-31,True,69,15,1


### Guardar Datos en csv <a id="saveAQ"></a>

```R
# Save the summary table as CSV, without the row-name column.
write.csv(x = all.info,
          file = "data/Curation/AirQuality/info_sitesAQ.csv",
          row.names = FALSE)
```