# Filtrar Pares Estacion-Contaminante

Filtrar aquellos pares Estacion-Contaminante que cumpla con los siguientes requisitos minimos:

* Tienen al menos el > 90% de los datos en el periodo de mayor interes ```hv.min == True```

* No tienen mas de 5 anhos de datos perdidos ```mss.yr < 5```

In [1]:
setwd("~/Repositories/AirQualityCOVID/")

## Funciones Utiles

In [2]:
get.miss.stations <- function(new.df, miss.df) {
    sitesAQ <- read.csv("data/Curation/AirQuality/sitesAQ.csv")
    
    miss.site <- levels(as.factor(miss.df[!(miss.df$site %in% new.df$site), "site"]))
    
    sitesAQ[sitesAQ$site %in% miss.site, c("site", "site_name")]
}

In [3]:
count.miss.pollutant <- function(dataframe) {
    count <- c()
    
    for (pll in levels(as.factor(dataframe$Pollutant))) {
        count <- c(count, 
                   nrow(dataframe[dataframe$Pollutant == pll, ])
                  )
        names(count)[length(count)] <- pll
    }
    count
}

## Todas las estaciones de estudio

In [4]:
info.sitesAQ <- read.csv("data/Curation/AirQuality/info_sitesAQ.csv",
                         stringsAsFactor=FALSE)
info.sitesAQ <- info.sitesAQ[complete.cases(info.sitesAQ), ]
head(info.sitesAQ)

Unnamed: 0_level_0,site,Pollutant,start_yr,end_yr,hv.min,mss.wk,mss.mnth,mss.yr
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<int>,<int>,<int>
1,es0115a,no2,2013-01-01,2020-12-31,True,7,0,0
2,es0115a,no,2013-01-01,2020-12-31,True,7,0,0
3,es0110a,pm2.5,2014-04-30,2020-12-31,True,69,15,1
4,es0110a,pm10,2014-04-30,2020-12-31,True,69,15,1
5,es0110a,no2,2014-01-01,2020-12-31,True,66,15,1
6,es0110a,no,2014-01-01,2020-12-31,True,66,15,1


In [5]:
print(paste("Num Estaciones: ", 
            length(levels(as.factor(info.sitesAQ$site)))
            )
     )

print(paste("Pares (Estacion, contaminante): ", 
            nrow(info.sitesAQ)
            )
     )

[1] "Num Estaciones:  3"
[1] "Pares (Estacion, contaminante):  9"


## Datos Minimos

Tienen datos de la concentracion del contaminante en el intervalo minimo de estudio

In [6]:
sites.hvmin <- info.sitesAQ[info.sitesAQ$hv.min == TRUE &
                            !is.na(info.sitesAQ$hv.min), ]
head(sites.hvmin)

Unnamed: 0_level_0,site,Pollutant,start_yr,end_yr,hv.min,mss.wk,mss.mnth,mss.yr
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<int>,<int>,<int>
1,es0115a,no2,2013-01-01,2020-12-31,True,7,0,0
2,es0115a,no,2013-01-01,2020-12-31,True,7,0,0
3,es0110a,pm2.5,2014-04-30,2020-12-31,True,69,15,1
4,es0110a,pm10,2014-04-30,2020-12-31,True,69,15,1
5,es0110a,no2,2014-01-01,2020-12-31,True,66,15,1
6,es0110a,no,2014-01-01,2020-12-31,True,66,15,1


In [7]:
print(paste("Num Estaciones: ", 
            length(levels(as.factor(sites.hvmin$site)))
            )
     )

print(paste("Pares (Estacion, contaminante): ", 
            nrow(sites.hvmin)
            )
     )

[1] "Num Estaciones:  3"
[1] "Pares (Estacion, contaminante):  8"


Se muestran los valores eliminados 

In [8]:
no.sites.hvmin <- info.sitesAQ[!info.sitesAQ$hv.min, ]

stations <- get.miss.stations(sites.hvmin, no.sites.hvmin)
pollut <- count.miss.pollutant(no.sites.hvmin)

print("Estaciones Perdidas: ")
for (i in 1:nrow(stations)) {
    print(paste("    ", 
                stations[i, "site"], ": ",
                stations[i, "site_name"], sep=""))
}


for (i in 1:length(pollut)) {
    print(paste(names(pollut[i]), pollut[[i]], sep=": "))
}

[1] "Estaciones Perdidas: "
[1] "    NA: NA"
[1] "    : "
[1] "pm2.5: 1"


# No Falta > 5 Anhos

In [9]:
sites.hvyr <- sites.hvmin[sites.hvmin$mss.yr < 5, ]
head(sites.hvyr)

Unnamed: 0_level_0,site,Pollutant,start_yr,end_yr,hv.min,mss.wk,mss.mnth,mss.yr
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<lgl>,<int>,<int>,<int>
1,es0115a,no2,2013-01-01,2020-12-31,True,7,0,0
2,es0115a,no,2013-01-01,2020-12-31,True,7,0,0
3,es0110a,pm2.5,2014-04-30,2020-12-31,True,69,15,1
4,es0110a,pm10,2014-04-30,2020-12-31,True,69,15,1
5,es0110a,no2,2014-01-01,2020-12-31,True,66,15,1
6,es0110a,no,2014-01-01,2020-12-31,True,66,15,1


In [10]:
print(paste("Num Estaciones: ", 
            length(levels(as.factor(sites.hvyr$site)))
            )
     )

print(paste("Pares (Estacion, contaminante): ", 
            nrow(sites.hvyr)
            )
     )

[1] "Num Estaciones:  3"
[1] "Pares (Estacion, contaminante):  8"


In [11]:
no.sites.hvyr <- sites.hvmin[sites.hvmin$mss.yr >= 5, ]

stations <- get.miss.stations(sites.hvyr, no.sites.hvyr)
pollut <- count.miss.pollutant(no.sites.hvyr)

print("Estaciones Perdidas: ")
for (i in 1:nrow(stations)) {
    print(paste("    ", 
                stations[i, "site"], ": ",
                stations[i, "site_name"], sep=""))
}


for (i in 1:length(pollut)) {
    print(paste(names(pollut[i]), pollut[[i]], sep=": "))
}

[1] "Estaciones Perdidas: "
[1] "    NA: NA"
[1] "    : "
character(0)
character(0)


### Guardar Datos en csv <a id="saveAQ"></a>

In [12]:
write.csv(no.sites.hvyr, 
          "data/Curation/AirQuality/Checked-sitesAQ.csv", row.names=FALSE)