# Cross-Validation of `lm` model

Make a cross-validation with a "k-fold" using natural years

In [1]:
# Load packages
suppressMessages(library(lubridate))
suppressMessages(library(tidyverse))
suppressMessages(library(openair))

suppressMessages(library(caret))

In [2]:
suppressMessages(library(repr))
options(repr.plot.width=25,
        repr.plot.height=10,
        #repr.plot.pointsize=50,
        repr.plot.family='serif'
       )

In [3]:
# Working directory
setwd("~/Repositories/AirQualityCOVID")

## Main Variables

In [19]:
sites.lv <- c("es0118a", "es1438a") # Big cities (Madrid and Barcelona)",
sites.lv <- c(sites.lv, "es1580a", "es1340a") # small cities (Santander and Huelva)
variables.lv <- c("no2", "no")

## DataFrame Creation

In [20]:
# Create dataSet
load("data/data_AQ.rda")
load("data/meteorology.rda")

aq <- data_AQ[data_AQ$site %in% sites.lv, ] %>%
            filter(variable %in% variables.lv) %>%
            timeAverage(avg.time = "day", type=c("site", "variable"))
aq$date <- as_date(aq$date)

mto <- data_Mto[data_Mto$site %in% sites.lv, ]

df <- merge(aq, mto,
              by = c("date", "site"), all.x=T) %>%
        drop_na() %>%
        mutate_if(is.factor, as.character) %>%
        select(-date_end, -process, -summary, -validity)

rm(data_AQ)
rm(data_Mto)

In [21]:
to.predict <- df[df$date > ymd("2020-01-01"),]
data.df <- df[df$date < ymd("2020-01-01"),] 

data.df[, -c(1:3)] <- data.df[, -c(1:3)]
to.predict[, -c(1:3)] <- to.predict[, -c(1:3)]

In [22]:
ds.data <- cbind(data.df[, 1:3],
                 apply(data.df[, -c(1:3)], 2, function(cl) {
                    dcomp <- decompose(ts(cl, frequency = 365))

                    cl - dcomp$seasonal
                }))
to.predict <- cbind(to.predict[, 1:3],
                 apply(to.predict[, -c(1:3)], 2, function(cl) {
                    dcomp <- decompose(ts(cl, frequency = 365))

                    cl - dcomp$seasonal
                }))

## Cross-Validation

In [23]:
head(dat[, -1:-3])

Unnamed: 0_level_0,value,ws,wd,atmos_pres,tmed,prec,tmin,tmax,presMax,presMin,RH,solar.radiation
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
4,35.85547,0.76052968,224.6837,1022.813,6.495256,-0.8585232,3.4965752,9.488465,940.3305,934.3013,79.52509,63.11348
12,98.38102,1.18413152,366.2126,1027.909,5.675967,-0.1782542,0.874478,10.370771,951.6835,947.2985,78.23889,64.68498
25,75.93004,0.84894509,102.0624,1036.546,6.378651,0.2624259,2.4498563,10.301958,956.5072,947.3278,68.60989,80.91984
42,174.72897,0.02250207,254.5703,1034.727,4.578589,0.3000775,-0.922629,10.083811,959.1826,955.4224,72.72819,109.69441
53,112.03875,0.41265013,181.8315,1030.745,5.392128,0.2813349,0.7660696,10.005517,952.1379,948.7376,79.10066,115.32039
61,37.78607,0.52790937,296.5086,1030.696,3.066759,0.2042295,1.2848465,4.747921,945.5831,943.6639,84.16489,110.22065


In [25]:
years <- c(2013, 2014, 2015, 2016,
           2017, 2018, 2019)

cv.df <- data.frame()

for (st in sites.lv) {
    data.st <- ds.data[ds.data$site == st,]
    
    for (pll in levels(as.factor(data.st$variable))) {
        dat <- data.st[data.st$variable == pll, ]
        
        for (yr in years) {
            test <- which(year(dat$date) == yr)
            train <- which(year(dat$date) != yr)

            model <- train(value ~., dat[, -1:-3], subset = train,
                           preProcess = c("center", "scale"),
                           method = "glm", family=Gamma
                          )

            y.th <- predict(model, newdata=dat[test,])

            cv.df <- rbind(cv.df,
                           data.frame("Test Year"=yr,
                                      "site"=st,
                                      "variable"=pll,
                                      "Bias"=mean(abs(y.th - dat[test,]$value),
                                                 na.rm=T),
                                      "var"=var(y.th, dat[test,]$value,
                                               na.rm=T),
                                      "Rsq"=cor(y.th, dat[test,]$value)^2
                                     ))
        }
    }
}

“model fit failed for Resample01: parameter=none Error in eval(family$initialize) : 
  non-positive values not allowed for the 'Gamma' family
”
“model fit failed for Resample02: parameter=none Error in eval(family$initialize) : 
  non-positive values not allowed for the 'Gamma' family
”
“model fit failed for Resample03: parameter=none Error in eval(family$initialize) : 
  non-positive values not allowed for the 'Gamma' family
”
“model fit failed for Resample04: parameter=none Error in eval(family$initialize) : 
  non-positive values not allowed for the 'Gamma' family
”
“model fit failed for Resample05: parameter=none Error in eval(family$initialize) : 
  non-positive values not allowed for the 'Gamma' family
”
“model fit failed for Resample06: parameter=none Error in eval(family$initialize) : 
  non-positive values not allowed for the 'Gamma' family
”
“model fit failed for Resample07: parameter=none Error in eval(family$initialize) : 
  non-positive values not allowed for the 'Gamma' f

Something is wrong; all the RMSE metric values are missing:
      RMSE        Rsquared        MAE     
 Min.   : NA   Min.   : NA   Min.   : NA  
 1st Qu.: NA   1st Qu.: NA   1st Qu.: NA  
 Median : NA   Median : NA   Median : NA  
 Mean   :NaN   Mean   :NaN   Mean   :NaN  
 3rd Qu.: NA   3rd Qu.: NA   3rd Qu.: NA  
 Max.   : NA   Max.   : NA   Max.   : NA  
 NA's   :1     NA's   :1     NA's   :1    


ERROR: Error: Stopping


In [None]:
mean.cv.df <- cv.df %>%
                group_by(site, variable) %>%
                summarise(Bias=mean(Bias, na.rm=T),
                          var=mean(var, na.rm=T),
                          "Rsq"=mean(Rsq, na.rm=T),
                         ) %>%
                pivot_longer(cols = c(3:5), 
                             names_to = "Error", values_to = "Err.Val")

In [None]:
a <- cv.df %>%
        pivot_longer(cols = c(4:6), 
                     names_to = "Error", values_to = "Err.Val")

ggplot(data=a, aes(x=Error, y=Err.Val, fill=Error)) +
        geom_boxplot() + 
        facet_wrap(~variable, scales = "free_y")

## Predict

In [None]:
dataframe <- data.frame()

for (st in sites.lv) {
    data.st <- ds.data[ds.data$site == st,]
    
    for (pll in levels(as.factor(data.st$variable))) {
        dat <- data.st[data.st$variable == pll, ]
        pred <- to.predict[to.predict$site == st &
                           to.predict$variable == pll, ]

        model <- lm(value ~., data=dat[, -c(1:3)])
        y.test <- predict(model, newdata=pred[-1:-4])

        dataframe <- rbind(dataframe,
                           data.frame(site=rep(st,nrow(pred)),
                                      variable=rep(pll, nrow(pred)),
                                      date=pred$date,
                                      y.test=y.test,
                                      y.raw=pred$value
                                     ))
        rm(model)
        rm(y.test)
    }
}

In [None]:
ggplot(dataframe, aes(x=date)) +
    geom_line(aes(y=y.raw), color="blue") +
    geom_line(aes(y=y.test), color="red") +
    facet_wrap(.~variable+site, scale="free_y", ncol=4)