In [None]:
# Load the renv project found at the project root returned by here::here(),
# so that the packages attached below come from the project library.
renv::load(here::here())

In [None]:
library(dplyr)
library(magrittr)
library(readr)
library(tidyr)
library(purrr)
library(stringr)
library(here)
library(scales)
library(psych)
library(caret)
library(lubridate)
library(padr)
library(tsibble)
library(feasts)
library(fabletools)
library(kableExtra)
library(ggpubr)
# These themes just look great :-)
# Download from devtools::install_github('cttobin/ggthemr')
library(ggthemr)
library(viridis)

# Use the "fresh" ggthemr theme for the plots created below.
ggthemr("fresh")

# Load the package code from the current working directory; the helper
# functions used below (e.g. import_data_cosmo) are not defined in this
# notebook -- presumably they come from this package, TODO confirm.
devtools::load_all(".")

# Several attached packages export functions with the same name. Declare
# explicitly which package should win for each of them.
library(conflicted)
conflict_prefer(name = "select", winner = "dplyr")
conflict_prefer(name = "filter", winner = "dplyr")
conflict_prefer(name = "lag", winner = "dplyr")
conflict_prefer(name = "rescale", winner = "scales")
conflict_prefer(name = "here", winner = "here")


In [None]:
# Restore the objects stored in the two .RData files below the project root
# (the `stations` object used further down presumably comes from here).
load(file = paste0(here::here(), "/data/other/species.RData"))
load(file = paste0(here::here(), "/data/other/stations.RData"))

In [None]:
# Output directory of the Cosmo-1e run with assimilation of real-time
# pollen data
assim_path <- "/scratch/sadamov/wd/21_alnu_tuning_v4/"
# Output directory of the Cosmo-1e baseline run
cosmo_path <- "/scratch/sadamov/wd/21_alnu_tuning_v3/"

# Baseline run: one combined text file per model field, tagged "Cosmo"
data_cosmo_hourly <- paste0(cosmo_path, "mod_pollen_combined.txt") %>%
  import_data_cosmo(type = "Cosmo")
sdes_cosmo_hourly <- paste0(cosmo_path, "mod_sdes_combined.txt") %>%
  import_data_cosmo(type = "Cosmo")
tthrs_cosmo_hourly <- paste0(cosmo_path, "mod_tthrs_combined.txt") %>%
  import_data_cosmo(type = "Cosmo")
tthre_cosmo_hourly <- paste0(cosmo_path, "mod_tthre_combined.txt") %>%
  import_data_cosmo(type = "Cosmo")
tune_cosmo_hourly <- paste0(cosmo_path, "mod_tune_combined.txt") %>%
  import_data_cosmo(type = "Cosmo")

# Assimilation run: the same set of files, tagged "Assim"
data_assim_hourly <- paste0(assim_path, "mod_pollen_combined.txt") %>%
  import_data_cosmo(type = "Assim")
sdes_assim_hourly <- paste0(assim_path, "mod_sdes_combined.txt") %>%
  import_data_cosmo(type = "Assim")
tthrs_assim_hourly <- paste0(assim_path, "mod_tthrs_combined.txt") %>%
  import_data_cosmo(type = "Assim")
tthre_assim_hourly <- paste0(assim_path, "mod_tthre_combined.txt") %>%
  import_data_cosmo(type = "Assim")
tune_assim_hourly <- paste0(assim_path, "mod_tune_combined.txt") %>%
  import_data_cosmo(type = "Assim")

# Measured aggregates from the DWH for the 2020 season, restricted to the
# taxa that are present in the baseline model output
dwh_path <- paste0(here::here(), "/data/dwh/pollen_dwh_hourly.txt")
data_dwh_hourly <- filter(
  import_data_dwh(dwh_path),
  taxon %in% unique(data_cosmo_hourly$taxon)
)

In [None]:
# Aggregate the hourly baseline fields with the default settings of
# aggregate_pollen().
data_cosmo_daily <- data_cosmo_hourly %>% aggregate_pollen()
sdes_cosmo_daily <- sdes_cosmo_hourly %>% aggregate_pollen()
tthrs_cosmo_daily <- tthrs_cosmo_hourly %>% aggregate_pollen()
tthre_cosmo_daily <- tthre_cosmo_hourly %>% aggregate_pollen()
tune_cosmo_daily <- tune_cosmo_hourly %>% aggregate_pollen()

# Aggregate the hourly assimilation fields.
# NOTE(review): these are aggregated with level = "12h" although the results
# are named *_daily and are later combined with the default aggregates --
# confirm that this is intended.
data_assim_daily <- data_assim_hourly %>% aggregate_pollen(level = "12h")
sdes_assim_daily <- sdes_assim_hourly %>% aggregate_pollen(level = "12h")
tthrs_assim_daily <- tthrs_assim_hourly %>% aggregate_pollen(level = "12h")
tthre_assim_daily <- tthre_assim_hourly %>% aggregate_pollen(level = "12h")
tune_assim_daily <- tune_assim_hourly %>% aggregate_pollen(level = "12h")

# Aggregate the hourly measurements with the default settings.
data_dwh_daily <- data_dwh_hourly %>% aggregate_pollen()

In [None]:
# Line plot of one model field at the station "Zürich", comparing the
# baseline and the assimilation run.
#   baseline, assimilated: data frames providing the columns station, date,
#                          value and type
#   plot_title:            title shown above the panel
# Returns a ggplot object.
plot_zurich_series <- function(baseline, assimilated, plot_title) {
  zurich <- bind_rows(baseline, assimilated) %>%
    filter(station == "Zürich")

  ggplot(zurich, aes(x = date, y = value, col = type)) +
    geom_line() +
    scale_colour_manual("", values = swatch()[c(2, 3)]) +
    ggtitle(plot_title)
}

gg_data <- plot_zurich_series(
  data_cosmo_daily, data_assim_daily,
  "Daily average pollen concentrations"
)

gg_sdes <- plot_zurich_series(
  sdes_cosmo_daily, sdes_assim_daily,
  "Daily average season description"
)

gg_tthrs <- plot_zurich_series(
  tthrs_cosmo_daily, tthrs_assim_daily,
  "Daily average temperature sum threshold - start"
)

gg_tthre <- plot_zurich_series(
  tthre_cosmo_daily, tthre_assim_daily,
  "Daily average temperature sum threshold - end"
)

gg_tune <- plot_zurich_series(
  tune_cosmo_daily, tune_assim_daily,
  "Daily average tuning factor"
)

# Stack the five panels in a single column.
ggarrange(gg_data, gg_sdes, gg_tthrs, gg_tthre, gg_tune, ncol = 1)

In [None]:
# Taxa present in the assimilation run; also used by the cells further down.
taxa_selected <- unique(data_assim_hourly$taxon)

# Measurements and baseline are imputed over the date range and taxa of the
# assimilation run, so that the three sources cover the same period.
data_dwh_hourly_imp <- impute_pollen(
  data_dwh_hourly,
  min_date = min(data_assim_hourly$datetime),
  max_date = max(data_assim_hourly$datetime),
  taxon = taxa_selected
)
data_cosmo_hourly_imp <- impute_pollen(
  data_cosmo_hourly,
  min_date = min(data_assim_hourly$datetime),
  max_date = max(data_assim_hourly$datetime),
  taxon = taxa_selected
)
data_dwh_daily_imp <- impute_pollen(
  data_dwh_daily,
  min_date = min(data_assim_daily$datetime),
  max_date = max(data_assim_daily$datetime),
  taxon = unique(data_assim_daily$taxon)
)
data_cosmo_daily_imp <- impute_pollen(
  data_cosmo_daily,
  min_date = min(data_assim_daily$datetime),
  max_date = max(data_assim_daily$datetime),
  taxon = unique(data_assim_daily$taxon)
)

# The assimilation run itself is imputed with the defaults of impute_pollen().
data_assim_hourly_imp <- impute_pollen(data_assim_hourly)
data_assim_daily_imp <- impute_pollen(data_assim_daily)

# Stack measurements, baseline and assimilation run and keep only the rows
# that hold concentrations.
data_comb_daily <- bind_rows(
  data_dwh_daily_imp,
  data_cosmo_daily_imp,
  data_assim_daily_imp
) %>%
  filter(measurement == "concentration")

data_comb_hourly <- bind_rows(
  data_dwh_hourly_imp,
  data_cosmo_hourly_imp,
  data_assim_hourly_imp
) %>%
  filter(measurement == "concentration")


In [None]:
# One combined daily plot per station listed in `stations`.
map(
  stations$station,
  function(station_name) {
    plot_comb(
      data_plot = data_comb_daily,
      taxon = taxa_selected,
      station = station_name,
      resolution = "Daily",
      combined = TRUE,
      rm_zeros = FALSE,
      plot_dwh = TRUE
    )
  }
)


In [None]:

# Summary tables per taxon, station and data source. Each table carries the
# name of its metric in the column `metric`.

# Number of non-zero values.
# NOTE(review): the label "occurence" is misspelled; it is kept unchanged in
# case other code matches on it.
data_comb_daily %>%
  group_by(taxon, station, type) %>%
  summarise(value = sum(value != 0), metric = "occurence") %>%
  ungroup()

# Largest value.
data_comb_daily %>%
  group_by(taxon, station, type) %>%
  summarise(value = max(value), metric = "maximum") %>%
  ungroup()

# Mean over all rows. This would need a season definition.
data_comb_daily %>%
  group_by(taxon, station, type) %>%
  summarise(value = mean(value), metric = "average") %>%
  ungroup()

# Sum over all rows.
data_comb_daily %>%
  group_by(taxon, station, type) %>%
  summarise(value = sum(value), metric = "spi") %>%
  ungroup()



In [None]:
# Verification data: every source except the baseline ("Cosmo") run.
# NOTE(review): the Assim values are multiplied by 0.5 here; the reason is
# not documented in this notebook -- confirm that the factor is intended.
data_verif <- data_comb_daily %>%
  filter(type != "Cosmo") %>%
  mutate(value = if_else(type == "Assim", value * 0.5, value))

In [None]:
# Time series of measured and assimilated concentrations, one plot per
# selected station, stored in the list `gg_timeseries` (keyed by station).
# Check stations tibble for all available locations
stations_selected <- "Basel"
gg_timeseries <- list()

for (s in sort(stations_selected)) {
  data_timeseries <- data_verif %>%
    filter(
      taxon %in% taxa_selected,
      station %in% s
    )

  # Dates on which the measurement exceeds 20 and the absolute difference
  # between measurement and model is larger than their mean; all other rows
  # contribute NA.
  large_diffs <- data_timeseries %>%
    pivot_wider(names_from = type, values_from = value) %>%
    mutate(
      diff = Measurements - Assim,
      mean_pol = (Measurements + Assim) / 2,
      large_diff = if_else(
        Measurements > 20 & ((abs(diff) + mean_pol) / mean_pol > 2),
        date,
        as.Date(NA)
      )
    ) %>%
    pull(large_diff) %>%
    unique()

  gg_timeseries[[s]] <- data_timeseries %>%
    ggplot(aes(x = datetime, y = value, col = type, lty = type)) +
    geom_line(alpha = 0.6) +
    # Vertical markers on the dates flagged above
    geom_vline(
      xintercept = large_diffs,
      alpha = 0.1,
      col = swatch()[1], size = 1
    ) +
    # data_verif is derived from the daily aggregates, hence "Daily"
    ggtitle(paste(
      "Daily", paste(taxa_selected, collapse = ", "),
      "Pollen Concentrations in",
      s
    )) +
    labs(
      y = expression(paste("Pollen Concentration [", m^-3, "]")),
      x = "2020 Season"
    ) +
    scale_colour_manual("Data", values = swatch()[c(2, 4)]) +
    # "none" replaces the deprecated `FALSE` for hiding a legend
    guides(lty = "none")
}

gg_timeseries

In [None]:

# Correlation methods that are evaluated
methods <- c("pearson", "spearman", "kendall")

# Wide table on the original scale: one row per datetime, station and taxon
# with one column per data source.
data_corr_exp <- data_verif %>%
  filter(measurement == "concentration") %>%
  select(value, type, datetime, station, taxon) %>%
  pivot_wider(names_from = type, values_from = value)

# Same table on the log10(x + 1) scale.
# For the robust methods transformation doesn't matter
data_corr <- data_verif %>%
  filter(measurement == "concentration") %>%
  mutate(value = log10(value + 1)) %>%
  select(value, type, datetime, station, taxon) %>%
  pivot_wider(names_from = type, values_from = value)

# One psych::corr.test() result per method, computed on the value columns
# only. `use` and `adjust` are left at their defaults; there should be only
# complete observations.
corr_matrix <- map(
  methods,
  function(corr_method) {
    corr.test(
      select(data_corr, -datetime, -station, -taxon),
      method = corr_method,
      alpha = .05,
      ci = TRUE,
      minlength = 5
    )
  }
)

In [None]:

# Scatter plot of measured against modelled concentrations with the
# confidence intervals of the three correlation coefficients.

# Common upper limit of both axes. It covers both columns: values outside
# the scale limits would be removed from the points and the smoother.
max_val <- max(
  c(data_corr_exp$Measurements, data_corr_exp$Assim),
  na.rm = TRUE
)
# max_val <- 400 # Used to overwrite the default

ggthemr("fresh")

# Adjusted confidence intervals of each method. The element is selected by
# name ("ci.adj" holds lower.adj / upper.adj) because the position of the
# elements in the corr.test() result is not stable across psych versions.
ci <- map(corr_matrix, ~ pluck(.x, "ci.adj")) %>%
  bind_rows() %>%
  round(2) %>%
  mutate(
    method = methods,
    metric = c("R-", "rho-", "tau-"),
    ci = tools::toTitleCase(paste0(
      metric,
      method,
      ": ",
      lower.adj,
      " - ",
      upper.adj
    )),
    # Label positions: stacked near the upper left corner of the panel
    x = rep(max_val * 0.1, times = 3),
    y = c(max_val, max_val * 0.95, max_val * 0.9)
  )

gg_corr <- data_corr_exp %>%
  ggplot(aes(x = Measurements, y = Assim)) +
  geom_point(alpha = 0.3) +
  # The smoother is dependent on the scale
  geom_smooth(alpha = 0.1) +
  # 1:1 line
  geom_abline(slope = 1, intercept = 0, col = swatch()[4]) +
  # parse = TRUE so that the labels are rendered as plotmath expressions
  geom_label(data = ci, aes(label = ci, x = x, y = y), parse = TRUE) +
  scale_x_continuous(
    name = "Measured Pollen Concentration [m⁻³]",
    limits = c(0, max_val)
  ) +
  scale_y_continuous(
    name = "Modelled Pollen Concentration [m⁻³]",
    limits = c(0, max_val)
  )

title <- tools::toTitleCase(paste0(
  "Comparison of Daily average",
  " concentrations of measured and modelled pollen concentrations at all stations"
))

gg_corr <- ggarrange(gg_corr) %>%
  annotate_figure(
    top = title,
    bottom = text_grob(paste0("Pairwise correlation between measured and ",
      "modelled data; grey line shows the Loess smoother; the red line shows ",
      "a theoretical perfect correlation of 1. \n In the text box one can ",
      "see the 95% confidence intervals of the R-values (adjusted for multiple ",
      "comparison) as obtained by Pearson and two robust methods."),
      color = swatch()[1],
      face = "italic",
      size = 10
    )
)


gg_corr


In [None]:
# Write the correlation figure to the vignette figures directory.
ggsave(
  filename = paste0(here::here(), "/vignettes/figures/corr_plot.png"),
  plot = gg_corr,
  width = 12,
  height = 8
)

In [None]:
# Per-observation mean and difference of measurement and model, on the
# log10(x + 1) scale (data_altman) and on the original scale
# (data_altman_exp).
# The two value columns are addressed by name instead of by position, so the
# result no longer depends on the column order produced by pivot_wider().
# NOTE(review): when only one of the two values is available, `mean` is half
# of that value (sum with na.rm = TRUE divided by 2) -- confirm that this is
# intended; `diff` is NA for such rows.
data_altman <- data_corr %>%
  mutate(
    mean = if_else(!is.na(Measurements) | !is.na(Assim),
      rowSums(cbind(Measurements, Assim), na.rm = TRUE) / 2,
      NA_real_
    ),
    diff = Measurements - Assim
  )

data_altman_exp <- data_corr_exp %>%
  mutate(
    mean = if_else(!is.na(Measurements) | !is.na(Assim),
      rowSums(cbind(Measurements, Assim), na.rm = TRUE) / 2,
      NA_real_
    ),
    diff = Measurements - Assim
  )

In [None]:

# Density plots of measured and modelled concentrations, split by the
# concentration class of the pairwise mean.

categs <- c("weak", "medium", "strong", "verystrong")

# Assign every observation with a mean of at least 1 to a class.
# The class limits are half-open intervals: `mean` is an average and
# therefore rarely an integer, so limits such as "<= 10" / ">= 11" would
# leave values like 10.5 without a class and silently drop them later.
# For integer values the classes are the same as before.
data_conc <- data_altman_exp %>%
  filter(mean >= 1) %>%
  mutate(conc = case_when(
    taxon == "Alnus" & mean >= 1 & mean < 11 ~ "weak",
    taxon == "Alnus" & mean >= 11 & mean < 70 ~ "medium",
    taxon == "Alnus" & mean >= 70 & mean < 250 ~ "strong",
    taxon == "Alnus" & mean >= 250 ~ "verystrong",
    taxon == "Betula" & mean >= 1 & mean < 11 ~ "weak",
    taxon == "Betula" & mean >= 11 & mean < 70 ~ "medium",
    taxon == "Betula" & mean >= 70 & mean < 300 ~ "strong",
    taxon == "Betula" & mean >= 300 ~ "verystrong",
    taxon == "Poaceae" & mean >= 1 & mean < 20 ~ "weak",
    taxon == "Poaceae" & mean >= 20 & mean < 50 ~ "medium",
    taxon == "Poaceae" & mean >= 50 & mean < 150 ~ "strong",
    taxon == "Poaceae" & mean >= 150 ~ "verystrong",
    taxon == "Ambrosia" & mean >= 1 & mean < 6 ~ "weak",
    taxon == "Ambrosia" & mean >= 6 & mean < 11 ~ "medium",
    taxon == "Ambrosia" & mean >= 11 & mean < 40 ~ "strong",
    taxon == "Ambrosia" & mean >= 40 ~ "verystrong"
  )) %>%
  # One row per observation and data source, one value column per class
  pivot_longer(Measurements:Assim, names_to = "type", values_to = "value") %>%
  pivot_wider(names_from = conc, values_from = value) %>%
  # The assimilation run is labelled "Cosmo" from here on
  mutate(type = if_else(type == "Assim", "Cosmo", type))

gg_conc_dens <- list()

# Vertical position of the "# of Observations" label in each panel
labels_y <- list(0.1, 0.015, 0.004, 0.002)
# For 2-hours labels_y <- list(0.75, 0.03, 0.015, 0.010, 0.005, 0.0012)
labels_y_hist <- list(15, 13, 10, 10, 7.5, 3)
names(labels_y) <- categs

for (j in categs) {
  # Classes without any observation have no column in data_conc
  if (j %in% names(data_conc)) {
    # Every observation occupies two rows (one per data source)
    obs <- data_conc %>%
      filter(!is.na(!!sym(j))) %>%
      summarise(n() / 2) %>%
      pull()
    obs <- paste("# of Observations:", obs)

    gg_conc_dens[[j]] <- data_conc %>%
      filter(!is.na(!!sym(j))) %>%
      ggplot() +
      # The area under that whole curve should be 1.
      # To get an estimate of the probability of certain values,
      # you'd have to integrate over an interval on your 'y' axis,
      # and that value should never be greater than 1.
      geom_density(aes(x = !!sym(j), col = type, fill = type), alpha = 0.15) +
      geom_label(label = obs, aes(x = max(!!sym(j)) * 0.7), y = labels_y[[j]]) +
      scale_colour_manual("", values = swatch()[c(2, 4)]) +
      scale_fill_manual(values = swatch()[c(2, 4)]) +
      coord_cartesian(xlim = c(0, NA)) +
      # "none" replaces the deprecated `FALSE` for hiding a legend
      guides(fill = "none")
  }
}

# NOTE(review): the caption below talks about "three traps", whereas this
# notebook compares measurements with model output -- the wording looks
# copied from another analysis and should be revised by the author.
gg_dens_conc <- ggarrange(plotlist = gg_conc_dens) %>%
  annotate_figure(
    top = paste(
      "Comparison of Measurements and Model Predictions",
      "for All Stations and Different Concentration Groups."
    ),
    bottom = text_grob(paste0("We are looking at Density Kernel Estimators ",
      "for all three traps to compare the measurements between them. ",
      "\n The area under each curve adds up to 1 and makes it possible ",
      "to visualise the (dis-)similarities of measurements from the ",
      "three traps. \n It is basically a smoothed histogram. ",
      "The buckets are based on the mean concentrations of measurements and model."),
      color = swatch()[1],
      face = "italic",
      size = 10
    )
  )

gg_dens_conc


In [None]:
# Write the density figure to the vignette figures directory.
ggsave(
  filename = paste0(here::here(), "/vignettes/figures/density_plot.png"),
  plot = gg_dens_conc,
  width = 12,
  height = 8
)

In [None]:

# Box plots of the relative pairwise differences per concentration class,
# one facet per taxon.

# Long format: one row per observation, data source and class; rows of
# classes the observation does not belong to are removed.
data_conc_plot <- data_conc %>%
  pivot_longer(categs[categs %in% names(data_conc)],
    names_to = "group",
    values_to = "value"
  ) %>%
  mutate(
    group = factor(group, levels = categs[categs %in% names(data_conc)]),
    reldiff = diff / mean
  ) %>%
  filter(!is.na(value))

# Mean relative difference per taxon, class and data source.
# `taxon` is part of the grouping because the plot is faceted by taxon: a
# layer whose data lacks the faceting variable is repeated in every facet,
# which would show the mean over all taxa in each of them.
means <- data_conc_plot %>%
  group_by(taxon, group, type) %>%
  summarise(mean = mean(reldiff)) %>%
  ungroup()

gg_boxplot_reldiff <- data_conc_plot %>%
  ggplot(aes(x = group, y = reldiff)) +
  geom_boxplot(alpha = 0.6) +
  # Mean markers (shape 95 is a horizontal dash)
  geom_point(
    data = means, aes(x = group, y = mean),
    position = position_dodge(width = 0.75),
    shape = 95,
    size = 10,
    show.legend = FALSE
  ) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "",
    y = "(Measurements - Assim) / mean(Measurements, Assim)",
    title = paste("Relative Pairwise Differences for",
    "Daily Average Pollen-Concentrations")
  ) +
  facet_wrap(~taxon)

gg_boxplot_reldiff


In [None]:
# Write the box plot figure to the vignette figures directory.
ggsave(
  filename = paste0(here::here(), "/vignettes/figures/diff_boxplot.png"),
  plot = gg_boxplot_reldiff,
  width = 12,
  height = 8
)

In [None]:
# Table with the quartiles of the relative differences per taxon and
# concentration class.
diff_selection <- taxa_selected

# One summary per taxon. The taxon is stored with its own rows, so the
# species labels cannot get out of step with the values. (Previously the
# labels came from a separately sorted table that was attached by row
# position, which mislabels rows whenever `diff_selection` is not in
# alphabetical order.)
kable_diff <- map(
  diff_selection,
  function(tx) {
    data_conc_plot %>%
      filter(taxon %in% tx) %>%
      group_by(group, type) %>%
      summarise(
        q25 = quantile(reldiff, probs = 0.25, na.rm = TRUE),
        median = median(reldiff, na.rm = TRUE),
        q75 = quantile(reldiff, probs = 0.75, na.rm = TRUE)
      ) %>%
      ungroup() %>%
      # Average the statistics of the data sources within each class
      group_by(group) %>%
      summarise_at(vars(q25, median, q75), ~ mean(.)) %>%
      ungroup() %>%
      mutate(taxon = as.character(tx))
  }
) %>%
  bind_rows()

kable_diff %>%
  # Format the statistics as percentages
  mutate_at(
    vars(q25, median, q75),
    ~ scales::percent(signif(., 3),
      accuracy = 0.01
    )
  ) %>%
  select(taxon, everything()) %>%
  setNames(c("Species", "Class", "25%-Quantile", "Median", "75%-Quantile")) %>%
  kable(escape = FALSE, align = c("c", "c", "r", "r", "r")) %>%
  kable_styling("striped", full_width = FALSE) %>%
  # Merge repeated species names into a single cell
  collapse_rows(columns = 1)


In [None]:

# Altman-Bland plot: difference against mean of measurement and model on
# the log10(x + 1) scale, one facet per taxon.

# Standard deviation of all pairwise differences
sd_diff <- data_altman %>%
  summarise(sd_diff = sd(diff, na.rm = TRUE)) %>%
  pull(sd_diff)

gg_ab1 <- data_altman %>%
  filter(!is.na(diff)) %>%
  ggplot(aes(x = mean, y = diff)) +
  geom_point(alpha = 0.5) +
  # x covers the range of the means, y covers +/- 4 standard deviations
  coord_cartesian(
    xlim = c(
      min(data_altman$mean),
      max(data_altman$mean)
    ),
    ylim = c(
      -sd_diff[1] * 4,
      sd_diff[1] * 4
    )
  ) +
  # Horizontal reference lines at 0 and at +/- 2 standard deviations
  geom_abline(slope = 0, intercept = 0, col = swatch()[4], alpha = 0.8) +
  geom_abline(
    slope = 0,
    intercept = sd_diff[1] * 2,
    col = swatch()[4],
    alpha = 0.8,
    linetype = 3
  ) +
  geom_abline(
    slope = 0,
    intercept = sd_diff[1] * (-2),
    col = swatch()[4],
    alpha = 0.8,
    linetype = 3
  ) +
  geom_smooth(alpha = 0.1) +
  labs(y = "Difference(Measurements - Cosmo)", x = "Mean(Measurements, Cosmo)") +
  facet_wrap(~taxon)

title <- paste0(
  "Altman-Bland Plot to Compare ",
  "Measured and Modelled Pollen Concentrations for All Stations"
)

# NOTE(review): the caption speaks of a "comparison of traps", whereas this
# notebook compares measurements with model output -- wording to be revised
# by the author. Only the spelling mistakes are corrected here.
gg_altman <- ggarrange(gg_ab1) %>%
  annotate_figure(top = title, bottom = text_grob(paste0(
    "Pairwise comparison of traps; grey line shows the Loess smoother; ",
    "the red line shows a theoretical perfect agreement between two ",
    "timeseries. \n The dashed red line shows the 2 * sd of the ",
    "differences, where we expect the points to lie within."
  ),
  color = swatch()[1],
  face = "italic",
  size = 12
  ))

gg_altman

In [None]:
# Write the Altman-Bland figure to the vignette figures directory.
ggsave(
  filename = paste0(here::here(), "/vignettes/figures/altman-bland.png"),
  plot = gg_altman,
  width = 12,
  height = 8
)

In [None]:
# Display the long-format table that the metrics below are computed from.
data_conc_plot

In [None]:
# Overall agreement between model ("Cosmo") and measurements: squared
# correlation, mean squared error, its root and the mean absolute error.
data_conc_plot %>%
  pivot_wider(names_from = type, values_from = value) %>%
  # Model error of every observation
  mutate(error = Cosmo - Measurements) %>%
  summarise(
    R2 = cor(Cosmo, Measurements, use = "complete.obs")^2,
    MSE = mean(error^2, na.rm = TRUE),
    RMSE = sqrt(MSE),
    MAE = mean(abs(error), na.rm = TRUE)
  )

In [None]:
# Assign pollen concentrations to the load classes "weak", "medium",
# "strong" and "verystrong" using taxon-specific limits.
#   taxon: character vector of taxon names
#   conc:  numeric vector of concentrations, same length as `taxon`
# Returns a character vector; NA for concentrations below 1, missing
# concentrations and taxa without limits.
# The limits are half-open intervals: the concentrations are averages and
# therefore rarely integers, so limits such as "<= 10" / ">= 11" would leave
# values like 10.5 without a class. For integer values the classes are the
# same as with the closed limits.
classify_pollen_load <- function(taxon, conc) {
  case_when(
    taxon == "Alnus" & conc >= 1 & conc < 11 ~ "weak",
    taxon == "Alnus" & conc >= 11 & conc < 70 ~ "medium",
    taxon == "Alnus" & conc >= 70 & conc < 250 ~ "strong",
    taxon == "Alnus" & conc >= 250 ~ "verystrong",
    taxon == "Betula" & conc >= 1 & conc < 11 ~ "weak",
    taxon == "Betula" & conc >= 11 & conc < 70 ~ "medium",
    taxon == "Betula" & conc >= 70 & conc < 300 ~ "strong",
    taxon == "Betula" & conc >= 300 ~ "verystrong",
    taxon == "Poaceae" & conc >= 1 & conc < 20 ~ "weak",
    taxon == "Poaceae" & conc >= 20 & conc < 50 ~ "medium",
    taxon == "Poaceae" & conc >= 50 & conc < 150 ~ "strong",
    taxon == "Poaceae" & conc >= 150 ~ "verystrong",
    taxon == "Ambrosia" & conc >= 1 & conc < 6 ~ "weak",
    taxon == "Ambrosia" & conc >= 6 & conc < 11 ~ "medium",
    taxon == "Ambrosia" & conc >= 11 & conc < 40 ~ "strong",
    taxon == "Ambrosia" & conc >= 40 ~ "verystrong"
  )
}

# Class of the measurement and of the model value for every observation;
# observations without a class on either side are removed.
data_valid <- data_altman_exp %>%
  mutate(
    conc_measurements = classify_pollen_load(taxon, Measurements),
    conc_assim = classify_pollen_load(taxon, Assim)
  ) %>%
  filter(
    !is.na(conc_measurements),
    !is.na(conc_assim)
  ) %>%
  # Ordered class levels shared by both columns
  mutate_at(
    vars(conc_measurements, conc_assim),
    ~ factor(., levels = c("weak", "medium", "strong", "verystrong"))
  )

# Confusion matrix with the model classes as prediction and the measured
# classes as reference.
confusionMatrix(
  data = data_valid$conc_assim,
  reference = data_valid$conc_measurements
)