# Setup

In [107]:
library(DMwR)
library(knitr)
library(lattice)
library(latticeExtra)

# Data Description
## Overview
The data contain the following variables:

* Season in which the measurements were collected.
* River size.
* River speed.
* Max pH value.
* Min oxygen level.
* Mean over three measurements
  * Chloride.
  * Nitrates.
  * Ammonium.
  * Orthophosphate.
  * Phosphate.
  * Chlorophyll.

Each water sample is also associated with the frequencies of seven harmful algae.

## Summary

### Head

In [None]:
head(algae)

### Summary Stats

In [None]:
kable(summary(algae))

## Variable Distributions

In [110]:
# Draw three diagnostic plots of one column of the global `algae` data frame
# side by side: a density plot, a normal Q-Q plot and a horizontal boxplot.
# The column mean is marked with a dashed blue line on the density plot and
# on the boxplot.
#
# Args:
#   var_name: Name of the `algae` column to plot.
#
# Returns:
#   NULL, invisibly; the function is called for its plotting side effect.
plot_variable <- function(var_name) {
  var <- algae[[var_name]]
  if (is.null(var)) {
    stop("Column '", var_name, "' not found in algae", call. = FALSE)
  }

  # Remember the caller's graphics settings and put them back on exit, even
  # if one of the plotting calls below fails.
  old_par <- par(mfrow = c(1, 3), bg = "white")
  on.exit(par(old_par), add = TRUE)

  var_mean <- mean(var, na.rm = TRUE)

  plot(density(var, na.rm = TRUE), main = "Density Plot", xlab = var_name)
  abline(v = var_mean, col = "blue", lty = 2)

  qqnorm(var, main = "Q-Q Plot")
  qqline(var, col = "red")

  boxplot(var, horizontal = TRUE, main = "Boxplot", xlab = var_name)
  abline(v = var_mean, col = "blue", lty = 2)

  invisible(NULL)
}

### Max pH

In [None]:
#| warning: false
plot_variable("mxPH")

### Min Oxygen

In [None]:
#| warning: false
plot_variable("mnO2")

### Mean Chloride

In [None]:
#| warning: false
plot_variable("Cl")

### Mean Nitrates

In [None]:
#| warning: false
plot_variable("NO3")

### Mean Ammonium

In [None]:
#| warning: false
plot_variable("NH4")

### Mean Orthophosphate

In [None]:
#| warning: false
plot_variable("oPO4")

### Mean Phosphate

In [None]:
#| warning: false
plot_variable("PO4")

### Mean Chlorophyll

In [None]:
#| warning: false
plot_variable("Chla")

# Conditioned Plots

In [119]:
# Box plot of one column of the global `algae` data frame, split by the
# groups of another column. The values of `y_var_name` run along the
# horizontal axis and the groups of `x_var_name` along the vertical axis.
#
# Args:
#   x_var_name:  Name of the grouping column of `algae`.
#   y_var_name:  Name of the column of `algae` whose values are summarised.
#   use_lattice: If TRUE (default) build the plot with lattice's bwplot();
#                otherwise draw it with base graphics and add rugs.
#
# Returns:
#   With use_lattice = TRUE, the value of bwplot() (it is displayed when
#   printed, e.g. by auto-printing at top level). Otherwise the function is
#   called for its plotting side effect.
plot_conditioned <- function(x_var_name, y_var_name, use_lattice = TRUE) {
  x_data <- algae[[x_var_name]]
  y_data <- algae[[y_var_name]]
  name <- paste(y_var_name, x_var_name, sep = " vs ")

  if (use_lattice) {
    bwplot(x_data ~ y_data,
      data = algae,
      xlab = y_var_name,
      ylab = x_var_name,
      main = name)
  } else {
    # The boxes are horizontal, so the horizontal axis carries the y_data
    # values and the vertical axis the x_data groups; label them accordingly
    # (this matches the lattice branch and the rug sides below).
    boxplot(y_data ~ x_data,
      horizontal = TRUE,
      xlab = y_var_name,
      ylab = x_var_name,
      main = name,
      outline = FALSE
    )

    # Jittered rugs: values along the bottom, group positions along the left.
    rug(jitter(as.numeric(y_data)), side = 1)
    rug(jitter(as.numeric(x_data)), side = 2)
  }
}

## Algal a1

In [None]:
plot_conditioned("size", "a1")

## Algal a3 vs minO2 over the Seasons

In [None]:
# Build the conditioning variable from the non-missing mnO2 values; the call
# asks equal.count() for 4 intervals with an overlap of 1/5.
minO2 <- equal.count(na.omit(algae$mnO2), number = 4, overlap = 1/5)

# Note the | syntax is lattice specific: here the season ~ a3 strip plot is
# conditioned on minO2. Rows with a missing mnO2 are dropped from `data` so
# that it covers the same observations minO2 was built from.
stripplot(season ~ a3 | minO2, data = algae[!is.na(algae$mnO2),])

# Imputation

## Count Rows With Missing Values

In [None]:
rows_with_na <- algae[!complete.cases(algae),]
rows_with_na

In [None]:
nrow(rows_with_na)

## Counting NAs Per Row

In [124]:
# Count the missing values in each row.
#
# Args:
#   data: A data frame or matrix.
#
# Returns:
#   An integer vector with one element per row of `data`, giving the number
#   of NA entries in that row.
count_na_per_row <- function(data) {
  # is.na() yields a logical matrix, so the row sums are the NA counts; this
  # avoids coercing the whole data frame to a matrix of a single type and
  # looping over its rows in R.
  counts <- rowSums(is.na(data))
  # rowSums() returns doubles; convert to integer counts, keeping any names.
  storage.mode(counts) <- "integer"
  counts
}

In [125]:
na_row_counts <- count_na_per_row(algae)

In [None]:
algae[which(na_row_counts > 2), ]

## Central Tendency Imputation
First, making a copy of the dataset and defining some imputation functions.

In [127]:
algae_central <- data.frame(algae)

# Replace the missing values of one column with that column's mean.
#
# Args:
#   ds:       A data frame.
#   var_name: Name of the column of `ds` to impute.
#
# Returns:
#   A copy of `ds` in which every NA of the chosen column has been replaced
#   by the mean of the column's non-missing values. Stops with an error if
#   the column does not exist.
impute_mean <- function(ds, var_name) {
  values <- ds[[var_name]]
  # Fail loudly on a mistyped column name instead of carrying on with NULL.
  if (is.null(values)) {
    stop("Column '", var_name, "' not found in data set", call. = FALSE)
  }

  ds[is.na(values), var_name] <- mean(values, na.rm = TRUE)
  ds
}

# Replace the missing values of one column with that column's median.
#
# Args:
#   ds:       A data frame.
#   var_name: Name of the column of `ds` to impute.
#
# Returns:
#   A copy of `ds` in which every NA of the chosen column has been replaced
#   by the median of the column's non-missing values. Stops with an error if
#   the column does not exist.
impute_median <- function(ds, var_name) {
  values <- ds[[var_name]]
  # Fail loudly on a mistyped column name instead of carrying on with NULL.
  if (is.null(values)) {
    stop("Column '", var_name, "' not found in data set", call. = FALSE)
  }

  ds[is.na(values), var_name] <- median(values, na.rm = TRUE)
  ds
}

### Max pH
As the maximum pH variable is normally distributed, the mean can be used for imputation.

In [128]:
algae_central <- impute_mean(algae_central, "mxPH")

### Mean Chlorophyll
As the mean chlorophyll variable is far from normal, heavily skewed (most of its mass lies on the left, with a long tail to the right) and has outliers, the mean is a poor choice for imputation, so the median is used instead.

In [129]:
algae_central <- impute_median(algae_central, "Chla")