In [None]:
library(tensorflow)
library(keras)
# Load MNIST and unpack the train/test images and labels.
mnist <- dataset_mnist()
train_images <- mnist$train$x
train_labels <- mnist$train$y
test_images <- mnist$test$x
test_labels <- mnist$test$y

# Two dense layers: 512 ReLU units followed by a 10-way softmax.
model <- keras_model_sequential(list(
  layer_dense(units = 512, activation = "relu"),
  layer_dense(units = 10, activation = "softmax")
))
model %>% compile(
  optimizer = "rmsprop",
  loss = "sparse_categorical_crossentropy",
  metrics = "accuracy"
)

# Flatten every 28 x 28 image into one row and divide the pixel values by 255.
train_images <- array_reshape(train_images, c(60000, 28 * 28)) / 255
test_images <- array_reshape(test_images, c(10000, 28 * 28)) / 255

history <- model %>%
  fit(train_images, train_labels, epochs = 5, batch_size = 128)
plot(history)

# Model output for the first ten test digits.
test_digits <- test_images[1:10, ]
predictions <- model %>% predict(test_digits)
predictions

In [5]:
library(tensorflow)
library(keras)

# Build a minimal dense layer as an environment of class "NaiveDense".
#
# input_size, output_size: dimensions of the weight matrix W
#   (input_size x output_size); the bias b has output_size entries.
# activation: function applied to `matmul(inputs, W) + b`.
#
# Returns the environment, exposing `activation`, `W`, `b`, `weights`
# (list(W, b)) and `call(inputs)`.
layer_naive_dense <- function(input_size, output_size, activation) {
  # W starts as N(0, 0.1) noise, b as zeros.
  kernel_init <- array(
    rnorm(input_size * output_size, mean = 0, sd = 1e-1),
    dim = c(input_size, output_size)
  )
  bias_init <- array(0, c(output_size))

  self <- new.env(parent = emptyenv())
  class(self) <- "NaiveDense"

  self$activation <- activation
  self$W <- tf$Variable(kernel_init)
  self$b <- tf$Variable(bias_init)
  self$weights <- list(self$W, self$b)

  # Forward pass of the layer.
  self$call <- function(inputs) {
    pre_activation <- tf$matmul(inputs, self$W) + self$b
    self$activation(pre_activation)
  }

  self
}

# Chain a list of layers into an environment of class "NaiveSequential".
#
# layers: list of objects that each expose `$weights` (a list) and
#   `$call(inputs)`.
#
# Returns the environment, exposing `layers`, `weights` (all layer weights
# concatenated into one flat list) and `call(inputs)`.
naive_model_sequential <- function(layers) {
  self <- new.env(parent = emptyenv())
  class(self) <- "NaiveSequential"

  self$layers <- layers
  self$weights <- do.call(c, lapply(layers, function(layer) layer$weights))

  # Feed the inputs through every layer in order.
  self$call <- function(inputs) {
    Reduce(
      function(activations, layer) layer$call(activations),
      self$layers,
      inputs
    )
  }

  self
}
                    
# Stack two naive dense layers: 784 -> 512 (ReLU) -> 10 (softmax).
model <- naive_model_sequential(list(
  layer_naive_dense(28 * 28, 512, tf$nn$relu),
  layer_naive_dense(512, 10, tf$nn$softmax)
))
# Each layer contributes W and b, so four weight variables are expected.
stopifnot(length(model$weights) == 4)
                    
# Create a sequential mini-batch iterator (environment of class
# "BatchGenerator") over `images` and `labels`.
#
# images: 2-D array indexed as images[rows, ].
# labels: vector or 1-D array with one entry per row of `images`.
# batch_size: maximum number of rows per batch (the last one may be shorter).
#
# Returns the environment, exposing `num_batches` and `get_next_batch()`,
# which yields list(images = , labels = ) or NULL once the data is exhausted.
new_batch_generator <- function(images, labels, batch_size = 128) {
  # NROW() treats a plain vector as a one-column matrix. nrow() returns NULL
  # for such labels, which turned this check into a no-op.
  stopifnot(NROW(images) == NROW(labels))

  self <- new.env(parent = emptyenv())
  attr(self, "class") <- "BatchGenerator"

  n_samples <- NROW(images)
  self$index <- 1
  self$images <- images
  self$labels <- labels
  self$batch_size <- batch_size
  self$num_batches <- ceiling(n_samples / batch_size)

  self$get_next_batch <- function() {
    start <- self$index
    if (start > n_samples) {
      return(NULL)
    }

    end <- min(start + self$batch_size - 1, n_samples)
    self$index <- end + 1
    indices <- start:end
    # drop = FALSE keeps a one-row batch as a matrix instead of a vector.
    list(
      images = self$images[indices, , drop = FALSE],
      labels = self$labels[indices]
    )
  }

  self
}
                    
# Step size used by the manual update below.
learning_rate <- 1e-3

# Gradient-descent step: subtract `gradient * learning_rate` from each weight
# in place through the weight's `assign_sub()` method.
# `gradients` and `weights` must be lists of equal length.
update_weights <- function(gradients, weights) {
  stopifnot(length(gradients) == length(weights))
  for (idx in seq_along(weights)) {
    step <- gradients[[idx]] * learning_rate
    weights[[idx]]$assign_sub(step)
  }
}
                    
# SGD optimizer with learning rate 1e-3; used by the `update_weights()`
# definition below that calls `optimizer$apply_gradients()`.
optimizer <- optimizer_sgd(learning_rate = 1e-3)

# Pair up two equal-length lists element by element: the i-th result is
# c(a[[i]], b[[i]]). Stops if the lengths differ.
zip_lists <- function(a, b) {
  stopifnot(length(a) == length(b))
  mapply(c, a, b, SIMPLIFY = FALSE)
}
                    
# Optimizer-based replacement for the manual update: hand (gradient, weight)
# pairs to the global `optimizer` and return whatever it returns.
update_weights <- function(gradients, weights) {
  grads_and_vars <- zip_lists(gradients, weights)
  optimizer$apply_gradients(grads_and_vars)
}

# Run one optimisation step on a single batch and return the batch's mean
# loss.
one_training_step <- function(model, images_batch, labels_batch) {
  # The forward pass and loss are evaluated inside the tape context so that
  # gradients can be requested from the tape afterwards.
  with(tf$GradientTape() %as% grad_tape, {
    probs <- model$call(images_batch)
    sample_losses <- loss_sparse_categorical_crossentropy(labels_batch, probs)
    average_loss <- mean(sample_losses)
  })
  weight_grads <- grad_tape$gradient(average_loss, model$weights)
  update_weights(weight_grads, model$weights)
  average_loss
}
                    
# Train `model` for `epochs` passes over the data, printing the loss every
# 100 batches.
#
# model: object accepted by `one_training_step()`.
# images, labels: training data handed to `new_batch_generator()`.
# epochs: number of full passes over the data.
# batch_size: rows per mini-batch. It is now forwarded to the generator;
#   previously it was ignored and the generator's default was always used.
#
# NOTE(review): this definition shares its name with keras' `fit`, which
# other cells in this file call; while it sits in the global environment it
# masks that function.
fit <- function(model, images, labels, epochs, batch_size = 128) {
  for (epoch_counter in seq_len(epochs)) {
    cat("Epoch ", epoch_counter, "\n")
    batch_generator <- new_batch_generator(images, labels,
                                           batch_size = batch_size)
    for (batch_counter in seq_len(batch_generator$num_batches)) {
      batch <- batch_generator$get_next_batch()
      loss <- one_training_step(model, batch$images, batch$labels)
      if (batch_counter %% 100 == 0) {
        print(paste(
          "loss at batch ", loss, " counter", batch_counter
        ))
      }
    }
  }
}
                    
                    
# Load MNIST, flatten the images to one row each and divide pixels by 255.
mnist <- dataset_mnist()
train_labels <- mnist$train$y
test_labels <- mnist$test$y
train_images <- array_reshape(mnist$train$x, c(60000, 28 * 28)) / 255
test_images <- array_reshape(mnist$test$x, c(10000, 28 * 28)) / 255

fit(model, train_images, train_labels, epochs = 20, batch_size = 128)

# max.col() gives the 1-based column of each row's largest value; subtract 1
# to compare against the 0-based labels.
predictions <- as.array(model$call(test_images))
pred_labels <- max.col(predictions) - 1
matches <- pred_labels == test_labels
print(paste("accuracy: ", mean(matches)))

Epoch  1 
[1] "loss at batch  2.3132080963572  counter 100"
[1] "loss at batch  1.96473986034839  counter 200"
[1] "loss at batch  1.8407879928024  counter 300"
[1] "loss at batch  1.54412988496258  counter 400"
Epoch  2 
[1] "loss at batch  1.40883322402089  counter 100"
[1] "loss at batch  1.11925147592799  counter 200"
[1] "loss at batch  1.25245706617174  counter 300"
[1] "loss at batch  1.03790369219984  counter 400"
Epoch  3 
[1] "loss at batch  1.05048981761004  counter 100"
[1] "loss at batch  0.803862034206925  counter 200"
[1] "loss at batch  0.996266653528907  counter 300"
[1] "loss at batch  0.812011601747603  counter 400"
Epoch  4 
[1] "loss at batch  0.866955316726097  counter 100"
[1] "loss at batch  0.65152636524896  counter 200"
[1] "loss at batch  0.855192180455102  counter 300"
[1] "loss at batch  0.686102490085561  counter 400"
Epoch  5 
[1] "loss at batch  0.756903824467251  counter 100"
[1] "loss at batch  0.56394397827812  counter 200"
[1] "loss at batch  0.76439

# Binary classification

- Based on [Essential Math for Data Science](https://www.oreilly.com/library/view/essential-math-for/9781098102920/)

In [None]:
# Import data
library(rsample)

df <- read.csv("https://tinyurl.com/y2qmhfsr")
knitr::kable(head(df))

# Random 75 % / 25 % train/test split.
split <- initial_split(df, prop = 3/4)
train_data <- training(split)
test_data <- testing(split)

# Columns 1-3 (divided by 255) are the inputs, column 4 is the target.
x_train <- as.matrix(train_data[, 1:3] / 255)
x_test <- as.matrix(test_data[, 1:3] / 255)
y_train <- train_data[, 4]
y_test <- test_data[, 4]

# Number of training rows, used when sampling during training.
n <- nrow(x_train)

In [None]:
# Forward propagation
# Weights and biases start as uniform random draws from runif().
# Hidden layer: 3 x 3 weights, 3 x 1 bias. Output layer: 1 x 3 weights,
# 1 x 1 bias.
w_hidden <- matrix(runif(3 * 3), 3, 3)
w_output <- matrix(runif(3), 1, 3)

b_hidden <- matrix(runif(3), 3, 1)
b_output <- matrix(runif(1), 1, 1)

# Rectified linear unit: replace every negative entry of `x` with 0.
#
# Vectorized with pmax(), which keeps the dim attribute of `x` and lets NA
# entries pass through. The former element-by-element loop stopped with an
# error on NA.
relu <- function(x) {
    pmax(x, 0)
}

# Logistic (sigmoid) function, applied element-wise: 1 / (1 + exp(-x)).
logistic <- function(x) {
    denominator <- 1 + exp(-x)
    1 / denominator
}

# Forward pass through the two-layer network using the global weights
# `w_hidden`, `b_hidden`, `w_output` and `b_output`.
#
# x: matrix with one sample per row; it is transposed so samples become
#   columns.
#
# Returns an unnamed list: hidden pre-activation, hidden activation, output
# pre-activation, output activation.
forward_prop <- function(x) {
    inputs <- t(x)
    hidden_pre <- w_hidden %*% inputs + b_hidden[, 1]
    hidden_act <- relu(hidden_pre)
    output_pre <- w_output %*% hidden_act + b_output[1, 1]
    output_act <- logistic(output_pre)
    list(hidden_pre, hidden_act, output_pre, output_act)
}

# Accuracy on the test set: threshold the network output at 0.5 and compare
# with the 0/1 labels.
test_pred <- forward_prop(x_test)[[4]]
test_comparison <- ((test_pred > 0.5) == y_test) * 1
accuracy <- sum(test_comparison) / length(y_test)
accuracy

In [None]:
# Backward propagation
# Backward pass for one sample: chain-rule pieces for the cost
# C = (a2 - y)^2 with a logistic output and a ReLU hidden layer.
#
# z1, a1: hidden pre-activation and activation from forward_prop().
# z2, a2: output pre-activation and activation from forward_prop().
# x: the input sample; y: its target.
#
# Returns list(dc_dw1, dc_db1, dc_dw2, dc_db2). The first element used to be
# dc_dw2 by mistake, so the hidden weights were updated with the output-layer
# gradient and dc_dw1 was never used.
#
# NOTE(review): dc_dw1 and dc_db1 reduce the per-unit hidden gradients to a
# single scalar via `dc_da1 %*% da1_dz1`. The shapes are kept as they were
# because the training loop indexes the results with [,1] / [1,1]; confirm
# the formula against the referenced book before relying on it.
backward_prop <- function(z1, a1, z2, a2, x, y) {
  x <- as.matrix(x)
  dc_da2 <- 2 * a2 - 2 * y
  da2_dz2 <- exp(-z2) / (1 + exp(-z2))^2  # derivative of the logistic
  dz2_da1 <- w_output
  dz2_dw2 <- a1
  dz2_db2 <- 1
  da1_dz1 <- z1 > 0                       # derivative of ReLU
  dz1_dw1 <- x
  dz1_db1 <- 1

  dc_dw2 <- (dc_da2 %*% t(da2_dz2))[1, 1] * dz2_dw2
  dc_db2 <- dc_da2 %*% t(da2_dz2) * dz2_db2
  dc_da1 <- dc_da2 %*% t(da2_dz2) %*% dz2_da1
  dc_dw1 <- (dc_da1 %*% da1_dz1)[1, 1] * t(dz1_dw1)
  dc_db1 <- dc_da1 %*% da1_dz1 * dz1_db1
  list(dc_dw1, dc_db1, dc_dw2, dc_db2)
}

# Smoke test of backward_prop() on the first test sample.
# The forward pass runs on that one row only (kept as a 1-row matrix, the
# shape the training loop uses). Before, activations for the whole test set
# were combined with a single sample and label.
res <- forward_prop(x_test[1, , drop = FALSE])
z1 <- res[[1]]
a1 <- res[[2]]
z2 <- res[[3]]
a2 <- res[[4]]
test <- backward_prop(z1, a1, z2, a2, x_test[1, , drop = FALSE], y_test[1])

In [None]:
# Stochastic gradient descent: 100000 single-sample updates with step size L.
L <- 0.05
for (i in seq_len(100000)) {
    # Pick one training row at random and keep it as a 1-row matrix.
    idx <- sample.int(n, 1)
    x_sample <- t(x_train[idx, ])
    y_sample <- y_train[idx]

    # Forward pass.
    res <- forward_prop(x_sample)
    z1 <- res[[1]]
    a1 <- res[[2]]
    z2 <- res[[3]]
    a2 <- res[[4]]

    # Backward pass.
    res <- backward_prop(z1, a1, z2, a2, x_sample, y_sample)
    dw1 <- res[[1]]
    db1 <- res[[2]]
    dw2 <- res[[3]]
    db2 <- res[[4]]

    # Move every parameter against its gradient.
    w_hidden <- w_hidden - L * dw1[, 1]
    b_hidden <- b_hidden - L * db1[1, 1]
    w_output <- w_output - L * dw2[, 1]
    b_output <- b_output - L * db2[1, 1]
}

In [None]:
# Accuracy on the test set after training: threshold the network output at
# 0.5 and compare with the 0/1 labels.
test_pred <- forward_prop(x_test)[[4]]
test_comparison <- ((test_pred > 0.5) == y_test) * 1
accuracy <- sum(test_comparison) / length(y_test)
accuracy

# Predict iris species

- This is not really working yet.
- There may be a bug somewhere.

In [None]:
# Remove every object listed by ls() so the iris example starts from a clean
# workspace.
# NOTE(review): this also deletes objects created by earlier cells; confirm
# nothing later in the notebook depends on them.
rm(list = ls())
# Debug helper: print the dimensions of `x`, or its length when it has no
# dim attribute.
pd <- function(x) {
  shape <- dim(x)
  print(if (is.null(shape)) length(x) else shape)
}

# Rectified linear unit: replace every negative entry of `x` with 0.
#
# Vectorized with pmax(), which keeps the dim attribute of `x` and lets NA
# entries pass through. The former element-by-element loop stopped with an
# error on NA.
relu <- function(x) {
  pmax(x, 0)
}

# Derivative of ReLU, element-wise: 1 where x > 0, otherwise 0.
# Assigning through `x[]` keeps the attributes of `x` (e.g. its dim).
relu_derivative <- function(x) {
  x[] <- as.numeric(x > 0)
  x
}

# Softmax over all entries of `x`: exp(x) / sum(exp(x)).
#
# The maximum is subtracted before exponentiating. That leaves the result
# unchanged mathematically but stops exp() overflowing to Inf (and the ratio
# becoming NaN) for large inputs. The dim attribute of `x` is preserved.
softmax <- function(x) {
  if (length(x) == 0L) {
    return(x)
  }
  shifted <- exp(x - max(x))
  shifted / sum(shifted)
}

# Jacobian of softmax at `x`: diag(p) - p %*% t(p), with p = softmax(x).
#
# `p` is flattened to a plain vector first. diag() on an n x 1 matrix
# *extracts* its single diagonal entry rather than building an n x n
# diagonal matrix, which made the old result wrong whenever `x` was a column
# matrix. `nrow = length(p)` also covers the length-1 case.
softmax_derivative <- function(x) {
  p <- as.vector(softmax(x))
  diag(p, nrow = length(p)) - tcrossprod(p)
}

# Predict iris species
# Shuffle the rows, then split 75 % / 25 % into train and test sets.
df <- iris
n <- nrow(df)
df <- df[sample.int(n), ]
n_train <- round(n * 0.75)
train_data <- df[seq_len(n_train), ]
# Start the test set one row after the last training row. The old split
# began at row n_train itself, so that row was in both sets.
test_data <- df[(n_train + 1):n, ]

# Standardise the four features with the training means and standard
# deviations.
means <- colMeans(train_data[, 1:4])
sds <- vapply(train_data[, 1:4], sd, numeric(1))
standardise <- function(features) {
  sweep(sweep(as.matrix(features), 2, means, "-"), 2, sds, "/")
}
x_train <- standardise(train_data[, 1:4])
x_test <- standardise(test_data[, 1:4])

# Training targets are one-hot columns (3 x n_train). Test targets are the
# class indices 0, 1, 2. Any label other than setosa/versicolor maps to the
# third class, as before.
species_index <- function(species) {
  idx <- match(as.character(species), c("setosa", "versicolor"))
  idx[is.na(idx)] <- 3L
  idx
}
y_train <- vapply(
  species_index(train_data[, 5]),
  function(k) as.numeric(seq_len(3) == k),
  numeric(3)
)
y_test <- species_index(test_data[, 5]) - 1

w_hidden <- matrix(runif(16), nrow = 4, ncol = 4) # 4x4
w_output <- matrix(runif(12), nrow = 3, ncol = 4) # 3x4

b_hidden <- matrix(runif(4), nrow = 4, ncol = 1) # 4x1
b_output <- matrix(runif(3), nrow = 1, ncol = 3) # 1x3

# Shapes for the 150-row iris data:
# x_train 112x4
# x_test 38x4
# y_train 3x112
# y_test 38

# Forward pass for one sample through the 4 -> 4 -> 3 network using the
# global weights `w_hidden`, `b_hidden`, `w_output` and `b_output`.
#
# x: feature vector of a single sample (length 4 for the iris data).
#
# Returns an unnamed list: hidden pre-activation, hidden activation, output
# pre-activation, softmax output.
forward_prop <- function(x) {
  hidden_pre <- w_hidden %*% x + b_hidden[, 1]
  hidden_act <- relu(hidden_pre)
  output_pre <- w_output %*% hidden_act + b_output[1, ]
  output_act <- softmax(output_pre)
  list(hidden_pre, hidden_act, output_pre, output_act)
}

# Accuracy of the untrained network on the test set.
test_pred <- apply(x_test, 1, forward_prop)
test_pred <- lapply(test_pred, function(x) x[[4]])
test_pred <- do.call(cbind, test_pred)
# Predicted class = index of the largest softmax output, shifted to 0-based.
test_comparison <- apply(test_pred, 2, function(x) {
  return(which.max(x) - 1)
})
# 1 marks a correct prediction. The flags were inverted before (0 for a
# match), so the value reported as accuracy was the error rate.
test_comparison <- ifelse(test_comparison == y_test, 1, 0)
accuracy <- sum(test_comparison) / length(y_test)
accuracy

# Backward pass for one sample of the 4 -> 4 -> 3 network.
#
# z1, a1: hidden pre-activation / activation (4x1).
# z2, a2: output pre-activation / softmax output (3x1).
# x: input sample (length 4); y: target (length 3 one-hot).
#
# Returns list(dc_dw1 [4x4], dc_db1 [4x1], dc_dw2 [3x4], dc_db2 [3x1]).
#
# The output error dc_dz2 is computed once and used for both layers. Before,
# the hidden-layer gradient was propagated from dc_da2 directly, skipping the
# softmax Jacobian that dc_dw2 / dc_db2 did apply, and the ReLU derivative
# was multiplied in twice.
#
# NOTE(review): `dc_da2 <- a2 - y` followed by the softmax Jacobian is the
# gradient of 0.5 * sum((a2 - y)^2). If cross-entropy is the intended loss,
# dc_dz2 should be `a2 - y` itself — confirm.
backward_prop <- function(z1, a1, z2, a2, x, y) {
  x <- as.matrix(x)                              # 4x1
  dc_da2 <- a2 - y                               # 3x1
  # A plain vector is passed so that diag() inside softmax_derivative()
  # builds a full diagonal matrix.
  da2_dz2 <- softmax_derivative(as.vector(z2))   # 3x3
  da1_dz1 <- relu_derivative(z1)                 # 4x1

  dc_dz2 <- t(da2_dz2) %*% dc_da2                # 3x1
  dc_dw2 <- dc_dz2 %*% t(a1)                     # 3x4
  dc_db2 <- dc_dz2                               # 3x1

  dc_dz1 <- (t(w_output) %*% dc_dz2) * da1_dz1   # 4x1
  dc_dw1 <- dc_dz1 %*% t(x)                      # 4x4
  dc_db1 <- dc_dz1                               # 4x1
  list(dc_dw1, dc_db1, dc_dw2, dc_db2)
}

# Smoke test: run a forward and backward pass on the first test sample and
# return the gradients invisibly.
test_backprop <- function() {
  res <- forward_prop(x_test[1, ])
  z1 <- as.matrix(res[[1]])
  a1 <- as.matrix(res[[2]])
  z2 <- as.matrix(res[[3]])
  a2 <- as.matrix(res[[4]])
  # y_test holds class indices 0/1/2, while backward_prop() is trained with
  # one-hot targets, so the index is expanded here.
  y_one_hot <- as.numeric(0:2 == y_test[1])
  invisible(backward_prop(z1, a1, z2, a2, x_test[1, ], y_one_hot))
}
test_backprop()

# Stochastic gradient descent: 20000 single-sample updates with step size L.
L <- 0.05
n <- nrow(x_train)
for (i in seq_len(20000)) {
  # One random training sample and its one-hot target column.
  idx <- sample.int(n, 1)
  x_sample <- x_train[idx, ]
  y_sample <- y_train[, idx]

  # Forward pass.
  res <- forward_prop(x_sample)
  z1 <- res[[1]]
  a1 <- res[[2]]
  z2 <- res[[3]]
  a2 <- res[[4]]

  # Backward pass.
  res <- backward_prop(z1, a1, z2, a2, x_sample, y_sample)
  dw1 <- res[[1]] # 4x4
  db1 <- res[[2]] # 4x1
  dw2 <- res[[3]] # 3x4
  db2 <- res[[4]] # 3x1

  # Move every parameter against its gradient.
  w_hidden <- w_hidden - L * dw1
  b_hidden <- b_hidden - L * db1[, 1]
  w_output <- w_output - L * dw2
  b_output <- b_output - L * db2[, 1]
}

# Accuracy of the trained network on the test set.
test_pred <- apply(x_test, 1, forward_prop)
test_pred <- lapply(test_pred, function(x) x[[4]])
test_pred <- do.call(cbind, test_pred)
# Predicted class = index of the largest softmax output, shifted to 0-based.
test_comparison <- apply(test_pred, 2, function(x) {
  return(which.max(x) - 1)
})
# 1 marks a correct prediction. The flags were inverted before (0 for a
# match), so the value reported as accuracy was the error rate.
test_comparison <- ifelse(test_comparison == y_test, 1, 0)
accuracy <- sum(test_comparison) / length(y_test)
accuracy

In [None]:
# Verification via keras
library(keras)
library(tensorflow)

# Sequential model for the four iris features: flatten, one dense ReLU layer,
# dropout (rate 0.2), then three output logits.
# NOTE(review): the hidden layer has a single unit, whereas the hand-written
# network in this file uses a 4x4 hidden weight matrix — confirm this is
# intended for a verification run.
model <- keras_model_sequential(input_shape = c(4))
model <- layer_flatten(model)
model <- layer_dense(model, 1, activation = "relu")
model <- layer_dropout(model, 0.2)
model <- layer_dense(model, 3)

# The last layer has no activation, so the loss is told to expect logits.
loss_fn <- loss_sparse_categorical_crossentropy(from_logits = TRUE)

compile(
  model,
  optimizer = "adam",
  loss = loss_fn,
  metrics = "accuracy"
)


# Shuffle the rows, then split 75 % / 25 % into train and test sets.
df <- iris
n <- nrow(df)
df <- df[sample.int(n), ]
n_train <- round(n * 0.75)
train_data <- df[seq_len(n_train), ]
# Start the test set one row after the last training row. The old split
# began at row n_train itself, so that row was in both sets.
test_data <- df[(n_train + 1):n, ]

# Standardise the four features with the training means and standard
# deviations.
x_train <- train_data[, 1:4]
x_test <- test_data[, 1:4]
means <- colMeans(x_train)
sds <- vapply(x_train, sd, numeric(1))
for (i in 1:4) {
  x_train[, i] <- (x_train[, i] - means[i]) / sds[i]
  x_test[, i] <- (x_test[, i] - means[i]) / sds[i]
}

# Species factor codes shifted to 0-based integer class labels.
y_train <- as.integer(train_data[, 5]) - 1
y_test <- as.integer(test_data[, 5]) - 1

# Plain numeric matrices without dimnames for keras.
x_train <- unname(as.matrix(x_train))
x_test <- unname(as.matrix(x_test))

history <- model %>% fit(x_train, y_train, epochs = 250)
plot(history)
model %>% evaluate(x_test, y_test, verbose = 2)