<a href="https://colab.research.google.com/github/Intertangler/ML4biotech/blob/main/cb206v_exercise6_deepneuralnetworks_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## import the data
This artificial data set represents the (already normalized) expression levels of two separate genes. The class label associated with each data point indicates the presence (1) or absence (0) of a particular downstream phenotype influenced by the genes. Our goal here is to detect a nonlinear relationship between the two gene expression levels that strongly correlates with the downstream phenotype.

In [None]:
library(readr)
# Location of the two-gene expression data set (CSV hosted on GitHub)
url <- "https://raw.githubusercontent.com/Intertangler/ML4biotech/main/gene_expression_XOR.csv"

# Read the table, then separate the final column (kept in y) from all the
# remaining columns (kept in X). Single-bracket indexing keeps both as tables.
df <- read_csv(url)
y <- df[ncol(df)]
X <- df[-ncol(df)]


In [None]:
# Install only the packages that are not available yet, so that re-running
# this cell does not reinstall everything from scratch.
required_pkgs <- c("caret", "caTools", "pROC")
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}
library(readr)
library(dplyr)
# Load required packages
library(caTools)
library(pROC)
# Plotting
# This part would go into a separate block if you are running in R Studio or similar
library(ggplot2)

In [None]:
# Download the data set; every column except the last is treated as a feature
# and the last column is treated as the class label.
url <- "https://raw.githubusercontent.com/Intertangler/ML4biotech/main/gene_expression_XOR.csv"
df <- read_csv(url)

X <- df %>% select(-last_col())
y <- df %>% select(last_col())

# Convert data frames to matrices or vectors as needed
X <- as.matrix(X)
y <- as.vector(y[[1]])

# Seed the random number generator BEFORE the first random operation so that
# both the shuffle and the train/test split below are reproducible.
set.seed(428)

# Shuffle data and labels with the same permutation so rows stay aligned
shuffle_idx <- sample(seq_len(nrow(X)), replace = FALSE)
X <- X[shuffle_idx, , drop = FALSE]
y <- y[shuffle_idx]

# Split data into training/testing subsets (80% / 20%);
# `split` is a logical vector with TRUE marking training rows.
split <- sample.split(y, SplitRatio = 0.8)
X_train <- X[split, , drop = FALSE]
y_train <- y[split]
X_test <- X[!split, , drop = FALSE]
y_test <- y[!split]

# Prepare the data frame for ggplot (first two feature columns + class label)
data_plot <- data.frame(
  X1 = X_train[, 1],
  X2 = X_train[, 2],
  Class = as.factor(y_train)
)

# Scatter plot of the training points, coloured by class
ggplot(data = data_plot, aes(x = X1, y = X2, color = Class)) +
  geom_point() +
  xlab("Gene Expression 1") +
  ylab("Gene Expression 2") +
  scale_color_manual(name = "Phenotype", values = c("0" = "red", "1" = "blue")) +
  theme_minimal() +
  theme(legend.position = "bottom")

## exercise
Complete the missing lines.
Use the Keras library to construct a deep neural network: define the model type, its architecture (the number of layers and nodes), and the activation function. Compile the model with an appropriate loss function and learning-rate scheduling mechanism. (Note: the R code cell below loads the `nnet` package and expects the fitted model to be stored in a variable named `mlp_model`.)

The code after this section will then fit the model to the training input and output data. A test data set that has been set aside will then be used to score the performance with an ROC curve. Compare the performance of your model to that of a logistic regression. If things have been set up correctly, the neural network should outperform the logistic regression.


In [None]:
# load required packages
library(nnet)
library(pROC)
library(caTools)
library(ggplot2)
library(dplyr)

# set seed for reproducibility
set.seed(428)


#🌟🌟🌟🌟 YOUR CODE HERE 🌟🌟🌟🌟#  # define the neural network model with 20 hidden units and a sigmoid output





# make probability predictions on the test set
# NOTE: assuming mlp_model is an nnet fit, predict(type = "raw") yields a
# matrix; roc() wants a plain numeric vector, so flatten it explicitly.
mlp_prob <- as.numeric(predict(mlp_model, X_test, type = "raw"))

# generate ROC data for MLP
# levels/direction are fixed (0 = control, 1 = case, higher score = case) so
# that pROC cannot auto-flip the direction and inflate a below-chance AUC.
roc_data_mlp <- roc(y_test, mlp_prob, levels = c(0, 1), direction = "<")
mlp_fpr <- 1 - roc_data_mlp$specificities
mlp_tpr <- roc_data_mlp$sensitivities
mlp_auc <- auc(roc_data_mlp)

# train a logistic regression model for comparison
logreg_model <- glm(
  y_train ~ .,
  data = data.frame(y_train, X_train),
  family = binomial
)

# make probability predictions on the test set using logistic regression
logreg_prob <- predict(
  logreg_model,
  newdata = data.frame(X_test),
  type = "response"
)

# generate ROC data for logistic regression (same fixed levels/direction as
# the MLP so the two AUC values are directly comparable)
roc_data_logreg <- roc(y_test, logreg_prob, levels = c(0, 1), direction = "<")
logreg_fpr <- 1 - roc_data_logreg$specificities
logreg_tpr <- roc_data_logreg$sensitivities
logreg_auc <- auc(roc_data_logreg)

# data frame for ggplot holding the exact ROC points of both models;
# the curves may have different numbers of points, so stack them row-wise
# instead of resampling them to a common length.
roc_df <- rbind(
  data.frame(
    FPR = logreg_fpr,
    TPR = logreg_tpr,
    Model = "Logistic Regression"
  ),
  data.frame(
    FPR = mlp_fpr,
    TPR = mlp_tpr,
    Model = "MLP"
  )
)
roc_df$Model <- factor(roc_df$Model)

# ROC plot
# geom_path() connects points in row order (the order pROC returns them);
# geom_line() would re-sort by FPR and draw zig-zags where FPR values tie.
ggplot(roc_df, aes(x = FPR, y = TPR, color = Model)) +
  geom_path() +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "grey") +
  xlab("False Positive Rate") +
  ylab("True Positive Rate") +
  ggtitle(sprintf("ROC Curve\nLogistic AUC = %0.2f, MLP AUC = %0.2f", logreg_auc, mlp_auc)) +
  theme_minimal() +
  theme(legend.position = "bottom")

