<a href="https://colab.research.google.com/github/Intertangler/ML4biotech/blob/main/cb206v_exercise4_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## exercise - linear regression
The dataset in this exercise was generated to simulate a large study in which each individual has a gene expression profile (expression levels for multiple genes) linked to a measurement of their blood sugar level. The study is also longitudinal: those individuals who later develop diabetes are recorded (let's say this is a high-risk group). The data has been engineered to contain some interesting structure, which we will explore over the next few lessons. For today, though, our task is to build a model that predicts the blood sugar concentration from the expression data using multivariate linear regression.

In [None]:
# Start by importing some multidimensional data and inspecting it. The data is
# held in a dataframe - conceptually a spreadsheet made of rows and columns.
# Printing the dataframe is a good way to examine its contents and header labels.

library(readr)

# Location of the CSV file in the GitHub repository
url <- "https://raw.githubusercontent.com/Intertangler/ML4biotech/main/gene_profile_blood_sugar_diabetes_data.csv"

df <- read_csv(url)

# Split the table into the variables used throughout the exercise.
# The predictor matrix keeps every column except the final two
# (presumably the Blood_Sugar and Pathogenic_Label columns - confirm with
# colnames(df)).
blood_sugar_levels <- df$Blood_Sugar
pathogenic_labels <- df$Pathogenic_Label
all_samples <- as.matrix(df[, head(seq_len(ncol(df)), -2)])



In [None]:
# Inspect the dataframe: the header labels first, then the table itself
print(names(df))
print(df)

In [None]:
# Next, let's run a visualization of our data. First a matrix displaying genes vs
# individuals in our dataset, with the shade of each pixel indicating the (log)
# expression level. Then we will make a histogram showing the distribution of
# blood sugar levels in our dataset. In addition, we will color each bar according
# to the proportion of patients in that bin who develop diabetes later in life -
# the longitudinal part of this data.

library(ggplot2)
library(dplyr)
library(RColorBrewer)

# This cell expects `df`, `blood_sugar_levels` and `pathogenic_labels` to have
# been created already by the data-loading cell.

# Plot the heatmap of log-transformed expression values. Row/column clustering
# and rescaling are switched off so the matrix is shown as-is.
expression_matrix <- df %>%
  select(-Pathogenic_Label, -Blood_Sugar) %>%
  as.matrix()
heatmap(
  log(expression_matrix),
  Rowv = NA,
  Colv = NA,
  scale = "none",
  col = brewer.pal(9, "Greys"),
  labRow = NA,
  labCol = NA
)

# Define the number of bins and get the bin edges.
# Note: a single number passed to `breaks` is only a suggestion to hist(), so
# the actual number of bins may differ from num_bins.
num_bins <- 50
hist_data <- hist(blood_sugar_levels, breaks = num_bins, plot = FALSE)

# Assign every individual to a bin, then calculate the proportion of pathogenic
# individuals in each bin. Empty bins yield NaN (mean of no values).
bin_labels <- cut(
  blood_sugar_levels,
  breaks = hist_data$breaks,
  include.lowest = TRUE,
  labels = FALSE
)
proportions <- vapply(
  seq_along(hist_data$counts),
  function(i) mean(pathogenic_labels[bin_labels == i]),
  numeric(1)
)

# Build a palette with one color per bin for the fill gradient
colormap <- colorRampPalette(brewer.pal(9, "RdBu"))
bin_colors <- colormap(length(proportions))

# Collect the per-bin values in a data frame so that aes() refers to columns
# rather than to `$`-extracted vectors.
hist_df <- data.frame(
  mid = hist_data$mids,
  count = hist_data$counts,
  proportion = proportions
)
bin_width <- diff(hist_data$breaks)[1]

# Plotting histogram with color indicating the proportion of pathogenic individuals
ggplot(hist_df, aes(x = mid, y = count, fill = proportion)) +
  geom_bar(stat = "identity", width = bin_width) +
  scale_fill_gradientn(colors = rev(bin_colors), name = "Diabetes proportion") +
  labs(x = 'Blood Sugar Levels (mg/dl)', y = 'Frequency') +
  theme_minimal()


In [None]:
# Install the packages needed for the following sections. Packages that are
# already available are skipped, so re-running this cell does not reinstall them.
required_packages <- c("optimx", "caret")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (any(!is_installed)) {
  install.packages(required_packages[!is_installed])
}

In [None]:
# Use this guide to help you complete the linear algebra functions needed to do the
# normal equations part of the next section

# Define a 3x2 matrix and a 2x2 matrix for matrix multiplication
Matrix1 <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 3, ncol = 2, byrow = TRUE)  # 3x2 matrix
Matrix2 <- matrix(c(7, 8, 9, 10), nrow = 2, ncol = 2, byrow = TRUE)       # 2x2 matrix
Vector <- c(11, 12)                                                       # 2-element vector
Vector1 <- c(1, 2, 3)                                                     # 3-element vector for dot product
Vector2 <- c(4, 5, 6)                                                     # 3-element vector for dot product

# Matrix multiplication of two matrices: (3x2) %*% (2x2) gives a 3x2 result
matmul_matrices <- Matrix1 %*% Matrix2
cat("\nMatrix Multiplication of Matrix1 (3x2) and Matrix2 (2x2):\n")
print(matmul_matrices)

# Matrix-vector multiplication: the vector is treated as a 2x1 column
matmul_matrix_vector <- Matrix1 %*% Vector
cat("\nMatrix-Vector Multiplication of Matrix1 (3x2) and Vector (2-element):\n")
print(matmul_matrix_vector)

# Dot product of two vectors: multiply elementwise, then sum
dot_product <- sum(Vector1 * Vector2)
cat("\nDot Product of Vector1 and Vector2 (3-element vectors):\n")
print(dot_product)

# Transpose of Matrix1: rows become columns, giving a 2x3 matrix
transpose_Matrix1 <- t(Matrix1)
cat("\nTranspose of Matrix1 (3x2):\n")
print(transpose_Matrix1)

# Inverse of a 2x2 matrix (for example purposes, using Matrix2).
# solve() with a single argument returns the matrix inverse.
inverse_Matrix2 <- solve(Matrix2)
cat("\nInverse of Matrix2 (2x2):\n")
print(inverse_Matrix2)


### complete the missing lines below to perform linear regression and predict blood sugar level on the basis of individuals' gene expression profile

In [None]:
library(caret)
library(ggplot2)

#' Fit a linear regression model by solving the normal equations.
#'
#' @param X Numeric matrix of predictors with one row per sample. The
#'   intercept column is added inside this function, so do not include it.
#' @param y Numeric vector of outcomes, one entry per row of `X`.
#' @return A one-column matrix `theta`: the intercept first, followed by one
#'   coefficient per column of `X`.
fit_normal_equations <- function(X, y) {
    # Every sample needs exactly one outcome value
    stopifnot(NROW(X) == NROW(y))

    # Add a column of ones to X, this will be for the intercept values
    X <- cbind(1, as.matrix(X))

    # Step 1: Compute X^T (transpose of X)
    X_transposed <- t(X)

    # Step 2: Compute X^T %*% X (matrix multiplication)
    XtX <- X_transposed %*% X

    # Step 4: Compute X^T %*% y (matrix multiplication)
    Xty <- X_transposed %*% y

    # Steps 3 and 5: Solve the normal equations (X^T X) theta = X^T y.
    # This is equivalent to theta = (X^T %*% X)^-1 %*% X^T %*% y, but
    # solve(A, b) avoids forming the inverse explicitly.
    # solve() raises an error if X^T X is singular.
    theta <- solve(XtX, Xty)

    print("Estimated parameters:")
    print("Theta:")
    print(theta)

    return(theta)
}


#' Predict outcomes from a fitted parameter vector.
#'
#' @param X Matrix of predictors, one row per sample (without an intercept column).
#' @param theta Fitted parameters; the first entry multiplies the column of ones.
#' @return The matrix product of the augmented predictors and `theta`.
predict_normal <- function(X, theta) {
  # Prepend a column of ones so the first entry of theta acts as the intercept
  design <- cbind(1, X)
  # Predictions are the augmented predictors times the fitted parameters
  design %*% theta
}

# set up predictors
X <- all_samples  # input
y <- blood_sugar_levels # outcome variable

# Check dimensions and lengths.
# dim(X) has two entries, so collapse them into a single "rows x cols" string
# instead of letting paste() produce one message per entry.
print(paste('Dimensions of X:', paste(dim(X), collapse = ' x ')))
print(paste('Length of y:', length(y)))

# Split the data into training and test sets using createDataPartition().
# 80% of the samples go to the training set; the seed makes the split
# reproducible between runs. as.vector() turns the one-column index matrix
# returned with list = FALSE into a plain index vector.
set.seed(42)
trainIndex <- as.vector(createDataPartition(y, p = 0.8, list = FALSE))

# Check the length of trainIndex
print(paste('Length of trainIndex:', length(trainIndex)))

# Now try indexing. drop = FALSE keeps X_train / X_test as matrices even if a
# split happens to contain a single row.
X_train <- X[trainIndex, , drop = FALSE]
X_test <- X[-trainIndex, , drop = FALSE]
y_train <- y[trainIndex]
y_test <- y[-trainIndex]

# train the model with the data using the normal equations
model <- fit_normal_equations(X_train, y_train)

# Make predictions on test set using `model`, which contains the `theta` values.
# The prediction comes back as a one-column matrix; flatten it to a vector so
# that it lines up with y_test in the arithmetic and the plot below.
y_pred <- as.vector(predict_normal(X_test, model))

# Calculate the RSS (residual sum of squares)
rss <- sum((y_test - y_pred)^2)

# Calculate the r-squared as 1 - RSS / TSS, where TSS is the total sum of
# squares around the mean of the test outcomes
tss <- sum((y_test - mean(y_test))^2)
r2 <- 1 - (rss / tss)

print(paste('Residual Sum of Squares:', rss))
print(paste('R-squared:', r2))

# plotting true vs predicted
results_df <- data.frame(true_value = y_test, predicted_value = y_pred)
ggplot(results_df, aes(x = true_value, y = predicted_value)) +
  geom_point() +
  xlab('True Blood Sugar Level (mg/dl)') +
  ylab('Predicted (mg/dl)') +
  ggtitle('True vs Predicted Blood Sugar Levels')

