<a href="https://colab.research.google.com/github/Intertangler/ML4biotech/blob/main/linear_regression_exercise_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## exercise - linear regression
The dataset in this exercise simulates a large study in which each individual has a gene expression profile (the expression levels of many genes) linked to a measurement of their blood sugar level. The study is also longitudinal: individuals who later develop diabetes are recorded (let's say this is a high-risk group). The data has been engineered to contain some interesting structure, which we will explore over the next few lessons. For today, though, our task is to build a model that predicts blood sugar concentration from the expression data using multiple linear regression (one outcome, many predictors).

In [1]:
%load_ext rpy2.ipython

In [2]:
%%R

#First, let's import some multidimensional data and have a look at it. We will be
#using dataframes - basically like excel spreadsheets, with columns and rows.
#Try printing out the dataframe to examine its contents and its header labels.

library(readr)

# Read the data from the GitHub URL
url <- "https://raw.githubusercontent.com/Intertangler/ML4biotech/main/gene_profile_blood_sugar_diabetes_data.csv"

df <- read_csv(url)

# Extract data to the appropriate variables.
# The predictors are every column except the two label columns. They are
# excluded by name (rather than by position) so the selection does not depend
# on where the label columns sit in the file.
label_columns <- c("Pathogenic_Label", "Blood_Sugar")
all_samples <- as.matrix(df[, !(names(df) %in% label_columns)])
pathogenic_labels <- df$Pathogenic_Label
blood_sugar_levels <- df$Blood_Sugar



Rows: 3000 Columns: 402
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (402): gene_0, gene_1, gene_2, gene_3, gene_4, gene_5, gene_6, gene_7, g...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [3]:
%%R
# Inspect the dataframe: the column headers first, then the table itself
print(names(df))
print(df)

  [1] "gene_0"           "gene_1"           "gene_2"          
  [4] "gene_3"           "gene_4"           "gene_5"          
  [7] "gene_6"           "gene_7"           "gene_8"          
 [10] "gene_9"           "gene_10"          "gene_11"         
 [13] "gene_12"          "gene_13"          "gene_14"         
 [16] "gene_15"          "gene_16"          "gene_17"         
 [19] "gene_18"          "gene_19"          "gene_20"         
 [22] "gene_21"          "gene_22"          "gene_23"         
 [25] "gene_24"          "gene_25"          "gene_26"         
 [28] "gene_27"          "gene_28"          "gene_29"         
 [31] "gene_30"          "gene_31"          "gene_32"         
 [34] "gene_33"          "gene_34"          "gene_35"         
 [37] "gene_36"          "gene_37"          "gene_38"         
 [40] "gene_39"          "gene_40"          "gene_41"         
 [43] "gene_42"          "gene_43"          "gene_44"         
 [46] "gene_45"          "gene_46"          "gene_47"  

In [None]:
%%R
# Next, let's run a visualization of our data. First a matrix displaying genes vs
# individuals in our dataset, with the brightness of each pixel indicating the
# expression level. Then we will make a histogram showing the distribution of
# blood sugar levels in our dataset. In addition, we will color each bar according
# to the frequency of patients who develop diabetes later in life - the longitudinal
# part of this data.

library(ggplot2)
library(dplyr)
library(RColorBrewer)

# Uses `df`, `blood_sugar_levels` and `pathogenic_labels` created in the
# data-loading cell.

# Plot the heatmap of log expression values (one row per row of df, one column
# per gene column). Row/column reordering, scaling and labels are switched off.
log_expression <- as.matrix(log(df %>% select(-Pathogenic_Label, -Blood_Sugar)))
heatmap(
  log_expression,
  Rowv = NA, Colv = NA, scale = "none",
  col = brewer.pal(9, "Greys"), labRow = NA, labCol = NA
)

# Define the number of bins and get the bin edges.
# hist() treats a single number of breaks as a suggestion, so the actual bins
# are always taken from hist_data rather than from num_bins.
num_bins <- 50
hist_data <- hist(blood_sugar_levels, breaks = num_bins, plot = FALSE)

# Calculate the proportion of pathogenic individuals in each bin.
# cut() assigns every individual the index of the histogram bin it falls in;
# the mean of the labels within a bin is that bin's proportion (NaN if empty).
bin_labels <- cut(blood_sugar_levels, breaks = hist_data$breaks, include.lowest = TRUE, labels = FALSE)
proportions <- vapply(
  seq_along(hist_data$counts),
  function(i) mean(pathogenic_labels[bin_labels == i]),
  numeric(1)
)

# Build a colour ramp with one colour per bin for the fill gradient
colormap <- colorRampPalette(brewer.pal(9, "RdBu"))
bin_colors <- colormap(length(proportions))

# Collect the per-bin values in one data frame so aes() can refer to columns
hist_df <- data.frame(
  mid = hist_data$mids,
  count = hist_data$counts,
  proportion = proportions
)
bin_width <- diff(hist_data$breaks)[1]

# Plotting histogram with color indicating the proportion of pathogenic individuals
ggplot(hist_df, aes(x = mid, y = count, fill = proportion)) +
  geom_col(width = bin_width) +
  scale_fill_gradientn(colors = rev(bin_colors), name = "Diabetes proportion") +
  labs(x = 'Blood Sugar Levels (mg/dl)', y = 'Frequency') +
  theme_minimal()


In [None]:
%%R
# Install the add-on packages that the following cells load with library().
# optimx is included because the next cell calls library(optimx); packages
# that are already available are skipped so re-running this cell is cheap.
required_packages <- c("optimx", "caret")
is_installed <- vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

In [None]:
%%R

# Use this guide to help you complete the linear algebra functions needed to do the
# normal equations part of the next section
library(optimx)
library(MASS)
library(caret)
# Two 2x2 example matrices (each c(...) is one row) and a length-2 vector
Matrix1 <- rbind(c(3, 2), c(4, 1))
Matrix2 <- rbind(c(5, 7), c(6, 8))
Vector <- c(9, 10)

# Matrix product of two matrices: %*% is matrix multiplication,
# whereas a plain * would multiply element by element
dot_product_matrices <- Matrix1 %*% Matrix2
print("Dot Product of Matrix1 and Matrix2:")
print(dot_product_matrices)

# Matrix product of a matrix and a vector (the vector acts as a column)
dot_product_matrix_vector <- Matrix1 %*% Vector
print("Dot Product of Matrix1 and Vector:")
print(dot_product_matrix_vector)

# Transpose: t() swaps rows and columns
transpose_Matrix1 <- t(Matrix1)
print("Transpose of Matrix1:")
print(transpose_Matrix1)

# Inverse: solve() with a single matrix argument returns its inverse
inverse_Matrix1 <- solve(Matrix1)
print("Inverse of Matrix1:")
print(inverse_Matrix1)



### Complete the missing lines below to perform linear regression and predict blood sugar levels from individuals' gene expression profiles

In [None]:
%%R
library(caret)
library(ggplot2)

#' Fit a linear regression by solving the normal equations (exercise template).
#'
#' @param X Predictor matrix; a column of ones is prepended inside the function
#'   so that the model includes an intercept.
#' @param y Outcome values to regress on the predictors.
#' @return A list with one element, `theta`, holding the estimated parameters.
fit_normal_equations <- function(X, y) {
  X_b <- cbind(1, X)  # Add a column of ones to X to include an intercept in the model

  # Solve the normal equations
  # NOTE: the assignment below is intentionally left unfinished. Until an
  # expression is written after `<-`, R continues the statement onto the next
  # code line, so theta would receive the value of the print() call that follows.
  theta <-  #🌟🌟🌟🌟 YOUR CODE HERE 🌟🌟🌟🌟#  theta = (X_b^T X_b)^-1 X_b^T y, built from t(), %*% and solve() on X_b and y

  # Report the fitted parameters
  print("Estimated parameters:")
  print("Theta:")
  print(theta)

  return(list(theta = theta))
}

#' Predict outcomes from fitted linear-regression parameters.
#'
#' @param X Predictor matrix without an intercept column.
#' @param theta Parameter vector whose first entry multiplies the intercept
#'   column of ones that is prepended to `X`.
#' @return A one-column matrix with one prediction per row of `X`.
predict_normal <- function(X, theta) {
  # Prepend the intercept column so the columns line up with theta
  design <- cbind(1, X)
  design %*% theta
}

# set up predictors
X <- all_samples  # input: gene expression matrix built in the data-loading cell
y <- blood_sugar_levels  # outcome variable

# Check dimensions and lengths
# dim(X) has two elements, so it is collapsed first; pasting it directly would
# print one message per dimension instead of a single "rows x columns" line.
print(paste('Dimensions of X:', paste(dim(X), collapse = ' x ')))
print(paste('Length of y:', length(y)))

# Every row of X needs exactly one outcome value
stopifnot(nrow(X) == length(y))

# Split the data into training and test sets
set.seed(42)  # fixed seed so the random split is the same on every run
trainIndex <- #🌟🌟🌟🌟 YOUR CODE HERE 🌟🌟🌟🌟#   use the createDataPartition() function on y; pass list = FALSE so the result can be used directly as row indices below

# Check the length of trainIndex
print(paste('Length of trainIndex:', length(trainIndex)))

# Now try indexing
# Rows listed in trainIndex form the training set; negative indexing drops
# those rows, leaving the remaining rows as the test set.
X_train <- X[trainIndex,]
X_test <- X[-trainIndex,]
y_train <- y[trainIndex]
y_test <- y[-trainIndex]

# Fit the regression parameters on the training split via the normal equations
model <- fit_normal_equations(X_train, y_train)

# Predict blood sugar for the held-out test individuals
y_pred <- predict_normal(X_test, model[["theta"]])

# Residual sum of squares: squared prediction errors summed over the test set
rss <- sum((y_test - y_pred)^2)

# Total sum of squares around the test-set mean, then R-squared as the
# fraction of that variation accounted for by the predictions
tss <- sum((y_test - mean(y_test))^2)
r2 <- 1 - rss / tss

print(paste('Residual Sum of Squares:', rss))
print(paste('R-squared:', r2))

# Scatter plot of predicted against true values
ggplot() +
  geom_point(aes(x = y_test, y = y_pred)) +
  labs(
    x = 'True Blood Sugar Level (mg/dl)',
    y = 'Predicted (mg/dl)',
    title = 'True vs Predicted Blood Sugar Levels'
  )

