<a href="https://colab.research.google.com/github/Intertangler/ML4biotech/blob/main/cb206v_exercise4_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## exercise - linear regression
The dataset in this exercise was generated to simulate a large study in which each individual has a gene expression profile (expression levels for multiple genes) linked to a measurement of their blood sugar level. The study is also longitudinal: those individuals who later develop diabetes are recorded (let's say this is a high-risk group). The data has been engineered to contain some interesting structure, which we will explore over the next few lessons. For today, though, our task is to build a model that predicts blood sugar concentration from the expression data using multivariate linear regression.

In [None]:
"""
First, let's import some multidimensional data and have a look at it. We will be
using dataframes - basically like excel spreadsheets, with columns and rows.
Try printing out the dataframe to examine its contents and its header labels.
"""
import pandas as pd
# Raw CSV file hosted in the course repository.
url = "https://raw.githubusercontent.com/Intertangler/ML4biotech/main/gene_profile_blood_sugar_diabetes_data.csv"

# Parse the CSV into a pandas DataFrame (a spreadsheet-like table).
df = pd.read_csv(url)

# Every column except the last two, transposed into a NumPy array.
# NOTE(review): presumably the last two columns are 'Pathogenic_Label' and
# 'Blood_Sugar', so this keeps only the gene columns -- confirm against the CSV.
all_samples = df.iloc[:, :-2].to_numpy().T

# Label and target columns as plain NumPy arrays.
Pathological = df.loc[:, 'Pathogenic_Label'].to_numpy()
blood_sugar_levels = df.loc[:, 'Blood_Sugar'].to_numpy()

In [None]:
"""
have a look at the dataset
"""
# A bare expression on the last line of a notebook cell is rendered as the
# cell's output, so this displays the dataframe.
df

In [None]:
"""
Next, let's run a visualization of our data. First a matrix displaying genes vs
individuals in our dataset, with the brightness of each pixel indicating the
expression level. Then we will make a histogram showing the distribution of
blood sugar levels in our dataset. In addition, we will color each bar according
to the frequency of patients who develop diabetes later in life - the longitudinal
part of this data.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from scipy.stats import skewnorm

# Plot the heatmap: the two label columns are dropped, the table is transposed
# so the remaining columns become rows, and values are log-transformed.
plt.rcParams.update({'font.size': 40})
f = plt.figure()
f.set_figwidth(12)
f.set_figheight(10)
sns.heatmap(df.drop(['Pathogenic_Label', 'Blood_Sugar'], axis=1).T.apply(np.log), cmap="Greys")
plt.show()

# Create predictors
X = all_samples.T

# Define the number of bins and get the bin edges
num_bins = 50
hist, bin_edges = np.histogram(blood_sugar_levels, bins=num_bins)

# Assign every sample a 1-based bin label.
# np.histogram closes the last bin on the right, but np.digitize does not:
# the maximum value would get label num_bins + 1 and silently drop out of the
# last bin's proportion. Clipping puts it back where np.histogram counted it.
bin_labels = np.clip(np.digitize(blood_sugar_levels, bins=bin_edges), 1, num_bins)

# Proportion of pathogenic individuals in each bin. Empty bins get NaN
# explicitly (instead of taking the mean of an empty array, which emits a
# RuntimeWarning); their bars have zero height, so their colour is never seen.
proportions = []
for i in range(1, num_bins + 1):
    in_bin = bin_labels == i
    proportions.append(Pathological[in_bin].mean() if in_bin.any() else np.nan)

# Get a colormap instance and map the proportions to colors.
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in
# Matplotlib 3.9.
cmap = plt.get_cmap('coolwarm')
bin_colors = cmap(np.asarray(proportions, dtype=float))

# Plotting histogram with color indicating the proportion of pathogenic individuals.
# align='edge' is required because the x positions are the LEFT bin edges;
# the default (centre alignment) would shift every bar half a bin to the left.
plt.figure()
plt.bar(bin_edges[:-1], hist, width=np.diff(bin_edges), align='edge',
        color=bin_colors, edgecolor='white')
plt.xlabel('Blood Sugar Levels (mg/dl)')
plt.ylabel('Frequency')
plt.grid(axis='y')
# The ScalarMappable is not attached to any Axes, so the target Axes must be
# passed explicitly; the 0-1 norm matches how `proportions` were fed to cmap.
plt.colorbar(plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(0, 1)),
             ax=plt.gca(), label='Diabetes proportion')
plt.show()

In [None]:
"""
Use this guide to help you complete the linear algebra functions needed to do the
normal equations part of the next section
"""
import numpy as np

# Example operands used by the demonstrations below
Matrix1 = np.array([[1, 2], [3, 4], [5, 6]])  # 3x2 matrix
Matrix2 = np.array([[7, 8], [9, 10]])         # 2x2 matrix
Vector = np.array([11, 12])                   # 2-element 1-D vector, shape (2,)
Vector1 = np.array([1, 2, 3])                 # 3-element vector for dot product (not used in the examples below)
Vector2 = np.array([4, 5, 6])                 # 3-element vector for dot product (not used in the examples below)

# Matrix multiplication of two matrices using matmul: (3x2) times (2x2) gives a 3x2 result
matmul_matrices = np.matmul(Matrix1, Matrix2)
print("\nMatrix Multiplication of Matrix1 (3x2) and Matrix2 (2x2):\n", matmul_matrices)

# Matrix-vector multiplication using matmul: (3x2) times a 2-element vector gives a 3-element vector
matmul_matrix_vector = np.matmul(Matrix1, Vector)
print("\nMatrix-Vector Multiplication of Matrix1 (3x2) and Vector (2-element):\n", matmul_matrix_vector)

# Transpose of Matrix1: rows and columns are swapped, so the result is 2x3
transpose_Matrix1 = np.transpose(Matrix1)
print("\nTranspose of Matrix1 (3x2):\n", transpose_Matrix1)

# Inverse of a 2x2 matrix (for example purposes, using Matrix2).
# Only square matrices can be inverted, which is why Matrix2 is used here rather than Matrix1.
inverse_Matrix2 = np.linalg.inv(Matrix2)
print("\nInverse of Matrix2 (2x2):\n", inverse_Matrix2)


### Complete the missing lines below to perform linear regression and predict blood sugar levels from individuals' gene expression profiles

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import numpy as np

def fit_normal_equations(X, y):
    """Estimate linear-regression coefficients with the normal equations.

    EXERCISE STUB: the lines that compute ``theta`` are left for the student.
    Until they are filled in, ``theta`` is never assigned inside this function
    and calling it raises a NameError at ``print(theta)``.

    Parameters:
        X: array of predictors; must expose ``.shape`` (presumably
           (n_samples, n_features) -- confirm against the caller). A column
           of ones is prepended to it for the intercept.
        y: target values (presumably one per row of X -- confirm).

    Returns:
        theta: the estimated coefficients, which are also printed. Because the
        ones column is placed first, the first coefficient is the intercept.
    """
    # Add a column of ones to X, this will be for the intercept values
    X = np.c_[np.ones((X.shape[0], 1)), X]

    # Write each step below at this indentation level (inside the function).
    #🌟🌟🌟🌟 YOUR CODE HERE 🌟🌟🌟🌟## Step 1: Compute X^T (transpose of X)
    #🌟🌟🌟🌟 YOUR CODE HERE 🌟🌟🌟🌟## Step 2: Compute X^T * X (matrix multiplication)
    #🌟🌟🌟🌟 YOUR CODE HERE 🌟🌟🌟🌟## Step 3: Compute the inverse of (X^T * X)
    #🌟🌟🌟🌟 YOUR CODE HERE 🌟🌟🌟🌟## Step 4: Compute X^T * y (matrix multiplication)
    #🌟🌟🌟🌟 YOUR CODE HERE 🌟🌟🌟🌟## Step 5: Solve the normal equations, theta = (X^T * X)^(-1) * X^T * y

    print("Estimated parameters:")
    print("Theta:")
    print(theta)

    return theta



def predict_normal(X, theta):
    """Return linear-model predictions for the rows of X.

    X is extended with a leading column of ones, so the first entry of
    ``theta`` is used as the intercept and the remaining entries as the
    per-column coefficients. The result is the matrix product of that
    extended matrix with ``theta``.
    """
    # One intercept entry (1.0) per row of X.
    intercept_column = np.ones((X.shape[0], 1))

    # Bias-augmented matrix: intercept column first, then the predictors.
    augmented = np.column_stack((intercept_column, X))

    return augmented @ theta

# Predictors (the transpose of all_samples) and the regression target
X = all_samples.T
y = blood_sugar_levels

# Split data into training and test subsets. Use the train_test_split() function
# NOTE: the lines below use X_train, X_test, y_train and y_test, which this
# cell only defines once the assignment on the next line is completed.
#🌟🌟🌟🌟 YOUR CODE HERE 🌟🌟🌟🌟# X_train, X_test, y_train, y_test =

# Train model using  normal equations
theta = fit_normal_equations(X_train, y_train)

# predictions on the test set
y_pred = predict_normal(X_test, theta)

# rss: residual sum of squares between true and predicted test values
rss = np.sum((y_test - y_pred)**2)

# r-squared value: 1 - RSS/TSS, where TSS is the total sum of squares of the
# test targets around their own mean
tss = np.sum((y_test - np.mean(y_test))**2)
r2 = 1 - (rss / tss)

print(f'Residual Sum of Squares: {rss}')
print(f'R-squared: {r2}')

# Plot true vs predicted (true values on the x axis, predictions on the y axis)
plt.scatter(y_test, y_pred)
plt.xlabel('True Blood Sugar Level (mg/dl)')
plt.ylabel('Predicted (mg/dl)')
plt.title('True vs Predicted Blood Sugar Levels')
plt.show()


## extra - doing linear regression with MLE by iterative numerical optimization (`scipy.optimize.minimize`)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from scipy.optimize import minimize

def fit_mle(X, y):
    """Fit a Gaussian linear model by maximum likelihood.

    The model is y = X_b . theta + noise, with noise ~ N(0, sigma^2), where
    X_b is X with a leading column of ones (so theta[0] is the intercept).
    The negative log-likelihood is minimised numerically with
    scipy.optimize.minimize, starting from theta = 0 and sigma^2 = 1.

    Parameters:
        X: predictors, one row per sample.
        y: targets, one value per sample (any shape that flattens to length m).

    Returns:
        (theta, sigma_squared_estimated): theta is a column vector of shape
        (n_features + 1, 1); sigma_squared_estimated is the estimated noise
        variance. Both are also printed.
    """
    y = np.asarray(y, dtype=float).ravel()
    m = len(y)
    X_b = np.c_[np.ones((m, 1)), X]

    def neg_log_likelihood(params):
        """Return the negative log-likelihood and its gradient."""
        theta = params[:-1]
        # The variance is optimised on the log scale: sigma^2 = exp(s) is
        # positive for every real s, so the problem is truly unconstrained
        # and no "return inf" barrier (which breaks gradients) is needed.
        log_sigma_squared = params[-1]
        inv_sigma_squared = np.exp(-log_sigma_squared)

        residuals = y - X_b.dot(theta)
        rss = residuals.dot(residuals)

        nll = (0.5 * m * (np.log(2 * np.pi) + log_sigma_squared)
               + 0.5 * inv_sigma_squared * rss)

        # Analytic gradient: d/dtheta = -X_b^T r / sigma^2,
        #                    d/ds     = m/2 - RSS / (2 sigma^2).
        grad = np.empty_like(params)
        grad[:-1] = -inv_sigma_squared * X_b.T.dot(residuals)
        grad[-1] = 0.5 * m - 0.5 * inv_sigma_squared * rss
        return nll, grad

    # Initial guess: all coefficients 0 and log(sigma^2) = 0, i.e. sigma^2 = 1
    init_params = np.zeros(X_b.shape[1] + 1)

    # Optimize the nll function; jac=True tells minimize that the objective
    # returns (value, gradient)
    result = minimize(neg_log_likelihood, init_params, jac=True)

    # Extract parameters, mapping the variance back from the log scale
    theta = result.x[:-1].reshape(-1, 1)
    sigma_squared_estimated = np.exp(result.x[-1])

    print("Estimated parameters:")
    print("Theta:", theta)
    print("Sigma squared:", sigma_squared_estimated)

    return theta, sigma_squared_estimated

def predict(X, theta):
    """Return X_b . theta, where X_b is X with a leading column of ones.

    The ones column comes first, so the first entry of ``theta`` acts as the
    intercept.
    """
    intercept_column = np.ones((len(X), 1))
    augmented = np.column_stack((intercept_column, X))
    return np.dot(augmented, theta)

# Predictor matrix: the transpose of all_samples
X = all_samples.T

# Hold out 20% of the samples for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, blood_sugar_levels, random_state=42, test_size=0.2
)

# Standardise the predictors: the scaler is fitted on the training split only
# and then applied unchanged to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the model by maximum likelihood on the scaled training data
theta, sigma_squared_estimated = fit_mle(X_train_scaled, y_train)

# Predict the held-out samples as a flat 1-D array
y_pred = np.ravel(predict(X_test_scaled, theta))

# Residual sum of squares on the test split
rss = np.sum(np.square(y_test - y_pred))

# R-squared: 1 - RSS/TSS, with TSS taken around the mean of the test targets
tss = np.sum(np.square(y_test - np.mean(y_test)))
r2 = 1 - rss / tss

print(f'Residual Sum of Squares: {rss}')
print(f'R-squared: {r2}')

# Scatter of true (x axis) against predicted (y axis) values
plt.scatter(y_test, y_pred)
plt.xlabel('True Blood Sugar Level (mg/dl)')
plt.ylabel('Predicted (mg/dl)')
plt.title('True vs Predicted Blood Sugar Levels')
plt.show()
