<a href="https://colab.research.google.com/github/Intertangler/ML4biotech/blob/main/logistic_regression_exercise_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## exercise - logistic regression
In this exercise we will look again at simulated data linking blood sugar levels, the pathological occurrence of diabetes, and gene expression data for high-risk individuals. In contrast to the previous exercise, we will attempt to construct a classifier using logistic regression that uses the
gene expression profiles of individuals to predict whether or not they receive
the pathological label.

In [None]:
"""
First, let's import some multidimensional data and have a look at it. We will be
using dataframes - basically like excel spreadsheets, with columns and rows.
Try printing out the dataframe to examine its contents and its header labels.
"""
import pandas as pd
url = "https://raw.githubusercontent.com/Intertangler/ML4biotech/main/gene_expression_data.csv"

# Load the raw csv into a pandas "dataframe" object (a bit like a spreadsheet)
df = pd.read_csv(url)

# Data preprocessing
# Feature matrix: every column except the final two.
# NOTE(review): this assumes 'Pathological' and 'Blood_Sugar' are the last two
# columns of the csv - confirm against the file.
expression = all_samples = df.iloc[:, :-2].to_numpy()
genes = expression.shape[1]  # number of feature (gene) columns

# Targets: the class label and the continuous blood sugar measurement
blood_sugar_levels = df['Blood_Sugar'].to_numpy()
pathological_labels = df['Pathological'].to_numpy()

In [None]:
# Evaluate the dataframe as the cell's final expression to inspect its contents and header labels
df

In [None]:
"""
Next, let's run a visualization of our data. First a matrix displaying genes vs
individuals in our dataset, with the brightness of each pixel indicating the
expression level. Then we will make a histogram showing the distribution of
blood sugar levels in our dataset. In addition, we will color each bar according
to the frequency of patients who develop diabetes later in life - the longitudinal
part of this data.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from scipy.stats import skewnorm

# Plot the heatmap: after the transpose, rows are the expression columns and
# columns are the rows of df; values are log-transformed.
plt.rcParams.update({'font.size': 40})
f = plt.figure()
f.set_figwidth(12)
f.set_figheight(10)
sns.heatmap(df.drop(['Pathological', 'Blood_Sugar'], axis=1).T.apply(np.log), cmap="Greys")
plt.show()

# Create predictors
X = all_samples.T

# Define the number of bins and get the bin edges
num_bins = 50
hist, bin_edges = np.histogram(blood_sugar_levels, bins=num_bins)

# Assign every individual to a 0-based bin index. Only the interior edges are
# passed to np.digitize so that the maximum value lands in the last bin (as it
# does in np.histogram) instead of in an extra overflow bin that would be ignored.
bin_labels = np.digitize(blood_sugar_levels, bins=bin_edges[1:-1])

# Proportion of pathological individuals in each bin; empty bins are left as
# NaN rather than triggering a mean-of-empty-slice warning.
bin_counts = np.bincount(bin_labels, minlength=num_bins)
pathological_counts = np.bincount(bin_labels, weights=pathological_labels, minlength=num_bins)
proportions = np.divide(
    pathological_counts,
    bin_counts,
    out=np.full(num_bins, np.nan),
    where=bin_counts > 0,
)

# Get a colormap instance and map the proportions to colors.
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in matplotlib 3.9.
cmap = plt.get_cmap('coolwarm')
bin_colors = cmap(proportions)

# Plotting histogram with color indicating the proportion of pathogenic individuals.
# align='edge' is required because the x positions are the LEFT bin edges;
# the default (center) alignment would shift every bar half a bin to the left.
plt.figure()
plt.bar(bin_edges[:-1], hist, width=np.diff(bin_edges), align='edge', color=bin_colors, edgecolor='white')
plt.xlabel('Blood Sugar Levels (mg/dl)')
plt.ylabel('Frequency')
plt.grid(axis='y')
# The mappable is not attached to any Axes, so the target Axes must be given
# explicitly; the 0-1 norm matches the range of the plotted proportions.
plt.colorbar(
    plt.cm.ScalarMappable(norm=plt.Normalize(vmin=0, vmax=1), cmap=cmap),
    ax=plt.gca(),
    label='Diabetes proportion',
)
plt.show()

## exercise - correlation analysis on the linear regression results
Last time we performed linear regression on the data to build a model that can
predict the blood sugar levels on the basis of the gene expression profiles of
each individual. When we plot the results, we see a visual trend. How does the model perform when we train it on genes with greater or lesser correlation with blood sugar?

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np

def calculate_pearson_correlation(x, y):
    """
    Calculate the Pearson correlation coefficient between two arrays.

    Parameters
    ----------
    x, y : array-like
        Paired observations; both must have the same shape.

    Returns
    -------
    float
        The correlation coefficient in [-1, 1], or NaN when it is undefined
        (empty input, or either variable is constant).

    Raises
    ------
    ValueError
        If x and y do not have the same shape.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.shape != y.shape:
        raise ValueError(
            f"x and y must have the same shape, got {x.shape} and {y.shape}"
        )
    if x.size == 0:
        return np.nan

    # Deviations of each observation from the mean of its variable
    x_dev = x - np.mean(x)
    y_dev = y - np.mean(y)

    # Numerator: co-variation; denominator: product of the two spreads
    numerator = np.sum(x_dev * y_dev)
    denominator = np.sqrt(np.sum(x_dev ** 2) * np.sum(y_dev ** 2))

    # A constant variable has zero spread, so the coefficient is undefined
    if denominator == 0:
        return np.nan

    corr = numerator / denominator
    return corr

genes = expression.shape[1]

# Pearson correlation between blood sugar and the expression of each gene
# (column i of `expression` holds gene i for all individuals).
correlations = np.array([
    calculate_pearson_correlation(blood_sugar_levels, expression[:, i])
    for i in range(genes)
])
abs_correlations = np.abs(correlations)

# Column indices of the genes in each group. np.flatnonzero always returns an
# integer array, so the result remains a valid column index even when no gene
# meets a threshold (np.array([]) would be float64 and fail when used to index).
# Genes with 0.05 <= |corr| <= 0.1, or an undefined (NaN) correlation, fall in
# neither group.
high_corr_genes = np.flatnonzero(abs_correlations > 0.1)
low_corr_genes = np.flatnonzero(abs_correlations < 0.05)

print(f"Genes with high correlation with blood sugar level: {high_corr_genes}")
print(f"Genes with low correlation with blood sugar level: {low_corr_genes}")

def fit_normal_equations(X, y):
    """
    Fit an ordinary least-squares linear model with an intercept.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Predictor matrix (without the intercept column).
    y : ndarray of shape (n_samples,)
        Target values.

    Returns
    -------
    ndarray of shape (n_features + 1,)
        Parameter vector; element 0 is the intercept, the rest are the
        per-feature coefficients.
    """
    X_b = np.c_[np.ones((X.shape[0], 1)), X]  # Add a column of ones to X
    # Solve the least-squares problem directly instead of inverting X_b^T X_b:
    # the explicit inverse is numerically unstable and breaks down when
    # features are collinear. lstsq returns the minimum-norm solution there.
    theta, *_ = np.linalg.lstsq(X_b, y, rcond=None)
    return theta

def predict_normal(X, theta):
    """
    Evaluate a fitted linear model on new samples.

    A leading column of ones is prepended to X so that theta[0] acts as the
    intercept, then the design matrix is multiplied by theta.
    """
    intercept_column = np.ones(X.shape[0])
    design_matrix = np.column_stack((intercept_column, X))
    return design_matrix.dot(theta)


# Split the data into training (70%) and test (30%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    expression, blood_sugar_levels, test_size=0.3, random_state=428
)

# Model 1: all genes as predictors
theta_all = fit_normal_equations(X_train, y_train)
y_pred_all = predict_normal(X_test, theta_all)

# Model 2: high-correlation genes only
X_train_high, X_test_high = X_train[:, high_corr_genes], X_test[:, high_corr_genes]
theta_high = fit_normal_equations(X_train_high, y_train)
y_pred_high = predict_normal(X_test_high, theta_high)

# Model 3: low-correlation genes only
X_train_low, X_test_low = X_train[:, low_corr_genes], X_test[:, low_corr_genes]
theta_low = fit_normal_equations(X_train_low, y_train)
y_pred_low = predict_normal(X_test_low, theta_low)


# Side-by-side scatter plots of predicted vs. true blood sugar, one per model
plt.figure(figsize=(21, 7))

panels = [
    (y_pred_all, 'All '),   # all genes
    (y_pred_high, 'High'),  # high-correlation genes
    (y_pred_low, 'Low'),    # low-correlation genes
]
for position, (predicted, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.scatter(y_test, predicted)
    plt.title(panel_title)
    plt.xlabel('Ground Truth')
    plt.ylabel('Predicted')

plt.tight_layout()
plt.show()


## exercise - logistic regression


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, accuracy_score, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import numpy as np

import numpy as np

def sigmoid(a):
    """
    Logistic function 1 / (1 + exp(-a)), evaluated element-wise.

    Computed as exp(-log(1 + exp(-a))) via np.logaddexp so that large-magnitude
    negative inputs do not overflow inside exp(); the result is the same value
    as the textbook formula up to floating-point rounding.
    """
    a = np.asarray(a, dtype=float)
    # logaddexp(0, -a) == log(exp(0) + exp(-a)) == log(1 + exp(-a)), evaluated stably
    return np.exp(-np.logaddexp(0.0, -a))

def compute_gradient(X, y, theta):
    """
    Gradient of the mean negative log-likelihood with respect to theta.

    X is the design matrix, y the labels and theta the current parameters;
    the result is X^T (sigmoid(X theta) - y) scaled by 1 / len(y).
    """
    predicted = sigmoid(np.dot(X, theta))
    residual = predicted - y
    n_samples = len(y)
    return (1 / n_samples) * np.dot(X.T, residual)

def compute_nll(y, h):
    """
    Mean negative log-likelihood (binary cross-entropy) of labels y under
    predicted probabilities h.

    Probabilities are clipped to [eps, 1 - eps] before taking logs so that
    saturated predictions (exactly 0 or 1) give a large finite loss instead of
    inf or NaN from log(0).
    """
    eps = np.finfo(float).eps
    h = np.clip(h, eps, 1 - eps)
    return -np.mean(y * np.log(h) + (1 - y) * np.log(1 - h))

def fit_logistic_regression_with_nll(X, y, lr=0.01, epochs=100000):
    """
    Fit logistic regression by batch gradient descent on the mean NLL.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Predictor matrix (without the bias column).
    y : ndarray of shape (n_samples,)
        Binary labels.
    lr : float
        Learning rate (step size) of each gradient update.
    epochs : int
        Number of gradient steps.

    Returns
    -------
    ndarray of shape (n_features + 1,)
        Fitted parameters; element 0 is the bias term.

    Notes
    -----
    Reseeds NumPy's global random generator (seed 428) on every call and
    prints the NLL every 1000 epochs.
    """
    X = np.c_[np.ones((X.shape[0], 1)), X]  # Add a column of ones for the bias term
    np.random.seed(428)  # Set the seed for reproducibility
    theta = np.random.rand(X.shape[1]) * 0.1  # Initialize with small random numbers

    for epoch in range(epochs):
        # Update theta with a step against the gradient of the loss
        gradient = compute_gradient(X, y, theta)
        theta -= lr * gradient

        if epoch % 1000 == 0:  # Print NLL every 1000 epochs for monitoring
            # Only evaluate the loss on the epochs where it is reported
            nll = compute_nll(y, sigmoid(np.dot(X, theta)))
            print(f"Epoch {epoch}, NLL: {nll}")

    return theta


def predict_logistic_regression(X, theta):
    """
    Predict class probabilities and hard labels with a fitted logistic model.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Predictor matrix (without the bias column).
    theta : ndarray of shape (n_features + 1,)
        Parameters with the bias term first.

    Returns
    -------
    (prob, labels)
        prob holds the predicted probability of the positive class; labels is
        1 where prob > 0.5 and 0 otherwise.
    """
    X = np.c_[np.ones((X.shape[0], 1)), X]  # Prepend the bias column, as done when fitting
    # Logit = design matrix times parameters; the sigmoid maps it to a probability
    prob = sigmoid(np.dot(X, theta))
    return prob, (prob > 0.5).astype(int)


# Feature subsets to compare, as (feature matrix, plot title) pairs
data_sets = [
    (expression, 'All Genes'),
    (expression[:, high_corr_genes], 'High-Correlation Genes'),
    (expression[:, low_corr_genes], 'Low-Correlation Genes'),
]

# Train, evaluate and plot one classifier per feature subset
for data, title in data_sets:
    X_train, X_test, y_train, y_test = train_test_split(
        data, pathological_labels, test_size=0.3, random_state=1234
    )

    # Fit on the training split, then score the held-out split
    theta = fit_logistic_regression_with_nll(X_train, y_train, lr=.0001, epochs=50000)
    y_pred_proba, y_pred = predict_logistic_regression(X_test, theta)
    intercept, coefficients = theta[0], theta[1:]

    print("Predicted probabilities:", y_pred_proba)
    print("Predicted labels:", y_pred)
    print("Model Coefficients:", coefficients)
    print("Model Intercept:", intercept)

    # Metrics based on the hard 0/1 predictions
    accuracy = accuracy_score(y_test, y_pred)
    conf_mat = confusion_matrix(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # ROC curve and its area, based on the predicted probabilities
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    # Logit of each test sample: linear combination of weights and features plus intercept
    logit_values = np.dot(X_test, coefficients) + intercept

    # Test samples on the probability-vs-logit plane, coloured by true label,
    # over a reference sigmoid curve
    sigmoid_grid = np.arange(-10, 10, 0.1)
    plt.figure(figsize=(24, 6))
    plt.scatter(logit_values, y_pred_proba, c=y_test, cmap="bwr", alpha=0.7, s=5)
    plt.plot(sigmoid_grid, 1 / (1 + np.exp(-sigmoid_grid)), c="grey", lw=0.3)
    plt.xlabel("Logit")
    plt.ylabel("Probability")
    plt.title(title)
    plt.show()

    # ROC curve with the chance diagonal for reference
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'area under curve = {roc_auc:.2f}')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for {title}')
    plt.legend(loc='lower right')
    plt.show()

    print(f'Accuracy for {title}: {accuracy}')
    print(f'Confusion Matrix for {title}:\n{conf_mat}')
    print(f'F1 Score for {title}: {f1}\n')


### extra - logistic regression with blood sugar for comparison

In [None]:
# Single-feature baseline: blood sugar alone as the predictor.
# Double brackets keep the selection 2D (one column).
blood_sugar_data = df[['Blood_Sugar']].values
pathological_labels = df['Pathological'].values

blood_sugar_data_set = [(blood_sugar_data, 'Blood Sugar Levels')]

for data, title in blood_sugar_data_set:
    X_train, X_test, y_train, y_test = train_test_split(
        data, pathological_labels, test_size=0.3, random_state=1234
    )

    # Fit on the training split, then score the held-out split
    theta = fit_logistic_regression_with_nll(X_train, y_train, lr=0.01, epochs=50000)
    y_pred_proba, y_pred = predict_logistic_regression(X_test, theta)
    intercept, coefficients = theta[0], theta[1:]

    # Metrics based on the hard 0/1 predictions
    accuracy = accuracy_score(y_test, y_pred)
    conf_mat = confusion_matrix(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # ROC curve and its area, based on the predicted probabilities
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    # Logit of each test sample: weight times feature plus intercept
    logit_values = np.dot(X_test, coefficients) + intercept

    # Test samples on the probability-vs-logit plane over a reference sigmoid curve
    sigmoid_grid = np.arange(-10, 10, 0.1)
    plt.figure(figsize=(24, 6))
    plt.scatter(logit_values, y_pred_proba, c=y_test, cmap="bwr", alpha=0.7, s=5)
    plt.plot(sigmoid_grid, 1 / (1 + np.exp(-sigmoid_grid)), c="grey", lw=0.3)
    plt.xlabel("Logit")
    plt.ylabel("Probability")
    plt.title(title)
    plt.show()

    # ROC curve with the chance diagonal for reference
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for {title}')
    plt.legend(loc='lower right')
    plt.show()

    print(f'Accuracy for {title}: {accuracy}')
    print(f'Confusion Matrix for {title}:\n{conf_mat}')
    print(f'F1 Score for {title}: {f1}\n')
