## Train a neural network on heart failure data

###  Recap data preparation from last time

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Reload the heart failure data used in the last class
current_dir = os.getcwd()

heart_data = pd.read_csv("../data/heart_failure/heart_train.csv")
label = heart_data["HeartDisease"]
features = heart_data.drop("HeartDisease", axis=1)

# Seeded generator, so that we get the same split as in last class
prng = np.random.RandomState(20240311)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=prng,
)

### Estimate simple logistic regression as a benchmark

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Preprocessing: dummy-encode every object-dtype column, pass the rest through
categorical_vars = heart_data.select_dtypes(include="object").columns.to_list()
one_hot_encoder = OneHotEncoder(sparse_output=False, drop="first")

column_transformer = ColumnTransformer(
    transformers=[("create_dummies", one_hot_encoder, categorical_vars)],
    remainder="passthrough",
)

# Pipeline: preprocessing -> min-max scaling -> unpenalized logit
pipe_logit = Pipeline(
    steps=[
        ("preprocess", column_transformer),
        ("scale", MinMaxScaler()),
        ("logit", LogisticRegression(penalty=None)),
    ]
)
pipe_logit.fit(X_train, y_train)

### Evaluate the performance by AUC

In [None]:
from sklearn.metrics import roc_auc_score

# Column 1 of predict_proba is the probability of the second class
predicted_probs = pipe_logit.predict_proba(X_test)[:, 1]
auc_score_test = roc_auc_score(y_true=y_test, y_score=predicted_probs)
print(f"AUC on the test set for simple logit is {round(auc_score_test, 4)}")

### Estimate a neural network on the heart data

In [None]:
from sklearn.neural_network import MLPClassifier
# Same preprocessing and scaling as the logit benchmark, with an MLP on top
pipe_mlp = Pipeline(
    steps=[
        ("preprocess", column_transformer),
        ("scale", MinMaxScaler()),
        ("MLP", MLPClassifier(random_state=prng)),
    ]
)
pipe_mlp.fit(X_train, y_train)

In [None]:
# Hyperparameters of the MLP step inside the fitted pipeline
pipe_mlp.named_steps["MLP"].get_params()

In [None]:
# Column 1 of predict_proba is the probability of the second class
predicted_probs_mlp = pipe_mlp.predict_proba(X_test)[:, 1]
auc_score_test_mlp = roc_auc_score(y_true=y_test, y_score=predicted_probs_mlp)
print(f"AUC on the test set for basic MLP is {round(auc_score_test_mlp, 4)}")

## Image recognition with neural networks

### Data

In [None]:
from keras.datasets import mnist

# Load the MNIST dataset, which comes pre-split into train and test parts
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Report the dimensions of every array
print(
    f"X_train: {X_train.shape}\n"
    f"y_train: {y_train.shape}\n"
    f"X_test:  {X_test.shape}\n"
    f"y_test:  {y_test.shape}"
)

In [None]:
# Note that we work here with NumPy arrays instead of pandas DataFrames;
# type() shows what kind of object the loaded data is
type(X_train)

In [None]:
# Show the first 25 training digits in a 5x5 grid, each titled with its label
import matplotlib.pyplot as plt

fig, axs = plt.subplots(nrows=5, ncols=5, figsize=(10, 10))
for ax, digit, digit_label in zip(axs.flat, X_train, y_train):
    ax.imshow(digit, cmap="binary")
    ax.axis("off")
    ax.set_title(f"Label: {digit_label}")
plt.tight_layout()
plt.show()

In [None]:
# Raw pixel values of the first training image
X_train[0]

### Train/validation/test sets

In [None]:
# Intentionally keep only 20% of the images for training (the remaining 80%
# become the validation set) to decrease the computational burden
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.8,
    random_state=prng,
)

print(
    f"X_train: {X_train.shape}\n"
    f"y_train: {y_train.shape}\n"
    f"X_val:  {X_val.shape}\n"
    f"y_val:  {y_val.shape}\n"
    f"X_test:  {X_test.shape}\n"
    f"y_test:  {y_test.shape}"
)

### Benchmarks

In [None]:
# Benchmark #1 (silly):

from sklearn.metrics import accuracy_score
from statistics import mode

# Always predict the most frequent training label
most_frequent = mode(y_train)
print(f"Most frequent element is: {most_frequent}")
accuracy_most_frequent = accuracy_score(
    y_val,
    np.full(len(y_val), most_frequent),
)
print(f"Accuracy for our no-brainer model: {round(accuracy_most_frequent, 4)}")

We can train a state-of-the-art machine learning model almost as easily. It can be used as our second benchmark. Note that our training data is multidimensional: instead of `k` features per observation, each image has `i x j` pixel values. In order to apply a standard ML algorithm, we need to flatten our data first.

Since we need to apply the flattening to both the train and the test data, a clever way to do this is to build this step into the pipeline. Since we are applying the transformation to the whole feature matrix rather than specific columns, we can use the `FunctionTransformer`, which can apply any user-defined function to the feature matrix.

In [None]:
# Benchmark #2 (RF):
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer

def flatten_data(X):
    """Flatten every sample of ``X`` into a single feature vector.

    Parameters
    ----------
    X : array-like
        Samples stacked along the first axis (e.g. a stack of images).

    Returns
    -------
    numpy.ndarray
        2-D array of shape ``(n_samples, n_features)`` in which all trailing
        axes of each sample are collapsed into one.
    """
    X = np.asarray(X)
    # Compute the feature count explicitly instead of reshaping with -1:
    # NumPy cannot infer a -1 dimension when the batch is empty.
    n_features = int(np.prod(X.shape[1:]))
    return X.reshape(X.shape[0], n_features)

# TODO:
# 1. create pipeline with two steps: flatten & model
# 2. fit the pipeline
# 3. get predicted classes for the validation set
# 4. calculate accuracy

In [None]:
# Confusion matrix of the validation labels against predictions_rf
from sklearn.metrics import confusion_matrix
import seaborn as sns

cm = confusion_matrix(y_true=y_val, y_pred=predictions_rf)

# Visualize it as an annotated heatmap with seaborn
plt.figure(figsize=(10, 7))
sns.heatmap(data=cm, annot=True, fmt="d")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion matrix")
plt.show()

### Sklearn MLP

In [None]:
def scale_features(X, max_value=255):
    """Divide the features by ``max_value``.

    With the default of 255, values in the 0-255 range (8-bit pixel
    intensities) end up in [0, 1].

    Parameters
    ----------
    X : numpy.ndarray or scalar
        Values to rescale.
    max_value : int or float, default 255
        Divisor; set it to the largest possible input value when the data
        is not 8-bit.

    Returns
    -------
    Same kind of object as ``X / max_value``.
    """
    return X / max_value

# TODO:
# 1. create pipeline with three steps: flatten & scale & model
# 2. fit the pipeline
# 3. get predicted classes for the validation set
# 4. calculate accuracy

### Keras: simple model

While `scikit-learn` provides a wide range of machine learning algorithms, `keras` is specifically designed for building and training neural networks and deep learning models, making it more suitable for tasks involving complex patterns and large datasets. It offers several advanced features that `sklearn` does not: training networks with complex architectures (e.g. convolutional neural networks), applying pre-processing techniques common in deep learning (e.g. scaling and flattening), and transferring learning from pre-trained networks.

Keras is a high-level neural network API that provides a simple and intuitive interface for building and training deep learning models. Keras can run on multiple backends, with TensorFlow being the default. TensorFlow is a standalone, low-level deep learning library developed by Google.

In [None]:
# Print the installed Keras version
from keras import __version__ as keras_version
print(keras_version)

In [None]:
# Convert target variables to categorical (one indicator column per class)
# Imported here because this cell runs before the model-building cell,
# which is the only other place that imports to_categorical
from keras.utils import to_categorical

num_classes = 10
y_sets = [y_train, y_test, y_val]
y_train, y_test, y_val = [to_categorical(y, num_classes=num_classes) for y in y_sets]

In [None]:
from keras.models import Sequential
from keras.layers import Input, Flatten, Rescaling, Dense
from keras.utils import to_categorical

# Build the model: flatten each image, rescale the pixels by 1/255, then one
# hidden layer with 100 ReLU units and a softmax output layer
model = Sequential([
    Input(shape=X_train.shape[1:]),
    Flatten(),
    Rescaling(1./255),
    Dense(100, activation='relu'),
    Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output
model.summary()
# Number of parameters (weights + biases) per Dense layer:
# hidden layer: 784*100+100
# output layer: 100*10+10

The core of the backend is indeed constructed around **tensors**. Tensors are akin to NumPy arrays, as they hold numerical values of different dimensions. What sets them apart is their specialized role in deep learning: they are equipped with built-in gradient computation, seamlessly integrate within a computational graph, and are capable of leveraging hardware accelerators like TPUs and GPUs.

In [None]:
# Inspect the output tensor of the layer stored at index 3 of model.layers
model.layers[3].output

In [None]:
# Fit the model
# The bare `keras` name must be bound for the call below; the other cells
# only use `from keras... import ...`, which does not bind it
import keras

keras.utils.set_random_seed(20240318)  # for reproducibility
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=512)

In [None]:
# Evaluate on the validation set; the model was compiled with
# metrics=['accuracy'], so index 1 of the result is the accuracy
scores = model.evaluate(x=X_val, y=y_val)
print(f"Accuracy for keras MLP: {round(scores[1], 4)}")

In [None]:
def plot_history(fit_history):
    """Plot training and validation accuracy against the epoch number.

    `fit_history` is a dict holding an 'accuracy' and a 'val_accuracy'
    sequence, such as the `.history` attribute of what `model.fit` returns.
    """
    curves = (
        ('accuracy', 'Training Accuracy'),
        ('val_accuracy', 'Validation Accuracy'),
    )
    for metric_key, curve_label in curves:
        plt.plot(fit_history[metric_key], label=curve_label)
    plt.title('Training and Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()


plot_history(history.history)

It seems that the maximum is not yet found. Let's train the network a little bit longer. Note that unless we recreate the model, the process starts from the point where it previously ended.

In [None]:
# TODO: run for at least 50 epochs, store the result in history_longer_train


In [None]:
# Stitch the two training runs together, metric by metric
total_history = {
    metric: history.history[metric] + history_longer_train.history[metric]
    for metric in ("accuracy", "val_accuracy")
}

In [None]:
# Accuracy curves over the initial and the longer training run combined
plot_history(total_history)

In [None]:
# Validation-set evaluation of the model in its current state
model.evaluate(X_val, y_val)

### Regularization in neural networks

#### Early stopping

In [None]:
from keras.callbacks import EarlyStopping
from keras.models import clone_model

# To make sure the process starts over, we need to create a new model
# instance and compile it
cloned_model = clone_model(model)
cloned_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit with early stopping that monitors the validation accuracy
history_with_early_stopping = cloned_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=50,
    callbacks=[EarlyStopping(monitor='val_accuracy')],
)

In [None]:
# Accuracy curves of the run with early stopping
plot_history(history_with_early_stopping.history)

#### Dropout

In [None]:
from keras.layers import Dropout

# Build the model: same architecture as the simple model, with a Dropout
# layer (rate 0.5) between the hidden and the output layer
regularized_model = Sequential([
    Input(shape=X_train.shape[1:]),
    Flatten(),
    Rescaling(1./255),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile the model
regularized_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output
regularized_model.summary()

In [None]:
# patience=2: tolerate short stalls in validation accuracy before stopping
history_regularized = regularized_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=50,
    callbacks=[EarlyStopping(monitor='val_accuracy', patience=2)],
)

In [None]:
# Accuracy curves of the model with dropout
plot_history(history_regularized.history)

### Keras: Deep network

In [None]:
# Build the model: two hidden layers of 256 ReLU units, each followed by
# dropout (rate 0.5), and a softmax output layer
deep_model = Sequential([
    Input(shape=X_train.shape[1:]),
    Flatten(),
    Rescaling(1./255),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile the deep_model
deep_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output
deep_model.summary()
# Number of parameters (weights + biases) per Dense layer:
# first hidden layer:  784*256+256
# second hidden layer: 256*256+256
# output layer:        256*10+10

In [None]:
# Train the deep network with the same early-stopping setup (patience=2)
history_deep = deep_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=50,
    callbacks=[EarlyStopping(monitor='val_accuracy', patience=2)],
)

In [None]:
# Accuracy curves of the deep network
plot_history(history_deep.history)

### Convolution with keras

Convolutional layers are designed to operate on images or time-series data with multiple sensors, so they expect input data to have a channel dimension. In our case, since we are dealing with grayscale images, we will have only one channel (unlike the RGB channels in color images). To ensure our data has this channel dimension, we need to reshape our data first. Let's collect all the preprocessing steps, and create new preprocessed sets.

In [None]:
from keras.layers import Reshape

# Shared preprocessing: explicitly append the 4th (channel) dimension,
# then rescale the pixel values by 1/255
preprocess = Sequential([
    Reshape(target_shape=(*X_train.shape[1:], 1)),
    Rescaling(1./255),
])

X_sets = [X_train, X_test, X_val]
X_train_4D, X_test_4D, X_val_4D = map(preprocess, X_sets)

In [None]:
# Shape of the original training images (no channel dimension)
X_train.shape

In [None]:
# Shape after preprocessing: a trailing channel dimension of size 1 was added
X_train_4D.shape

In [None]:
from keras.layers import Conv2D, MaxPooling2D

# Build the model: two convolution + max-pooling stages, then flatten,
# dropout (rate 0.5) and a softmax output layer
model_cnn = Sequential([
    Input(shape=X_train_4D.shape[1:]),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile the model
model_cnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output
model_cnn.summary()
# Number of parameters per layer:
# first Conv2D:  32*(3*3+1)
# second Conv2D: 64*(32*3*3+1) -- bias term is not channel-specific as it does not depend on the input data
# Dense:         (64*5*5)*10+10

In [None]:
# Fit the CNN on the 4D (channel-augmented, rescaled) inputs
history_cnn = model_cnn.fit(
    X_train_4D,
    y_train,
    validation_data=(X_val_4D, y_val),
    batch_size=512,
    epochs=50,
    callbacks=[EarlyStopping(monitor='val_accuracy', patience=2)],
)

In [None]:
# Accuracy curves of the convolutional network
plot_history(history_cnn.history)

### Other useful tricks: data augmentation

In [None]:
from keras.layers import RandomRotation, RandomZoom

# TODO: Create a new Sequential model called data augmentation with two layers: 
    # RandomRotation(0.1, fill_mode="nearest") & RandomZoom(0.2, fill_mode="nearest")

# Look at the transformations: original digits in the top row, their
# augmented versions in the bottom row
number_of_digits_to_show = 5
fig, axs = plt.subplots(2, number_of_digits_to_show, figsize=(2*number_of_digits_to_show, 4))
# Use a named loop variable: `_` is conventionally a throwaway name, and in
# IPython/Jupyter it also refers to the last cell output
for i in range(number_of_digits_to_show):
    axs[0, i].imshow(X_train_4D[i], cmap="binary")
    axs[0, i].axis("off")
    axs[1, i].imshow(data_augmentation(X_train_4D[i]), cmap="binary")
    axs[1, i].axis("off")
plt.tight_layout()
plt.show()

In [None]:
# Chain the augmentation layers in front of the CNN.
# NOTE: model_cnn is the instance that was already fitted above, so it is
# reused here with its current weights rather than re-initialized.
model_cnn_da = Sequential([
    data_augmentation,
    model_cnn
])

# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output
model_cnn_da.summary()

In [None]:
# Compile the model
model_cnn_da.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit it; patience is increased to allow for slower learning
history_cnn_da = model_cnn_da.fit(
    X_train_4D,
    y_train,
    validation_data=(X_val_4D, y_val),
    batch_size=512,
    epochs=50,
    callbacks=[EarlyStopping(monitor='val_accuracy', patience=5)],
)

### Evaluate our final model on the test set

In [None]:
# TODO

## Transfer learning

In [None]:
from keras.utils import load_img, img_to_array
from keras.applications.resnet50 import ResNet50

# Load ResNet50 with the weights selected by weights='imagenet'
pretrained_model = ResNet50(weights='imagenet')
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output
pretrained_model.summary()

In [None]:
img_path = #TODO
# Load the image resized to 224x224 (presumably the input size the
# pretrained model expects -- confirm against pretrained_model.summary())
img = load_img(img_path, target_size=(224, 224))
img

In [None]:
# Convert the loaded image into an array and look at its shape
x = img_to_array(img)
x.shape

In [None]:
# Add a leading axis so the array is a batch holding one 3-dimensional sample
x = x[np.newaxis, ...]
x.shape

In [None]:
from keras.applications.resnet50 import preprocess_input

# The pretrained weights expect inputs that went through ResNet50's own
# preprocess_input; predicting on raw pixel values degrades the
# predictions. Pass a copy so that `x` keeps the raw pixels.
preds = pretrained_model.predict(preprocess_input(x.copy()))
preds.shape

In [None]:
from keras.applications.resnet50 import decode_predictions
# Top-3 predictions for our single image: keep the second field and the
# rounded score of every decoded entry
decoded_preds = decode_predictions(preds, top=3)
[(description, round(score, 4)) for _class_id, description, score in decoded_preds[0]]

We can fine-tune these pre-trained models for our purposes by modifying the last few layers and learning only the new parameters on new data (freezing the weights of the original network). See e.g. [this tutorial](https://pyimagesearch.com/2020/04/27/fine-tuning-resnet-with-keras-tensorflow-and-deep-learning/) for more details.