In [3]:
import numpy as np
import tensorflow as tf
# sklearn is a python library used for data mining/ML and analysis. We will use it minimally.
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# used only for visualization:
import pandas as pd # a python library used for data analysis
import matplotlib as mpl # a python library used for creating visuals
import matplotlib.pyplot as plt

# 1. Load the data

In [4]:
# Fetch the diabetes toy dataset that ships with sklearn
data = load_diabetes()

# Separate the input feature matrix from the target vector
X = data.data
y = data.target

# 1.1. Understanding the dataset
- How many individual data points are there?
- What are the variables?
- Learn more here: https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset
    - When in doubt, remember RTFM: https://www.wikiwand.com/en/RTFM

In [None]:
# Basic facts: sample count, and how many targets lie above the median
# (the same threshold that is used later to binarize the target)
n_points = len(X)
n_above_median = np.count_nonzero(y > np.median(y))
print(f'Number of data points: {n_points}')
print(f'Number of positive diabetes results: {n_above_median}')

# Names of the input variables
feature_names = data.feature_names
print(f'Features: {feature_names}')

# Gather features and target in one DataFrame so pandas can compute the correlations
df = pd.DataFrame(data=np.column_stack([X, y]), columns=[*feature_names, 'target'])

# Pairwise correlation between every column (features and target)
correlation_matrix = df.corr()

# Draw the correlation matrix as a heatmap
plt.figure(figsize=(8, 6))

# Diverging colour scale centred on 0 and spanning the full correlation range [-1, 1]
norm = mpl.colors.TwoSlopeNorm(vmin=-1, vcenter=0, vmax=1)

plt.imshow(correlation_matrix, cmap='seismic', interpolation='nearest', norm=norm)
plt.colorbar()

plt.title('Correlation Matrix')

# One tick per variable, labelled with the column name
tick_positions = range(len(correlation_matrix))
tick_labels = correlation_matrix.columns
plt.xticks(tick_positions, tick_labels, rotation=45)
plt.yticks(tick_positions, tick_labels)
plt.show()

# 2. Preprocess the data
Clean and modify data so that it's ready to be fed into the neural network for training.

Here we will split the data into training and testing sets, then normalize the data.
- To "normalize" the data is to rescale the input features so that they are all on the same scale. In this case, we make every feature have mean = 0 and std dev = 1.
    - This "stabilizes" the NN, allowing for easier convergence and therefore better performance.

In [5]:
# Convert the continuous target to binary labels: 1.0 if above the median, otherwise 0.0
y_binary = (y > np.median(y)).astype(np.float32)

# Split the data into training and testing sets (20% held out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Normalize the features.
# The Normalization layer has to learn the per-feature mean and variance with adapt()
# before it is applied; a layer that was never adapted has no knowledge of the data's
# statistics and therefore cannot standardize it to mean 0 / std dev 1.
# Adapt on the training set only, so no information from the test set leaks into preprocessing.
scaler = tf.keras.layers.experimental.preprocessing.Normalization()
scaler.adapt(X_train)
X_train = scaler(X_train)
X_test = scaler(X_test)

# 2.1. Perform sanity checks
- How much data has been split?
- Make sure output feature is correctly 0 or 1
- Whatever else you're curious about!

In [None]:
# How many samples ended up in each split?
n_train, n_test = len(X_train), len(X_test)
print(f'Training samples: {n_train}')
print(f'Test samples: {n_test}')

# Fraction of ALL samples held out for testing. This is the number that is directly
# comparable to the requested test_size (test/train would give a larger value instead).
print(f'Actual test fraction: {n_test / (n_train + n_test):.3f}')

# The binarized labels should contain only 0 and 1
print(f'Unique training labels: {np.unique(y_train)}')

# 3. Build and compile the model
Here we define what the neural network structure is - i.e., the architecture.

Then we choose the optimizer and loss function that will be used during the training process.
- An **optimizer** is the algorithm that adjusts the parameters of a NN during training. Gradient descent is the basic optimization algo.
- A **loss function** is the way we measure how well the model's predictions match the known target/labels of the training set.
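  - "Binary cross entropy" measures the difference between the predicted NN output and the known binary labels. There are other losses for non-binary output (we will see this below!)
- An **activation function** is the non-linear function applied to a neuron's weighted sum of inputs (plus its bias) to produce that neuron's output. Below we use ReLU in the hidden layer and sigmoid in the output layer, so the final output lies between 0 and 1.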

In [None]:
# Assemble the network layer by layer
n_inputs = X_train.shape[1]  # one input per feature column

model = tf.keras.Sequential()
# Hidden layer: 16 ReLU neurons fed by the input features
model.add(tf.keras.layers.Dense(16, activation='relu', input_shape=(n_inputs,)))
# Output layer: a single sigmoid neuron for the binary label
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Choose the optimizer, the loss to minimize and the metric to report
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

**Question:** What does this model summary actually show us?

The NN has 3 layers:
- one input layer
- one hidden layer (with 16 neurons)
- one output layer (with 1 neuron).

The "params" values indicate the number of trainable parameters in each layer (i.e. the weights and biases that the model learns during training).

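Hidden layer: 10 inputs × 16 neurons = 160 weights, plus 16 biases (one per neuron) = 176 parameters. Output layer: 16 weights + 1 bias = 17 parameters.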

In [None]:
# Let's actually look at what the neural network is (look at the weights and biases)

# Print the weights and biases of each layer
for layer in model.layers:
    params = layer.get_weights()
    # Every Keras layer exposes a `weights` attribute, so a hasattr() check filters nothing.
    # Instead, skip any layer that does not hold exactly one weight matrix and one bias
    # vector (e.g. a Dropout layer has no parameters at all), which would otherwise make
    # the two-value unpacking below fail.
    if len(params) != 2:
        continue
    weights, biases = params
    print("Layer:", layer.name)
    print("Weights:")
    print(weights)
    print("Biases:")
    print(biases)

# 4. Train the model
Here we repeatedly send data through the NN, calculate how wrong the results are (the loss), then update the NN parameters via backpropagation to decrease the loss next iteration.
- **Iteration** is a single update of the model's parameters using 1 batch of training data
- **Batch size** is the number of data samples that are processed together for 1 iteration.
- **Epoch** is one complete pass through the entire training dataset. The number of iterations per epoch is dependent on the batch size.


e.g. If a training set has 1000 samples, and we use a batch size of 10, then there will be 100 iterations per epoch.
- The higher the batch size, the faster the training generally (but it uses more memory, and very large batches can eventually decrease model performance). There is a sweet spot you have to find through experiments!

In [None]:
# Fit the network; validation_split sets aside a fraction of the training data
# that is only used to monitor the validation loss/accuracy after each epoch
history = model.fit(
    X_train,
    y_train,
    epochs=300,
    batch_size=256,
    validation_split=0.1,
)

Now that the model has been trained, let's look at the weights and biases again.

In [None]:
# Print the weights and biases of each layer
for layer in model.layers:
    params = layer.get_weights()
    # Every Keras layer exposes a `weights` attribute, so a hasattr() check filters nothing.
    # Instead, skip any layer that does not hold exactly one weight matrix and one bias
    # vector (e.g. a Dropout layer has no parameters at all), which would otherwise make
    # the two-value unpacking below fail.
    if len(params) != 2:
        continue
    weights, biases = params
    print("Layer:", layer.name)
    print("Weights:")
    print(weights)
    print("Biases:")
    print(biases)

# 5. Evaluate the trained NN
Here we check how well our model performs on the test set (the 20% we set aside and haven't touched until now).

We will also plot the loss and accuracy for both the training and validation set throughout the training process.


In [None]:
# Score the trained model on the held-out test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

# Training vs. validation curves: loss on the left panel, accuracy on the right
fig, ax = plt.subplots(1, 2, figsize=(8, 4))
for panel, metric, ylabel in zip(ax, ('loss', 'accuracy'), ('loss', 'acc')):
    panel.plot(history.history[metric], label='train')
    panel.plot(history.history[f'val_{metric}'], label='val')
    panel.legend()
    panel.set_ylabel(ylabel)
plt.tight_layout()

Loss vs Accuracy:

Loss measures how well the predictions match the known target/label (many ways of measuring -> e.g. binary cross entropy)

Accuracy is the actual % of correct predictions.

# 6. Experimentation
We can now conduct a couple experiments/modifications to see if it's possible to improve our model accuracy.

# 6.1. Experiment \#1: Learning Rate
Learning rate, α, is a hyperparameter that determines the step size taken by the optimization algorithm while updating the model's parameters during training.
- A larger learning rate means bigger updates to the parameters after each iteration (pro: faster training, con: risks overshooting and instability)


Note: the Adam optimizer is an adaptive learning rate optimization algorithm that dynamically adjusts the learning rate during training based on the historical gradients of the parameters.

In [None]:
# Candidate learning rates, one order of magnitude apart
learning_rates = [0.0001, 0.001, 0.01, 0.1, 1]

# Maps each learning rate to its per-epoch training and validation accuracy curves
accuracy_history = {}

# Train one freshly initialized model per learning rate
for lr in learning_rates:
    print(f"Training model with learning rate: {lr}")

    # Same architecture as before, rebuilt so no weights carry over between runs
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(16, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    # Adam configured with the learning rate under test
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # verbose=0 silences the per-epoch log so only the line printed above is shown
    history = model.fit(
        X_train,
        y_train,
        epochs=300,
        batch_size=256,
        validation_split=0.1,
        verbose=0,
    )

    # Keep both accuracy curves for this learning rate
    curves = history.history
    accuracy_history[lr] = {
        'train_accuracy': curves['accuracy'],
        'val_accuracy': curves['val_accuracy'],
    }

Training model with learning rate: 0.0001
Training model with learning rate: 0.001
Training model with learning rate: 0.01
Training model with learning rate: 0.1
Training model with learning rate: 1


In [None]:
# One training-accuracy curve per learning rate, all on a shared set of axes
plt.figure(figsize=(10, 6))
for lr, accuracies in accuracy_history.items():
    train_curve = accuracies['train_accuracy']
    plt.plot(train_curve, label=f'Train LR={lr}')

# Annotate the figure
plt.title('Accuracy for Different Learning Rates')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.grid(True)
plt.legend()
plt.show()

# 6.2. Experiment \#2: Model Architecture

Looking back at the correlation matrix, we can see that the "sex" input feature has the weakest relationship with the target. Therefore, let's try dropping that feature from the input data.

In [None]:
# Display the current training inputs
print(X_train)

In [None]:
# Remove the 'sex' feature from the input data. It sits at column index 1,
# i.e. it is the 2nd column, not the 1st.
sex_column = data.feature_names.index('sex')
n_features_full = len(data.feature_names)

# Only delete while the full set of feature columns is still present, so that
# re-running this cell does not silently remove one more column every time.
# axis=1 means we delete along the column axis (a feature), not along rows (samples).
if X_train.shape[1] == n_features_full:
    X_train = np.delete(X_train, [sex_column], axis=1)
if X_test.shape[1] == n_features_full:
    X_test = np.delete(X_test, [sex_column], axis=1)

In [None]:
# Sanity check: how many feature columns does each sample have now?
print(X_train.shape[1])

In [None]:
# New, deeper architecture: two hidden layers (32 and 16 neurons) before the sigmoid output.
# To experiment with regularization, a tf.keras.layers.Dropout(0.3) layer (30% dropout rate)
# can be inserted between the second hidden layer and the output layer.
n_inputs = X_train.shape[1]  # one input per remaining feature column

layer_stack = [
    tf.keras.layers.Dense(32, activation='relu', input_shape=(n_inputs,)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(layer_stack)

# Same optimizer, loss and metric as the first model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
# Fit the new architecture with the same training settings as before
history = model.fit(
    X_train,
    y_train,
    epochs=300,
    batch_size=256,
    validation_split=0.1,
)

In [None]:
# Score the retrained model on the held-out test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

# Plot loss (left) and accuracy (right) for the train and validation sets
fig, ax = plt.subplots(1, 2, figsize=(8, 4))
panel_specs = [('loss', 'loss'), ('accuracy', 'acc')]
for idx, (metric, ylabel) in enumerate(panel_specs):
    ax[idx].plot(history.history[metric], label='train')
    ax[idx].plot(history.history['val_' + metric], label='val')
    ax[idx].legend()
    ax[idx].set_ylabel(ylabel)
plt.tight_layout()

# Exercise 2
Dataset: California Housing Dataset
- https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset

Input: age of house, avg income in area, location, etc.

Output: Cost of House (a continuous value -> regression)

In [13]:
import tensorflow as tf
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [14]:
# Fetch the California housing dataset and separate features from target
data = fetch_california_housing()
X = data.data
y = data.target

# Hold out 20% of the samples for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Let the Normalization layer learn its statistics from the training set only
normalizer = tf.keras.layers.experimental.preprocessing.Normalization()
normalizer.adapt(X_train)

# Apply the same learned normalization to both splits
X_train, X_test = normalizer(X_train), normalizer(X_test)

In [15]:
# Regression network: one hidden ReLU layer and a single linear output neuron
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1))  # no activation: the output is a continuous value

# Mean squared error is the quantity minimized during training
model.compile(loss='mean_squared_error', optimizer='adam')

Here we see the use of "mean squared error" as the loss function.

**Mean squared error** measures the difference between predicted output and known output, penalizing larger error/difference more.

In [None]:
# Fit the regression model; validation_split sets aside part of the training data for validation
model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.1,
)

In [None]:
# Predict on the test set; squeeze() removes the length-1 output axis so that
# the predictions line up element-wise with y_test
y_pred = model.predict(X_test).squeeze()

# Mean squared error between the known and the predicted values
squared_errors = tf.square(y_test - y_pred)
mse = tf.reduce_mean(squared_errors)
print("Mean Squared Error:", mse.numpy())

# Predicted vs. actual scatter plot
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
# Dashed y = x reference line: every point would lie on it if the model were perfect
plt.plot([0, 5], [0, 5], linestyle='--', color='k')
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.show()

Note that "accuracy" like we used before isn't actually a good way of analysing the model, since the output is continuous.