In [None]:
import numpy as np
import math # Added for math.floor
import matplotlib.pyplot as plt # Added for plotting
import pandas as pd # Added for DataFrame
# Assuming dummy files are in the same directory or path is configured
from Loss import compute_BCE_loss # For compute_BCE_loss
from Predict import predict # For predict function
# from Dense import Dense # Not strictly needed for this cell if Model uses it internally
# from Activation import Activation # Not strictly needed for this cell if Model uses it internally

# Dictionary of graded results; the last cell of the notebook saves it to
# Lab5_output.npy and asserts on its keys. Re-running this cell rebinds `output`
# to an empty dict and discards everything stored in it so far.
output = {}

def zero_pad(X, pad):
    """
    Surround every image of a batch with a border of zeros.

    Only the height and width axes grow; the batch axis and the channel axis
    are left untouched.

    Argument:
    X -- python numpy array of shape (m, n_H, n_W, n_C), where m is the number of examples.
    pad -- integer, number of zero rows/columns added on each side of the height and width axes

    Returns:
    X_pad -- padded images, array of shape (m, n_H + 2*pad, n_W + 2*pad, n_C)
    """

    ### START CODE HERE ###
    untouched = (0, 0)
    spatial = (pad, pad)
    # One (before, after) pair per axis, in the order: batch, height, width, channel.
    pad_width = (untouched, spatial, spatial, untouched)
    X_pad = np.pad(X, pad_width, mode='constant', constant_values=0)
    ### END CODE HERE ###

    return X_pad

In [None]:
def conv_single_step(a_slice_prev, W, b):
    """
    Apply one filter to one window of the previous layer's activations.

    Arguments:
    a_slice_prev -- slice of previous activation layer output with shape (filter_size, filter_size, n_C_prev)
    W -- Weight parameters contained in a window - array of shape (filter_size, filter_size, n_C_prev)
    b -- Bias of the filter - a single value, given either as a plain scalar or as an
         array holding exactly one element (e.g. shape (1, 1, 1))

    Returns:
    Z -- a scalar value, result of convolving the sliding window (W, b) on a slice x of the input data
    """

    ### START CODE HERE ###
    # Steps 1-2: element-wise product of the window and the filter, summed to one scalar
    Z = np.sum(a_slice_prev * W)
    # Step 3: add the bias. `.item()` pulls the single element out of an array of any
    # rank; calling float() directly on a (1, 1, 1) array is deprecated by NumPy
    # because only 0-dimensional arrays are meant to convert to Python scalars.
    Z = Z + float(np.asarray(b).item())
    ### END CODE HERE ###

    return Z

In [None]:
class Conv():
    """Convolution layer: hyper-parameters, trainable parameters, gradients and cache.

    Attributes set by __init__:
    filter_size, input_channel, output_channel, pad, stride, seed -- constructor arguments, stored as given
    parameters -- dict with 'W' of shape (filter_size, filter_size, input_channel, output_channel)
                  and 'b' of shape (1, 1, 1, output_channel)
    dW, db, cache -- all start as None
    """

    def __init__(self, filter_size=2, input_channel=3, output_channel=8, pad=1, stride=1, seed=1):
        # Geometry of the filters
        self.filter_size, self.input_channel, self.output_channel = filter_size, input_channel, output_channel
        self.pad, self.stride = pad, stride
        # Seed consumed by initialize_parameters()
        self.seed = seed

        self.parameters = {'W': None, 'b': None}
        self.initialize_parameters()

        # Gradient and cache slots, empty until the layer has been run
        self.dW = self.db = None
        self.cache = None

    def initialize_parameters(self):
        """Fill self.parameters with uniformly drawn weights and zero biases.

        W is drawn from U(-limit, limit) with
        limit = sqrt(6 / (filter_size^2 * (input_channel + output_channel))).
        NumPy's global random generator is re-seeded with self.seed before the draw.
        """
        f, c_in, c_out = self.filter_size, self.input_channel, self.output_channel
        w_shape = (f, f, c_in, c_out)
        b_shape = (1, 1, 1, c_out)

        np.random.seed(self.seed)
        limit = np.sqrt(6.0 / (c_in * f * f + c_out * f * f))
        W = np.random.uniform(low=-limit, high=limit, size=w_shape)
        b = np.zeros(b_shape)

        assert W.shape == w_shape
        assert b.shape == b_shape
        self.parameters.update(W=W, b=b)

# conv_single_step(a_slice_prev, W, b) has no `self` parameter, so it has to be
# attached as a static method. As a plain function attribute, a call such as
# `self.conv_single_step(a, W, b)` would pass the instance as an extra first
# argument and raise TypeError.
Conv.conv_single_step = staticmethod(conv_single_step)

In [None]:
def forward(self, A_prev):
    """ Implements the forward propagation for a convolution layer

    Arguments:
    A_prev -- output activations of the previous layer, numpy array of shape (m, n_H_prev, n_W_prev, n_C_prev)

    Returns:
    Z -- conv output, numpy array of shape (m, n_H, n_W, n_C), where
         n_H = floor((n_H_prev - f + 2*pad) / stride) + 1 and likewise for n_W

    Side effects:
    Stores the (unpadded) A_prev in self.cache for the backward pass.
    """

    ### START CODE HERE ###
    # Retrieve dimensions from A_prev's shape
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape

    # W has shape (filter_size, filter_size, input_channel, output_channel),
    # b has shape (1, 1, 1, output_channel)
    W = self.parameters['W']
    b = self.parameters['b']
    (f, _, n_C_prev_from_W, n_C) = W.shape
    assert n_C_prev == n_C_prev_from_W, f"Number of input channels in A_prev ({n_C_prev}) and W ({n_C_prev_from_W}) must match"
    assert f == self.filter_size, f"Filter size in W ({f}) and self.filter_size ({self.filter_size}) must match"
    assert n_C == self.output_channel, f"Output channels in W ({n_C}) and self.output_channel ({self.output_channel}) must match"

    # The single-step helper takes (a_slice_prev, W, b) and no `self`. Looked up on the
    # instance it would be bound and receive the layer as an extra first argument
    # (TypeError); looked up on the class it is the bare function.
    single_step = type(self).conv_single_step

    # Step 1: Output Dimension Calculation
    pad = self.pad
    stride = self.stride
    n_H = int(np.floor((n_H_prev - f + 2 * pad) / stride)) + 1
    n_W = int(np.floor((n_W_prev - f + 2 * pad) / stride)) + 1

    # Initialize the output volume Z with zeros
    Z = np.zeros((m, n_H, n_W, n_C))

    # Step 2: Padding
    A_prev_pad = zero_pad(A_prev, pad)

    # Step 3: Loop Through Training Examples
    for i in range(m):                      # loop over the batch of training examples
        a_prev_pad_example = A_prev_pad[i]
        for h in range(n_H):                # loop over vertical axis of the output volume
            vert_start = h * stride
            vert_end = vert_start + f
            for w in range(n_W):            # loop over horizontal axis of the output volume
                horiz_start = w * stride
                horiz_end = horiz_start + f

                # Step 3-1: Extracting slices
                # The window does not depend on the output channel, so it is cut once
                # and reused by every filter.
                a_slice_prev = a_prev_pad_example[vert_start:vert_end, horiz_start:horiz_end, :]

                # Step 3-2: Applying Filters
                for c_out in range(n_C):    # loop over channels (= #filter) of the output volume
                    Z[i, h, w, c_out] = single_step(a_slice_prev, W[:, :, :, c_out], b[:, :, :, c_out])
    ### END CODE HERE ###
    # Making sure your output shape is correct
    assert(Z.shape == (m, n_H, n_W, n_C))

    # Save information in "cache" for the backward pass
    self.cache = A_prev

    return Z

# Register the function defined above as the Conv.forward method. The class keeps
# this function object even though later cells reuse the module-level name `forward`.
Conv.forward = forward

In [None]:
def backward(self, dZ):
    """ Implement the backward propagation for a convolution layer

    Arguments:
    dZ -- gradient of the cost with respect to the output of the conv layer (Z), numpy array of shape (m, n_H, n_W, n_C)

    Returns:
    dA_prev -- gradient of the cost with respect to the input of the conv layer (A_prev),
                numpy array of shape (m, n_H_prev, n_W_prev, n_C_prev)

    Side effects:
    Stores the parameter gradients in self.dW (same shape as W) and self.db (same shape as b).
    Reads the input saved by forward() from self.cache.
    """

    A_prev = self.cache

    ### START CODE HERE ###

    # Retrieve dimensions from A_prev's shape
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape

    # Retrieve dimensions from W's shape; W has shape (f, f, n_C_prev, n_C)
    W = self.parameters['W']
    (f, _, _, n_C) = W.shape

    # Retrieve stride and pad from self
    stride = self.stride
    pad = self.pad

    # Retrieve dimensions from dZ's shape
    (m_dZ, n_H, n_W, n_C_dZ) = dZ.shape
    assert m == m_dZ, f"Number of examples in dZ ({m_dZ}) must match the cached input ({m})"
    assert n_C == n_C_dZ, f"Number of channels in dZ ({n_C_dZ}) must match output channels in W ({n_C})"

    # Initialize Gradients
    # The input gradient is accumulated with in-place additions of float values, so its
    # buffer must not inherit an integer dtype from A_prev (e.g. raw integer images):
    # NumPy refuses to add floats in place into an integer array.
    grad_dtype = np.result_type(A_prev.dtype, W.dtype, dZ.dtype)
    dA_prev_pad = np.zeros((m, n_H_prev + 2 * pad, n_W_prev + 2 * pad, n_C_prev), dtype=grad_dtype)
    dW = np.zeros_like(W)                       # Shape: (f, f, n_C_prev, n_C)
    db = np.zeros_like(self.parameters['b'])    # Shape: (1, 1, 1, n_C)

    # Padding A_prev
    A_prev_pad = zero_pad(A_prev, pad)

    # Loop Through Training Examples
    for i in range(m):                          # loop over the batch of training examples
        a_prev_pad = A_prev_pad[i]
        # View into the padded gradient buffer: in-place updates below write into dA_prev_pad
        da_prev_pad_example = dA_prev_pad[i]

        for h in range(n_H):                    # loop over vertical axis of the output volume
            vert_start = h * stride
            vert_end = vert_start + f
            for w in range(n_W):                # loop over horizontal axis of the output volume
                horiz_start = w * stride
                horiz_end = horiz_start + f

                # Extracting Slices (independent of the filter index, so done once per window)
                a_slice = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]              # shape (f, f, n_C_prev)
                da_slice = da_prev_pad_example[vert_start:vert_end, horiz_start:horiz_end, :]    # view, same shape

                for c_filter in range(n_C):     # loop over the channels of the output volume (number of filters)
                    # Update Gradients; dZ[i, h, w, c_filter] is a scalar
                    upstream = dZ[i, h, w, c_filter]
                    da_slice += W[:, :, :, c_filter] * upstream
                    dW[:, :, :, c_filter] += a_slice * upstream
                    db[:, :, :, c_filter] += upstream

    # Remove Padding: keep only the region that corresponds to the unpadded input.
    # Explicit end indices also cover pad == 0, where a `pad:-pad` slice would be empty.
    dA_prev = dA_prev_pad[:, pad:pad + n_H_prev, pad:pad + n_W_prev, :].copy()

    ### END CODE HERE ###

    assert(dA_prev.shape == (m, n_H_prev, n_W_prev, n_C_prev))

    # Store gradients in self
    self.dW = dW
    self.db = db

    return dA_prev

# Register the function defined above as the Conv.backward method. The class keeps
# this function object even though later cells reuse the module-level name `backward`.
Conv.backward = backward

In [None]:
def update(self, learning_rate):
    """ Take one gradient-descent step on the layer's weights and biases

    Arguments:
    learning_rate -- step size

    self.parameters["W"] and self.parameters["b"] are rebound to new arrays computed
    from self.dW and self.db; the arrays stored before the call are not modified in place.
    """

    ### START CODE HERE ###
    gradients = {"W": self.dW, "b": self.db}
    for name, gradient in gradients.items():
        self.parameters[name] = self.parameters[name] - learning_rate * gradient
    ### END CODE HERE ###

# Register the function defined above as the Conv.update method. The class keeps
# this function object even though a later cell reuses the module-level name `update`.
Conv.update = update

In [None]:
class MaxPool():
    """Max-pooling layer: stores the window size, the stride and a cache slot."""

    def __init__(self, pool_size=2, stride=2):
        self.pool_size, self.stride = pool_size, stride
        self.cache = None

    def create_mask_from_window(self, x):
        """ Marks where the maximum of a window sits.

        Arguments:
        x -- Array of shape (pool_size, pool_size)

        Returns:
        mask -- Boolean array of the same shape as x, True wherever the entry equals
                the maximum of x (every tied position is True).
        """
        ### START CODE HERE ###
        window_max = np.max(x)
        mask = np.equal(x, window_max)
        ### END CODE HERE ###
        return mask

In [None]:
def forward(self, A_prev):
    """ Implements the forward pass of the max pooling layer

    Arguments:
    A_prev -- Input data, numpy array of shape (m, n_H_prev, n_W_prev, n_C_prev)

    Returns:
    A -- output of the pool layer, a numpy array of shape (m, n_H, n_W, n_C)

    The input is kept in self.cache for the backward pass.
    """

    ### START CODE HERE ###
    batch, in_height, in_width, channels = A_prev.shape
    window = self.pool_size
    step = self.stride

    # Step 1: Output Dimension Calculation (the channel count is unchanged by pooling)
    out_height = int(np.floor((in_height - window) / step)) + 1
    out_width = int(np.floor((in_width - window) / step)) + 1

    A = np.zeros((batch, out_height, out_width, channels))

    # Step 2: Slide the window over the output grid. Each window is reduced for all
    # examples and all channels at once; the maximum is taken over the two spatial axes.
    for row in range(out_height):
        top = row * step
        for col in range(out_width):
            left = col * step
            patch = A_prev[:, top:top + window, left:left + window, :]
            A[:, row, col, :] = patch.max(axis=(1, 2))

    ### END CODE HERE ###

    # Store the input in "cache" for backward pass
    self.cache = A_prev

    # Making sure your output shape is correct
    assert(A.shape == (batch, out_height, out_width, channels))

    return A

# Register the function defined above as the MaxPool.forward method. The class keeps
# this function object even though later cells reuse the module-level name `forward`.
MaxPool.forward = forward

In [None]:
def backward(self, dA):
    """ Implements the backward pass of the max pooling layer

    Each upstream gradient value is routed to the position(s) of its pooling window
    that held the maximum in the forward pass; contributions of overlapping windows
    are summed.

    Arguments:
    dA -- gradient of cost with respect to the output of the pooling layer, same shape as A

    Returns:
    dA_prev -- gradient of cost with respect to the input of the pooling layer, same shape as A_prev
    """

    # Retrieve information from cache
    A_prev = self.cache
    pool_size = self.pool_size
    stride = self.stride

    ### START CODE HERE ###

    # Retrieve dimensions from A_prev's shape and dA's shape
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    (m_dA, n_H, n_W, n_C) = dA.shape
    assert m == m_dA, "Number of examples in A_prev and dA must be the same"
    assert n_C_prev == n_C, "Number of channels in A_prev and dA must be the same"

    # Step 1: Initialize Gradients
    # The buffer receives in-place additions of dA values, so it takes the promoted
    # dtype of input and gradient instead of blindly copying A_prev's dtype: an
    # integer A_prev would otherwise make the float additions below fail.
    dA_prev = np.zeros(A_prev.shape, dtype=np.result_type(A_prev.dtype, dA.dtype))

    # Step 2: Loop Through Training Examples
    for i in range(m):                               # loop over the batch of training examples
        a_prev_example = A_prev[i]
        for h in range(n_H):                         # loop on the vertical axis of the output volume
            vert_start = h * stride
            vert_end = vert_start + pool_size
            for w in range(n_W):                     # loop on the horizontal axis of the output volume
                horiz_start = w * stride
                horiz_end = horiz_start + pool_size
                for c_loop in range(n_C):            # loop over the channels of the output volume

                    # Step 2-1: Extracting slices from a_prev_example
                    a_prev_slice = a_prev_example[vert_start:vert_end, horiz_start:horiz_end, c_loop]

                    # Step 2-2: Pass through the Gradients
                    mask = self.create_mask_from_window(a_prev_slice)
                    dA_prev[i, vert_start:vert_end, horiz_start:horiz_end, c_loop] += mask * dA[i, h, w, c_loop]

    ### END CODE HERE ###

    # Make sure your output shape is correct
    assert(dA_prev.shape == A_prev.shape)

    return dA_prev

# Register the function defined above as the MaxPool.backward method. The class keeps
# this function object even though later cells reuse the module-level name `backward`.
MaxPool.backward = backward

In [None]:
class Flatten():
    """Layer that flattens each example of a batch into a single row.

    Its forward and backward methods are attached to the class further down in the file.
    """
    def __init__(self):
        # Written by forward() with the shape of its input; read by backward().
        self.cache = None

In [None]:
def forward(self, A_prev):
    """ Implements the forward pass of the flatten layer

    Arguments:
    A_prev -- Input data, numpy array of shape (m, n_H_prev, n_W_prev, n_C_prev)

    Returns:
    A -- output of the flatten layer, a 2-dimensional array of shape (m, (n_H_prev * n_W_prev * n_C_prev))

    The shape of A_prev is remembered in self.cache so the backward pass can restore it.
    """

    # Save information in "cache" for the backward pass
    input_shape = A_prev.shape
    self.cache = input_shape

    ### START CODE HERE ###
    # Keep the batch axis, collapse all remaining axes into one
    batch_size = input_shape[0]
    A = A_prev.reshape((batch_size, -1))
    ### END CODE HERE ###
    return A

# Register the function defined above as the Flatten.forward method. The class keeps
# this function object even though a later cell reuses the module-level name `forward`.
Flatten.forward = forward

In [None]:
def backward(self, dA):
    """ Implements the backward pass of the flatten layer

    Arguments:
    dA -- Gradient with respect to the flattened output, a 2-dimensional array

    Returns:
    dA_prev -- The same values arranged in the shape stored in self.cache
               (the shape of the input last given to forward).
    """
    ### START CODE HERE ###
    original_shape = self.cache
    dA_prev = dA.reshape(original_shape)
    ### END CODE HERE ###
    return dA_prev

# Register the function defined above as the Flatten.backward method. The class keeps
# this function object even though a later cell reuses the module-level name `backward`.
Flatten.backward = backward

In [None]:
class Model():
    """Sequential container holding an ordered list of layers.

    Its forward, backward and update methods are attached to the class in later cells.
    """
    def __init__(self):
        # Layers in the order they were added
        self.layers=[]

    def add(self, layer):
        """Append `layer` to the end of the layer list."""
        self.layers.append(layer)

In [None]:
def forward(self, X):
    """Feed X through every layer in the order the layers were added.

    Arguments:
    X -- input handed to the first layer's forward method

    Returns:
    The output of the last layer's forward method (X itself if there are no layers).
    """
    ### START CODE HERE ###
    activation = X
    for layer in self.layers:
        # Each layer consumes the output of the one before it
        activation = layer.forward(activation)
    ### END CODE HERE ###
    return activation

# Register the function defined above as the Model.forward method.
Model.forward = forward

In [None]:
def backward(self, AL=None, Y=None):
    """Back-propagate the binary cross-entropy loss through every layer, last to first.

    Arguments:
    AL -- output of the model's forward pass (predicted probabilities)
    Y -- true labels; reshaped to AL's shape when the two shapes differ

    Returns:
    dA_prev -- the value returned by the first layer's backward method
               (the loss gradient itself if the model has no layers)

    Raises:
    ValueError -- if AL or Y is not provided
    """
    if AL is None or Y is None:
        raise ValueError("Model.backward requires both AL (model output) and Y (labels)")

    ### START CODE HERE ###
    AL = np.asarray(AL)
    Y = np.asarray(Y)
    # Ensure Y has the same shape as AL for element-wise operations
    if Y.shape != AL.shape:
        Y = Y.reshape(AL.shape)

    # dL/dAL of the BCE loss: -(Y / AL - (1 - Y) / (1 - AL)).
    # AL is clipped away from exactly 0 and 1 so the divisions stay finite. Replacing
    # the quotient by 0 at those points instead would silently drop the gradient of a
    # maximally wrong prediction (AL == 0 with Y == 1, or AL == 1 with Y == 0).
    eps = np.finfo(AL.dtype).eps if AL.dtype.kind == 'f' else np.finfo(np.float64).eps
    AL_safe = np.clip(AL, eps, 1.0 - eps)
    dAL = -(Y / AL_safe - (1 - Y) / (1 - AL_safe))

    # Walk the layers from the last one to the first; each layer turns the gradient with
    # respect to its output into the gradient with respect to its input.
    dA_prev = dAL
    for current_layer in reversed(self.layers):
        dA_prev = current_layer.backward(dA_prev)
    ### END CODE HERE ###

    return dA_prev

# Register the function defined above as the Model.backward method.
Model.backward = backward

In [None]:
def update(self, learning_rate):
    """Ask every layer that can be trained to take one gradient-descent step.

    Arguments:
    learning_rate -- step size forwarded unchanged to each layer's update method

    Layers without an `update` attribute are skipped.
    """

    # Only convolution layer and dense layer have to update parameters
    ### START CODE HERE ###
    for layer in self.layers:
        if not hasattr(layer, 'update'):
            continue
        layer.update(learning_rate)
    ### END CODE HERE ###

# Register the function defined above as the Model.update method.
Model.update = update

## 4. Application in a real case

Now, you have all the necessary functions to build your own ConvNet. In this final part, you will get to implement a ConvNet that classifies images from the dataset.

### 4.1 Read the data

In [None]:
# Load the data from the .npz file
### START CODE HERE ###
# Note: Replace 'Lab5_data.npz' with the actual filename if it's different.
# Note: Adjust the keys ('X_train', 'y_train', 'X_test') if they are different in your .npz file.
# np.load on an .npz archive returns an NpzFile that keeps the file open and reads each
# array on access; the `with` block closes it once the three arrays are in memory.
with np.load('Lab5_data.npz') as data:
    X_train = data['X_train']
    y_train = data['y_train']
    X_test = data['X_test'] # Or use the correct key for test images, e.g., 'X_test_images'
### END CODE HERE ###

# Example: Print shapes to verify
# print("X_train shape:", X_train.shape)
# print("y_train shape:", y_train.shape)
# print("X_test shape:", X_test.shape)

### 4.2 Mini-batch gradient descent

In [None]:
def random_mini_batches(X, Y, mini_batch_size = 64):
    """
    Creates a list of random minibatches from (X, Y)

    The examples are shuffled with NumPy's global random generator (one call to
    np.random.permutation) and then cut into consecutive chunks; the last chunk is
    smaller when the number of examples is not a multiple of mini_batch_size.

    Arguments:
    X -- input data whose first axis indexes the examples,
         e.g. shape (number of examples, n_H, n_W, n_C) for images
    Y -- true "label" vector, of shape (number of examples, 1); a 1-D vector of shape
         (number of examples,) is also accepted and treated as a single column
    mini_batch_size -- size of the mini-batches, positive integer

    Returns:
    mini_batches -- list of synchronous (mini_batch_X, mini_batch_Y)

    Raises:
    ValueError -- if X and Y hold a different number of examples, or if
                  mini_batch_size is smaller than 1
    """

    X = np.asarray(X)
    Y = np.asarray(Y)
    if Y.ndim == 1:
        # Labels stored as a flat vector: promote to the documented (m, 1) column
        Y = Y.reshape(-1, 1)

    m = X.shape[0]  # number of training examples
    if Y.shape[0] != m:
        raise ValueError(f"X has {m} examples but Y has {Y.shape[0]}")
    if mini_batch_size < 1:
        raise ValueError(f"mini_batch_size must be a positive integer, got {mini_batch_size}")

    ### START CODE HERE ###

    # Step 1: Shuffle (X, Y) with the same permutation so the pairs stay aligned.
    # Indexing only the first axis works for inputs of any rank.
    permutation = np.random.permutation(m)
    shuffled_X = X[permutation]
    shuffled_Y = Y[permutation]

    # Step 2 - Partition (shuffled_X, shuffled_Y). Stepping through the start offsets
    # covers the complete batches and the shorter final batch alike, because a slice
    # that runs past the end is simply truncated.
    mini_batches = [
        (shuffled_X[start:start + mini_batch_size], shuffled_Y[start:start + mini_batch_size])
        for start in range(0, m, mini_batch_size)
    ]

    ### END CODE HERE ###
    return mini_batches

### 4.3 Start training

In [None]:
# This cell builds the model from Dense and Activation as well as the layers defined
# above, so both must be imported here; with the imports commented out the cell fails
# with a NameError on a fresh kernel.
# NOTE(review): module names follow the commented-out imports of the original
# notebook (Dense.py / Activation.py next to this notebook) - confirm they exist.
from Dense import Dense
from Activation import Activation

### START CODE HERE ###
learning_rate = 3e-5
num_iterations = 20   # number of passes over the training set (epochs)
batch_size = 64
costs = []            # average mini-batch cost of every epoch

model = Model()
# Input images are 32x32x1 (as per problem description)
model.add(Conv(filter_size=3, input_channel=1, output_channel=8, pad=1, stride=1, seed=1))
model.add(Activation("relu", None))
model.add(MaxPool(pool_size=2, stride=2))
# After MaxPool (2x2, stride 2) on 32x32, output is 16x16x8
# Optional second convolution block:
# model.add(Conv(filter_size=3, input_channel=8, output_channel=16, pad=1, stride=1, seed=2))
# model.add(Activation("relu", None))
# model.add(MaxPool(pool_size=2, stride=2))
# After second MaxPool (2x2, stride 2) on 16x16, output is 8x8x16
model.add(Flatten())
# Flattened output: 16*16*8 = 2048 features with one Conv block, 8*8*16 = 1024 with two.
# NOTE(review): confirm that Dense(128, 1) matches the Dense constructor's contract -
# if its first argument is the input size it has to equal the flattened feature count.
model.add(Dense(128, 1))
model.add(Activation("sigmoid", None))


# Training loop
for i in range(num_iterations):
    print("epoch: ",i)

    mini_batches = random_mini_batches(X_train, y_train, batch_size)
    epoch_cost = 0.   # sum of the mini-batch costs of this epoch
    num_batches = len(mini_batches)

    for batch in mini_batches:
        (x_batch, y_batch) = batch

        # Forward pass
        AL = model.forward(x_batch)

        # Compute cost
        # The model has a single output unit, so AL holds one probability per example,
        # laid out either as a row or as a column. Labels and predictions are both
        # brought to row vectors of shape (1, batch) so they always agree for the loss.
        # NOTE(review): the argument order (Y, AL) is kept from the original call -
        # confirm it against the signature in Loss.py.
        y_row = y_batch.reshape(1, -1)
        AL_row = AL.reshape(1, -1)
        current_batch_cost = compute_BCE_loss(y_row, AL_row)
        # The loss may come back as a size-1 array; reduce it to a plain float
        epoch_cost += float(np.squeeze(current_batch_cost))

        # Backward pass (labels laid out exactly like the model output)
        model.backward(AL, y_batch.reshape(AL.shape))

        # Update parameters
        model.update(learning_rate)

    # Print the cost (average cost over batches for the epoch)
    epoch_avg_cost = epoch_cost / num_batches
    print ("Cost after iteration %i: %f" %(i, epoch_avg_cost))
    costs.append(epoch_avg_cost)

### END CODE HERE ###

### 4.4 Evaluate your model

In [None]:
### START CODE HERE ###
# Learning curve: one averaged cost value per epoch
cost_curve = np.squeeze(costs)
plt.plot(cost_curve)
plt.xlabel('epochs')
plt.ylabel('cost')
plt.title("Learning rate =" + str(learning_rate))
plt.show()

# Run the trained model on the training set; the labels are passed along so that
# predict can report the accuracy (as the helper in Predict.py is expected to do)
print('training------')
pred_train = predict(model, X_train, y_train)
### END CODE HERE ###

### 4.5 Generate the prediction

In [None]:
# Requires X_test from section 4.1 and the trained model from section 4.3.

### START CODE HERE ###
pred_test = predict(model, X_test) # predict function from Predict.py

# One row per test sample: the running index as ID, the prediction cast to int and
# flattened to one dimension as Label.
sample_ids = range(len(X_test))
labels = pred_test.astype(int).flatten()
df = pd.DataFrame({'ID': sample_ids, 'Label': labels})
df.to_csv('Lab5_prediction.csv', index=False, mode='w')

print("Lab5_prediction.csv generated.")
### END CODE HERE ###

## 5. Generate Lab5_output.npy

In [None]:
# Saves the `output` dictionary and checks that it holds exactly the expected keys.
# The dictionary has to be filled by the "Test and Evaluate" cells before this cell runs.

### START CODE HERE ###
np.save("Lab5_output.npy", output) # output should be the dictionary

# sanity check: read the file back (a dict is stored as a pickled object, hence allow_pickle)
submit = np.load("Lab5_output.npy", allow_pickle=True).item()
for key, value in submit.items():
    print(str(key) + ": " + str(type(value)))

# Expected keys for the assertion, in the order they are expected to have been inserted:
expected_keys = [
    'zero_padding', 'conv_single_step', 'conv_forward_1', 'conv_forward_2',
    'conv_forward_3','conv_backward_1', 'conv_backward_2', 'conv_backward_3',
    'conv_update_1', 'conv_update_2', 'maxpool_forward', 'maxpool_backward',
    'flatten_forward', 'flatten_backward', 'model_1', 'model_2', 'model_3', 'model_4'
]
actual_keys = list(output.keys())
missing_keys = [key for key in expected_keys if key not in output]
unexpected_keys = [key for key in actual_keys if key not in expected_keys]
# Same condition as a bare comparison, but a failure now says what is wrong
assert actual_keys == expected_keys, (
    "output must contain exactly the expected keys, in the expected order. "
    f"Missing: {missing_keys}; unexpected: {unexpected_keys}; found: {actual_keys}"
)

print("\nLab5_output.npy generated and sanity check passed.")
### END CODE HERE ###