In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import random

In [None]:

# Fetch the Fashion-MNIST train/test splits through the Keras dataset helper
fashion_mnist = keras.datasets.fashion_mnist
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

# Divide pixel values by 255 and flatten every image into a 784-element row
# (assumes 8-bit pixel intensities, so the result lies in [0, 1] -- TODO confirm)
X_train, X_test = (
    (images / 255.0).reshape(-1, 784) for images in (X_train, X_test)
)

class Dense_layer:
    """Fully connected layer with L2 weight regularization and gradient clipping.

    Weights are drawn from a standard normal distribution scaled by
    ``sqrt(1 / input_size)``; biases start at zero.

    Args:
        input_size: Number of input features per sample.
        no_of_neurons: Number of output units.
        l2_lambda: Strength of the L2 penalty applied to the weights.
    """

    def __init__(self, input_size, no_of_neurons, l2_lambda=0.0001):
        self.W = np.random.randn(input_size, no_of_neurons) * np.sqrt(1.0 / input_size)
        self.b = np.zeros((1, no_of_neurons))
        self.l2_lambda = l2_lambda

    def forward(self, X):
        """Store ``X @ W + b`` in ``self.output`` and cache ``X`` for backward."""
        self.input = X
        self.output = np.dot(X, self.W) + self.b

    def backward(self, dl_dw):
        """Compute parameter and input gradients from the upstream gradient.

        Args:
            dl_dw: Gradient of the loss with respect to this layer's output.

        Sets ``self.dweights``, ``self.dbiases`` (both clipped element-wise to
        [-1, 1]) and ``self.dinputs`` (unclipped, passed to the previous layer).
        """
        # Data-term gradients
        self.dweights = np.dot(self.input.T, dl_dw)
        self.dbiases = np.sum(dl_dw, axis=0, keepdims=True)

        # L2 term: d/dW [lambda * sum(W^2)] = 2 * lambda * W, which keeps the
        # gradient consistent with the penalty reported by get_l2_loss().
        self.dweights += 2 * self.l2_lambda * self.W

        # Clip parameter gradients to prevent explosion
        self.dweights = np.clip(self.dweights, -1.0, 1.0)
        self.dbiases = np.clip(self.dbiases, -1.0, 1.0)

        self.dinputs = np.dot(dl_dw, self.W.T)

    def get_l2_loss(self):
        """Return the L2 penalty ``l2_lambda * sum(W ** 2)`` for this layer."""
        return self.l2_lambda * np.sum(self.W ** 2)

class Activation_ReLU:
    """Rectified linear unit whose backward pass also clips the gradient."""

    def forward(self, inputs):
        """Keep ``inputs`` for backward and store ``max(0, inputs)`` in ``self.output``."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Zero the gradient where the input was non-positive, then clip to [-1, 1]."""
        inactive = self.inputs <= 0
        grad = dvalues.copy()  # leave the caller's array untouched
        grad[inactive] = 0
        self.dinputs = np.clip(grad, -1.0, 1.0)

class Activation_softmax:
    """Row-wise softmax with input clipping and a guarded denominator."""

    def forward(self, inputs):
        """Store the softmax of each row of ``inputs`` in ``self.output``."""
        # Bound the logits first, then shift each row so its maximum is 0
        bounded = np.clip(inputs, -500, 500)
        shifted = bounded - bounded.max(axis=1, keepdims=True)
        weights = np.exp(shifted)
        # The small constant guards the division against a zero row sum
        row_totals = weights.sum(axis=1, keepdims=True) + 1e-10
        self.output = weights / row_totals

class CategoricalCrossentropy:
    """Mean categorical cross-entropy for sparse or one-hot labels."""

    def forward(self, y_true, y_predicted):
        """Return the mean negative log-likelihood of the true classes.

        Args:
            y_true: Either a 1-D sequence of integer class indices or a 2-D
                one-hot (or soft-label) array with one row per sample.
            y_predicted: 2-D array of predicted class probabilities.

        Returns:
            The per-sample losses, each clipped to [0, 100], averaged.

        Raises:
            ValueError: If ``y_true`` is neither 1-D nor 2-D.
        """
        # Accept plain lists/tuples as labels, not only ndarrays
        y_true = np.asarray(y_true)

        # Keep probabilities away from 0 so the log stays finite
        y_predicted = np.clip(y_predicted, 1e-10, 1.0)

        if y_true.ndim == 1:
            confidences = y_predicted[np.arange(len(y_true)), y_true]
        elif y_true.ndim == 2:
            confidences = np.sum(y_true * y_predicted, axis=1)
            confidences = np.clip(confidences, 1e-10, 1.0)
        else:
            raise ValueError(
                f"y_true must be 1-D (class indices) or 2-D (one-hot), "
                f"got {y_true.ndim} dimensions"
            )

        log_probs = np.clip(-np.log(confidences), 0, 100)
        return np.mean(log_probs)
        
class Activation_Softmax_Loss_CategoricalCrossentropy:
    """Softmax activation fused with categorical cross-entropy loss."""

    def __init__(self):
        self.activation = Activation_softmax()
        self.loss = CategoricalCrossentropy()
        self.output = None  # filled with softmax probabilities by forward()

    def forward(self, inputs, y_true):
        """Apply softmax to ``inputs``, keep the result, and return the loss."""
        self.activation.forward(inputs)
        probabilities = self.activation.output
        self.output = probabilities
        return self.loss.forward(y_true, probabilities)

    def backward(self, y_true):
        """Store the batch-averaged gradient (probabilities minus one-hot) in ``self.dinputs``."""
        n_samples = len(self.output)

        # Reduce one-hot rows to class indices
        labels = np.argmax(y_true, axis=1) if len(y_true.shape) == 2 else y_true

        grad = self.output.copy()
        grad[range(n_samples), labels] -= 1
        grad = grad / n_samples

        # Clip gradients
        self.dinputs = np.clip(grad, -1.0, 1.0)

def accuracy(y_predict, y_true):
    """Return the fraction of rows of ``y_predict`` whose argmax equals the label.

    ``y_true`` may hold class indices (1-D) or one-hot rows (2-D). The result
    is 0.0 when ``y_predict`` is None or contains NaN. If the two inputs differ
    in length, only the common leading part is compared.
    """
    if y_predict is None:
        return 0.0

    # One-hot rows -> class indices
    labels = np.argmax(y_true, axis=1) if len(y_true.shape) == 2 else y_true

    if np.isnan(y_predict).any():
        return 0.0

    common = min(len(y_predict), len(labels))
    predicted_classes = np.argmax(y_predict[:common], axis=1)
    return np.mean(predicted_classes == labels[:common])

class optimizer_Adam:
    """Adam optimizer with inverse-time learning-rate decay and clipped steps."""

    def __init__(self, learning_rate=0.001, decay=1e-4, epsilon=1e-8, beta_1=0.9, beta_2=0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def pre_update_params(self):
        """Refresh ``current_learning_rate`` from the iteration count when decay is set."""
        if not self.decay:
            return
        shrink = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * shrink

    def _moment_step(self, grad, momentum, cache):
        """Return ``(new_momentum, new_cache, clipped_update)`` for one parameter array."""
        step = self.iterations + 1

        # Exponential moving averages of the gradient and the squared gradient
        momentum = self.beta_1 * momentum + (1 - self.beta_1) * grad
        cache = self.beta_2 * cache + (1 - self.beta_2) * (grad) ** 2

        # Bias-corrected estimates
        momentum_hat = momentum / (1 - self.beta_1 ** step)
        cache_hat = cache / (1 - self.beta_2 ** step)

        update = self.current_learning_rate * momentum_hat / (np.sqrt(cache_hat) + self.epsilon)

        # Clip updates
        return momentum, cache, np.clip(update, -0.1, 0.1)

    def update_params(self, layer):
        """Apply one Adam step to ``layer.W`` and ``layer.b`` using its stored gradients.

        Per-layer optimizer state is created on first use. If either gradient
        contains NaN, a warning is printed and the layer is left unchanged.
        """
        if not hasattr(layer, 'weight_momentums'):
            layer.weight_momentums = np.zeros_like(layer.W)
            layer.weight_cache = np.zeros_like(layer.W)
            layer.bias_momentums = np.zeros_like(layer.b)
            layer.bias_cache = np.zeros_like(layer.b)

        # Check for NaN in gradients
        has_nan = np.any(np.isnan(layer.dweights)) or np.any(np.isnan(layer.dbiases))
        if has_nan:
            print("Warning: NaN detected in gradients, skipping update")
            return

        layer.weight_momentums, layer.weight_cache, weight_update = self._moment_step(
            layer.dweights, layer.weight_momentums, layer.weight_cache
        )
        layer.bias_momentums, layer.bias_cache, bias_update = self._moment_step(
            layer.dbiases, layer.bias_momentums, layer.bias_cache
        )

        layer.W -= weight_update
        layer.b -= bias_update

    def post_update_params(self):
        """Advance the iteration counter after all layers have been updated."""
        self.iterations += 1

class Layer_Dropout:
    """Dropout layer that rescales the kept units during training."""

    def __init__(self, rate=0.1):
        self.rate = rate
        self.mask = None

    def forward(self, inputs, training=True):
        """Store ``inputs`` masked and rescaled when training, unchanged otherwise."""
        self.inputs = inputs
        if not training:
            # Inference: identity, and no mask is kept for backward
            self.output = inputs
            self.mask = None
            return

        keep_prob = 1 - self.rate
        kept = np.random.binomial(1, keep_prob, size=inputs.shape)
        # The small constant avoids dividing by zero when keep_prob is 0
        self.mask = kept / (keep_prob + 1e-10)
        self.output = inputs * self.mask

    def backward(self, dvalues):
        """Apply the stored mask to ``dvalues``; copy them when no mask is stored."""
        if self.mask is None:
            self.dinputs = dvalues.copy()
        else:
            self.dinputs = dvalues * self.mask

# Build model
print("Building model...")

# Hidden block 1: 784 -> 128, ReLU, dropout
layer1 = Dense_layer(784, 128, l2_lambda=0.00001)
activation_relu_1 = Activation_ReLU()
dropout1 = Layer_Dropout(rate=0.1)

# Hidden block 2: 128 -> 64, ReLU, dropout
layer2 = Dense_layer(128, 64, l2_lambda=0.00001)
activation_relu_2 = Activation_ReLU()
dropout2 = Layer_Dropout(rate=0.1)

# Output layer: 64 -> 10 logits, fed to the fused softmax + cross-entropy
layer3 = Dense_layer(64, 10, l2_lambda=0.000001)

loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()

# Trainable layers, in forward order
all_layers = [layer1, layer2, layer3]

# Optimizer
optimizer = optimizer_Adam(learning_rate=0.0005, decay=1e-4, epsilon=1e-8)

epochs = 300
batch_size = 64

# For tracking progress (one entry per completed epoch)
train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []
l2_losses = []

# Hold out the leading 10% of the training set for validation
val_split = 0.1
val_size = int(X_train.shape[0] * val_split)
X_val = X_train[:val_size]
y_val = y_train[:val_size]
X_train_small = X_train[val_size:]
y_train_small = y_train[val_size:]

# Count batches over the samples that are actually trained on (after the
# validation split), rounding up so the final partial batch is included.
# Counting over the full X_train would schedule empty batches every epoch.
n_batches = -(-X_train_small.shape[0] // batch_size)

# Early-stopping state
best_val_accuracy = 0
patience = 20
patience_counter = 0
best_weights = None

print(f"Training samples: {X_train_small.shape[0]}")
print(f"Validation samples: {X_val.shape[0]}")
print("Starting training...")

def _forward_pass(X, training):
    """Run X through both hidden blocks and the output layer; return the logits.

    ``training`` is forwarded to the dropout layers, so dropout is only active
    when it is True.
    """
    layer1.forward(X)
    activation_relu_1.forward(layer1.output)
    dropout1.forward(activation_relu_1.output, training=training)

    layer2.forward(dropout1.output)
    activation_relu_2.forward(layer2.output)
    dropout2.forward(activation_relu_2.output, training=training)

    layer3.forward(dropout2.output)
    return layer3.output


for epoch in range(epochs):
    # Shuffle data so the mini-batches differ from epoch to epoch
    indices = np.random.permutation(X_train_small.shape[0])
    X_shuffled = X_train_small[indices]
    y_shuffled = y_train_small[indices]

    epoch_loss = 0
    epoch_accuracy = 0
    epoch_l2_loss = 0
    batch_count = 0

    # Training
    for batch in range(n_batches):
        start_idx = batch * batch_size
        end_idx = min(start_idx + batch_size, X_shuffled.shape[0])
        X_batch = X_shuffled[start_idx:end_idx]
        y_batch = y_shuffled[start_idx:end_idx]

        # Skip if batch is empty (n_batches may exceed the available samples)
        if len(X_batch) == 0:
            continue

        # Forward pass with dropout enabled
        logits = _forward_pass(X_batch, training=True)

        # Calculate losses
        ce_loss = loss_activation.forward(logits, y_batch)
        l2_loss = sum(layer.get_l2_loss() for layer in all_layers)

        # Check for NaN: skip the update rather than propagate it
        if np.isnan(ce_loss):
            print(f"Warning: NaN loss at epoch {epoch}, batch {batch}")
            continue

        # Backward pass, in reverse layer order
        loss_activation.backward(y_batch)

        layer3.backward(loss_activation.dinputs)

        dropout2.backward(layer3.dinputs)
        activation_relu_2.backward(dropout2.dinputs)
        layer2.backward(activation_relu_2.dinputs)

        dropout1.backward(layer2.dinputs)
        activation_relu_1.backward(dropout1.dinputs)
        layer1.backward(activation_relu_1.dinputs)

        # Update parameters
        optimizer.pre_update_params()
        for layer in all_layers:
            optimizer.update_params(layer)
        optimizer.post_update_params()

        # Calculate accuracy for this batch (forward-pass output, dropout active)
        batch_accuracy = accuracy(loss_activation.output, y_batch)

        # Only count batches with a usable accuracy
        if not np.isnan(batch_accuracy):
            epoch_accuracy += batch_accuracy
            epoch_loss += ce_loss
            epoch_l2_loss += l2_loss
            batch_count += 1

    if batch_count == 0:
        print(f"Epoch {epoch}: All batches resulted in NaN, stopping training")
        break

    # Average training metrics
    avg_train_loss = epoch_loss / batch_count
    avg_train_accuracy = epoch_accuracy / batch_count
    avg_l2_loss = epoch_l2_loss / batch_count
    train_losses.append(avg_train_loss)
    train_accuracies.append(avg_train_accuracy)
    l2_losses.append(avg_l2_loss)

    # Validation (dropout disabled)
    val_logits = _forward_pass(X_val, training=False)
    val_loss = loss_activation.forward(val_logits, y_val)
    val_accuracy = accuracy(loss_activation.output, y_val)

    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        patience_counter = 0
        # Snapshot the parameters; copies are required because the optimizer
        # updates W and b in place.
        best_weights = [(layer.W.copy(), layer.b.copy()) for layer in all_layers]
    else:
        patience_counter += 1

    if epoch % 25 == 0:
        print(f'Epoch {epoch:3d}: train_acc = {avg_train_accuracy:.4f}, val_acc = {val_accuracy:.4f}')

    if patience_counter >= patience:
        print(f"Early stopping at epoch {epoch}")
        break

# Restore the parameters from the best validation epoch so that later
# evaluation uses the model that best_val_accuracy refers to, not the weights
# reached after `patience` epochs without improvement.
if best_weights is not None:
    for layer, (best_W, best_b) in zip(all_layers, best_weights):
        layer.W = best_W.copy()
        layer.b = best_b.copy()



# Evaluate on the test set: the forward pass must run on X_test first so the
# layers hold test-set outputs before the metrics are computed.
print("\n" + "="*50)
print("Evaluating on test set...")

# Run forward pass on test data (dropout disabled)
layer1.forward(X_test)
activation_relu_1.forward(layer1.output)
dropout1.forward(activation_relu_1.output, training=False)

layer2.forward(dropout1.output)
activation_relu_2.forward(layer2.output)
dropout2.forward(activation_relu_2.output, training=False)

layer3.forward(dropout2.output)

# forward() returns the mean cross-entropy loss and leaves the softmax
# probabilities in loss_activation.output.
test_loss = loss_activation.forward(layer3.output, y_test)
# Legacy alias: this name has always held the loss value, not predictions.
test_predictions = test_loss
# Score the softmax output, matching the training and validation accuracy.
test_accuracy = accuracy(loss_activation.output, y_test)

print(f'Test accuracy: {test_accuracy:.4f}')

# Plot results: two per-epoch curve panels and one bar chart, side by side
plt.figure(figsize=(15, 5))

# (panel index, train series, train label, val series, val label, title, y-axis label)
curve_panels = [
    (1, train_losses, 'Train Loss', val_losses, 'Val Loss', 'Loss Over Time', 'Loss'),
    (2, train_accuracies, 'Train Accuracy', val_accuracies, 'Val Accuracy', 'Accuracy Over Time', 'Accuracy'),
]
for panel, train_series, train_label, val_series, val_label, panel_title, y_label in curve_panels:
    plt.subplot(1, 3, panel)
    plt.plot(train_series, label=train_label)
    plt.plot(val_series, label=val_label)
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.grid(True)

# Bar chart: last recorded train accuracy, best validation accuracy, test accuracy
plt.subplot(1, 3, 3)
if len(train_accuracies) > 0:
    bar_names = ['Train', 'Val', 'Test']
    bar_heights = [train_accuracies[-1], best_val_accuracy, test_accuracy]
    plt.bar(bar_names, bar_heights)
plt.title('Final Accuracies')
plt.ylabel('Accuracy')
plt.ylim(0, 1)

plt.tight_layout()
plt.show()

# Text summary of the headline numbers
print(f"\nFinal Results:")
print(f"Best Validation Accuracy: {best_val_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")