In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.datasets import mnist  
import math
from numpy import random

def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, applied element-wise.

    ``x`` must support unary minus (a scalar or a NumPy array); the result
    has the same shape as ``x``.
    """
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

def derivative_sigmoid(x):
    """Derivative of the logistic sigmoid, evaluated element-wise.

    Mathematically this is ``exp(x) / (1 + exp(x))**2``. That expression is an
    even function of ``x``, so it is evaluated as
    ``exp(-|x|) / (1 + exp(-|x|))**2``: the exponent is never positive, which
    avoids the ``inf / inf = nan`` the direct formula produces once ``exp(x)``
    overflows for large positive inputs.

    Parameters
    ----------
    x : scalar, sequence or ndarray
        Pre-activation value(s).

    Returns
    -------
    NumPy float or ndarray with the same shape as ``x``.
    """
    decay = np.exp(-np.abs(np.asarray(x, dtype=float)))
    return decay / (1 + decay) ** 2

def softmax(x):
    """Numerically stable softmax along the first axis.

    The maximum along axis 0 is subtracted before exponentiating. This leaves
    the result unchanged mathematically but keeps ``exp`` from overflowing to
    ``inf`` (and the ratio from becoming ``nan``) for large scores.

    Parameters
    ----------
    x : sequence or ndarray
        Scores. For 1-D input the result sums to 1; for N-D input the
        normalisation runs over axis 0.

    Returns
    -------
    ndarray of floats with the same shape as ``x``. Empty input yields an
    empty array.
    """
    scores = np.asarray(x, dtype=float)
    if scores.size == 0:
        return scores
    shifted = scores - np.max(scores, axis=0)
    weights = np.exp(shifted)
    return weights / np.sum(weights, axis=0)
def derivative_softmax(x):
    """Element-wise ``s * (1 - s)`` where ``s = softmax(x)``.

    This is the diagonal of the softmax Jacobian only; the off-diagonal
    cross terms are not computed.
    """
    probs = softmax(x)
    complement = [1] * len(x) - probs
    return np.multiply(probs, complement)

def ReLu(x):
    """Rectified linear unit: ``max(x, 0)`` element-wise.

    Unlike the previous loop-based version, this does not overwrite the
    caller's data: a new array is returned and ``x`` is left untouched, which
    matches how ``LeakyReLu`` behaves. Works for input of any shape.

    Parameters
    ----------
    x : array-like
        Pre-activation values.

    Returns
    -------
    ndarray with the same shape and dtype as ``np.asarray(x)``.
    """
    values = np.asarray(x)
    return np.where(values > 0, values, np.zeros_like(values))
def derivative_ReLu(x):
    """Derivative of ReLU: 0 where ``x <= 0``, 1 elsewhere, element-wise.

    The previous version wrote the 0/1 mask into ``x`` itself, destroying the
    pre-activations the caller passed in. This version leaves ``x`` untouched
    and returns a new array. Works for input of any shape.

    Parameters
    ----------
    x : array-like
        Pre-activation values.

    Returns
    -------
    ndarray of 0s and 1s with the same shape and dtype as ``np.asarray(x)``.
    """
    values = np.asarray(x)
    return np.where(values <= 0, 0, 1).astype(values.dtype)

def LeakyReLu(x, a):
    """Leaky ReLU: ``x`` where ``x > 0``, ``a * x`` elsewhere, element-wise.

    Vectorised, so it accepts input of any shape. The result is always
    floating point: the old ``np.zeros_like(x)`` buffer inherited an integer
    dtype from integer input and silently truncated ``a * x``.

    Parameters
    ----------
    x : array-like
        Pre-activation values (e.g. a list of per-sample vectors).
    a : float
        Slope applied to non-positive entries.

    Returns
    -------
    ndarray with the shape of ``x``; the float dtype of ``x`` is kept,
    any other dtype is promoted to float64.
    """
    values = np.asarray(x)
    out_dtype = values.dtype if np.issubdtype(values.dtype, np.floating) else np.float64
    return np.where(values > 0, values, a * values).astype(out_dtype)

def derivative_LeakyReLu(x, a):
    """Derivative of leaky ReLU: ``a`` where ``x <= 0``, 1 elsewhere.

    Vectorised, so it accepts input of any shape. The result is always
    floating point: the old ``np.zeros_like(x)`` buffer inherited an integer
    dtype from integer input and truncated the slope ``a`` to 0.

    Parameters
    ----------
    x : array-like
        Pre-activation values.
    a : float
        Slope of the non-positive branch.

    Returns
    -------
    ndarray with the shape of ``x``; the float dtype of ``x`` is kept,
    any other dtype is promoted to float64.
    """
    values = np.asarray(x)
    out_dtype = values.dtype if np.issubdtype(values.dtype, np.floating) else np.float64
    return np.where(values <= 0, a, 1.0).astype(out_dtype)

def Batch_Normalization(activation_pre, theta, beta):
    """Training-mode batch normalisation over the first (batch) axis.

    Each feature is centred by the batch mean and divided by
    ``sqrt(variance + 10e-8)`` (note: ``10e-8`` is ``1e-7``), then scaled by
    ``theta`` and shifted by ``beta``.

    Returns
    -------
    tuple ``(activation_post, activation_hat, mu, sigma_2)``: the scaled and
    shifted output, the normalised activations, and the per-feature batch
    mean and variance.
    """
    batch = np.asarray(activation_pre)
    mu = np.mean(batch, axis=0)
    sigma_2 = np.var(batch, axis=0)

    # Broadcasting applies the per-feature statistics to every sample at once.
    activation_hat = (batch - mu) / np.sqrt(sigma_2 + 10e-8)
    activation_post = np.multiply(theta, activation_hat) + beta
    return (activation_post, activation_hat, mu, sigma_2)

def Batch_Normalization_Derivative(dL_activation_post, activation_hat, activation_pre, mu, sigma_2, theta, beta, getLinearDerivatives):
    """Back-propagate a loss gradient through a training-mode batch-norm layer.

    Fixes relative to the previous version:
    * the theta gradient is now actually returned (``dL_sigma_2`` used to be
      returned in its place, so callers updated theta with the wrong value);
    * the batch size comes from ``dL_activation_post`` rather than the
      module-level ``Batch_Size`` global;
    * the two duplicated branches share one vectorised computation.

    Parameters
    ----------
    dL_activation_post : array-like, (batch, features)
        Gradient of the loss w.r.t. the batch-norm output.
    activation_hat : array-like, (batch, features)
        Normalised activations from the forward pass.
    activation_pre : array-like, (batch, features)
        Batch-norm input from the forward pass.
    mu, sigma_2 : array-like, (features,)
        Batch mean and variance from the forward pass.
    theta : array-like, (features,)
        Batch-norm scale.
    beta : array-like
        Batch-norm shift; not needed for the gradients, kept for signature
        compatibility.
    getLinearDerivatives : bool
        When truthy, the theta and beta gradients are returned as well.

    Returns
    -------
    ``(dL_activation_pre, dL_theta, dL_beta)`` when ``getLinearDerivatives``
    is truthy, otherwise just ``dL_activation_pre``. ``dL_activation_pre`` is
    a list with one gradient vector per sample, as before.
    """
    grad_post = np.asarray(dL_activation_post, dtype=float)
    centered = np.asarray(activation_pre, dtype=float) - mu
    batch_size = len(grad_post)
    std = np.sqrt(sigma_2 + 10e-8)  # same epsilon as the forward pass

    dL_activation_hat = grad_post * theta
    dL_mu = np.sum(dL_activation_hat, axis=0) * (-1 / std)
    dL_sigma_2 = np.sum(dL_activation_hat * centered, axis=0) * (-0.5 / std ** 3)

    # Three paths from the loss to each input: direct, via the mean, via the variance.
    dL_activation_pre = list(
        dL_activation_hat / std
        + dL_mu / batch_size
        + dL_sigma_2 * (2 / batch_size) * centered
    )

    if getLinearDerivatives:
        dL_theta = np.sum(grad_post * np.asarray(activation_hat, dtype=float), axis=0)
        dL_beta = np.sum(grad_post, axis=0)
        return (dL_activation_pre, dL_theta, dL_beta)
    return dL_activation_pre

def test_Batch_Normalization(activation_pre, true_mu, true_sigma_2, theta, beta):
    activation_post = []
    for i in range(len(activation_pre)):
        activation_post.append(np.multiply(theta, (activation_pre[i] - true_mu)/np.sqrt(true_sigma_2 + 10e-8)) + beta)
    return activation_post 

def Spectral_Normalization(matrix):
    """Scale ``matrix`` so that its largest singular value becomes 1.

    Only the singular values are requested from the SVD
    (``compute_uv=False``); the U and Vh factors were computed and discarded
    before. A matrix whose largest singular value is 0 (all zeros) is
    returned unscaled instead of turning into NaNs through 0/0.

    Parameters
    ----------
    matrix : array-like, 2-D
        Matrix to normalise.

    Returns
    -------
    ndarray with the same shape as ``matrix``.
    """
    weights = np.asarray(matrix)
    # Singular values are returned in descending order, so index 0 is the spectral norm.
    spectral_norm = np.linalg.svd(weights, compute_uv=False)[0]
    scale = spectral_norm if np.any(spectral_norm) else 1.0
    return weights / scale

# Load MNIST and keep only the images labelled 5 (train split first, then test split).
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

Batch_Size = 48

# Boolean masks pick the 5s from each split; the first 6000 are kept so that they
# divide evenly into batches. Assumes at least 6000 fives exist across both splits,
# otherwise the reshape below fails.
Generation_Training_set = np.concatenate((X_train[Y_train == 5], X_test[Y_test == 5]))[0:6000]

# Final layout: (number of batches, Batch_Size, 784 flattened pixels).
Generation_Training_set = Generation_Training_set.reshape(6000 // Batch_Size, Batch_Size, 784)

##Generator Weights/Biases here
# Dense weights are stored as (outputs, inputs) and drawn from a zero-mean normal with
# standard deviation sqrt(2 / inputs) (He-style initialisation). Batch-norm scales
# (theta) start at 1 and shifts (beta) at 0. Layer widths: 100 -> 157 -> 314 -> 471 -> 628 -> 784.

Weights_0_to_1_G = np.random.normal(0, math.sqrt(2/100), size=(157, 100))
theta_1_G, beta_1_G = np.ones(157), np.zeros(157)

Weights_1_to_2_G = np.random.normal(0, math.sqrt(2/157), size=(314, 157))
theta_2_G, beta_2_G = np.ones(314), np.zeros(314)

Weights_2_to_3_G = np.random.normal(0, math.sqrt(2/314), size=(471, 314))
theta_3_G, beta_3_G = np.ones(471), np.zeros(471)

Weights_3_to_4_G = np.random.normal(0, math.sqrt(2/471), size=(628, 471))
theta_4_G, beta_4_G = np.ones(628), np.zeros(628)

# The output layer has a plain bias instead of batch-norm parameters.
Weights_4_to_5_G = np.random.normal(0, math.sqrt(2/628), size=(784, 628))
bias_5_G = np.random.normal(0, math.sqrt(2/784), size=784)

##Discriminator Weights/Biases here
# Mirror image of the generator: 784 -> 628 -> 471 -> 314 -> 157 -> 1.

Weights_0_to_1_D = np.random.normal(0, math.sqrt(2/784), size=(628, 784))
theta_1_D, beta_1_D = np.ones(628), np.zeros(628)

Weights_1_to_2_D = np.random.normal(0, math.sqrt(2/628), size=(471, 628))
theta_2_D, beta_2_D = np.ones(471), np.zeros(471)

Weights_2_to_3_D = np.random.normal(0, math.sqrt(2/471), size=(314, 471))
theta_3_D, beta_3_D = np.ones(314), np.zeros(314)

Weights_3_to_4_D = np.random.normal(0, math.sqrt(2/314), size=(157, 314))
theta_4_D, beta_4_D = np.ones(157), np.zeros(157)

# Single output unit: the weight is a 157-vector and the bias a scalar.
Weights_4_to_5_D = np.random.normal(0, math.sqrt(2/157), size=157)
bias_5_D = np.random.normal(0, math.sqrt(2/157))

##Adam matrices for Generator
# Every trainable parameter gets a pair of zero-initialised accumulators shaped like the
# parameter: *_m_matrix (running average of the gradient) and *_v_matrix (running average
# of the squared gradient), as consumed by the update step in the training loop.

def _zero_moments(parameter):
    """Return two independent all-zero arrays shaped like ``parameter``."""
    return np.zeros_like(parameter), np.zeros_like(parameter)

Weights_0_to_1_G_m_matrix, Weights_0_to_1_G_v_matrix = _zero_moments(Weights_0_to_1_G)
theta_1_G_m_matrix, theta_1_G_v_matrix = _zero_moments(theta_1_G)
beta_1_G_m_matrix, beta_1_G_v_matrix = _zero_moments(beta_1_G)

Weights_1_to_2_G_m_matrix, Weights_1_to_2_G_v_matrix = _zero_moments(Weights_1_to_2_G)
theta_2_G_m_matrix, theta_2_G_v_matrix = _zero_moments(theta_2_G)
beta_2_G_m_matrix, beta_2_G_v_matrix = _zero_moments(beta_2_G)

Weights_2_to_3_G_m_matrix, Weights_2_to_3_G_v_matrix = _zero_moments(Weights_2_to_3_G)
theta_3_G_m_matrix, theta_3_G_v_matrix = _zero_moments(theta_3_G)
beta_3_G_m_matrix, beta_3_G_v_matrix = _zero_moments(beta_3_G)

Weights_3_to_4_G_m_matrix, Weights_3_to_4_G_v_matrix = _zero_moments(Weights_3_to_4_G)
theta_4_G_m_matrix, theta_4_G_v_matrix = _zero_moments(theta_4_G)
beta_4_G_m_matrix, beta_4_G_v_matrix = _zero_moments(beta_4_G)

Weights_4_to_5_G_m_matrix, Weights_4_to_5_G_v_matrix = _zero_moments(Weights_4_to_5_G)
bias_5_G_m_matrix, bias_5_G_v_matrix = _zero_moments(bias_5_G)

##Adam matrices for Discriminator

Weights_0_to_1_D_m_matrix, Weights_0_to_1_D_v_matrix = _zero_moments(Weights_0_to_1_D)
theta_1_D_m_matrix, theta_1_D_v_matrix = _zero_moments(theta_1_D)
beta_1_D_m_matrix, beta_1_D_v_matrix = _zero_moments(beta_1_D)

Weights_1_to_2_D_m_matrix, Weights_1_to_2_D_v_matrix = _zero_moments(Weights_1_to_2_D)
theta_2_D_m_matrix, theta_2_D_v_matrix = _zero_moments(theta_2_D)
beta_2_D_m_matrix, beta_2_D_v_matrix = _zero_moments(beta_2_D)

Weights_2_to_3_D_m_matrix, Weights_2_to_3_D_v_matrix = _zero_moments(Weights_2_to_3_D)
theta_3_D_m_matrix, theta_3_D_v_matrix = _zero_moments(theta_3_D)
beta_3_D_m_matrix, beta_3_D_v_matrix = _zero_moments(beta_3_D)

Weights_3_to_4_D_m_matrix, Weights_3_to_4_D_v_matrix = _zero_moments(Weights_3_to_4_D)
theta_4_D_m_matrix, theta_4_D_v_matrix = _zero_moments(theta_4_D)
beta_4_D_m_matrix, beta_4_D_v_matrix = _zero_moments(beta_4_D)

Weights_4_to_5_D_m_matrix, Weights_4_to_5_D_v_matrix = _zero_moments(Weights_4_to_5_D)
bias_5_D_m_matrix, bias_5_D_v_matrix = _zero_moments(bias_5_D)

# Running batch-norm statistics, one (mean, variance) pair per normalised layer.
# They start at zero and are blended with each batch's statistics inside the training loop.

# Generator layers 1-4.
mu_1_G_true, sigma_2_1_G_true = np.zeros(157), np.zeros(157)
mu_2_G_true, sigma_2_2_G_true = np.zeros(314), np.zeros(314)
mu_3_G_true, sigma_2_3_G_true = np.zeros(471), np.zeros(471)
mu_4_G_true, sigma_2_4_G_true = np.zeros(628), np.zeros(628)

# Discriminator layers 1-4, tracked on the generated-image pass.
mu_1_D_G_true, sigma_2_1_D_G_true = np.zeros(628), np.zeros(628)
mu_2_D_G_true, sigma_2_2_D_G_true = np.zeros(471), np.zeros(471)
mu_3_D_G_true, sigma_2_3_D_G_true = np.zeros(314), np.zeros(314)
mu_4_D_G_true, sigma_2_4_D_G_true = np.zeros(157), np.zeros(157)

for e in range(0, 16):
    
    Generation_Training_set = Generation_Training_set.reshape(6000, 784)
    np.random.shuffle(Generation_Training_set)
    Generation_Training_set = Generation_Training_set.reshape(int(6000/Batch_Size), Batch_Size, 784)

    for r in range(1, len(Generation_Training_set) + 1):

        latent_space_random_vector = np.random.normal(0, 1, 100*Batch_Size).reshape(Batch_Size, 100)

        ##We generate our 5's here

        z_1_G_pre = [] 
        for i in range(Batch_Size):
            z_1_G_pre.append(np.matmul(Weights_0_to_1_G, latent_space_random_vector[i]))
        activation_1_G_pre = LeakyReLu(z_1_G_pre, 0.2)
        (activation_1_G_post, activation_1_G_hat, mu_1_G, sigma_2_1_G) = Batch_Normalization(activation_1_G_pre, theta_1_G, beta_1_G)

        mu_1_G_true = 0.1 * mu_1_G + (1-0.1) * mu_1_G_true
        sigma_2_1_G_true = 0.1 * sigma_2_1_G + (1-0.1) * sigma_2_1_G_true

        z_2_G_pre = []
        for j in range(Batch_Size):
            z_2_G_pre.append(np.matmul(Weights_1_to_2_G, activation_1_G_post[j]))
        activation_2_G_pre = LeakyReLu(z_2_G_pre, 0.2)
        (activation_2_G_post, activation_2_G_hat, mu_2_G, sigma_2_2_G) = Batch_Normalization(activation_2_G_pre, theta_2_G, beta_2_G)

        mu_2_G_true = 0.1 * mu_2_G + (1-0.1) * mu_2_G_true
        sigma_2_2_G_true = 0.1 * sigma_2_2_G + (1-0.1) * sigma_2_2_G_true

        z_3_G_pre = []
        for k in range(Batch_Size):
            z_3_G_pre.append(np.matmul(Weights_2_to_3_G, activation_2_G_post[k]))
        activation_3_G_pre = LeakyReLu(z_3_G_pre, 0.2)
        (activation_3_G_post, activation_3_G_hat, mu_3_G, sigma_2_3_G) = Batch_Normalization(activation_3_G_pre, theta_3_G, beta_3_G)

        mu_3_G_true = 0.1 * mu_3_G + (1-0.1) * mu_3_G_true
        sigma_2_3_G_true = 0.1 * sigma_2_3_G + (1-0.1) * sigma_2_3_G_true

        z_4_G_pre = []
        for k in range(Batch_Size):
            z_4_G_pre.append(np.matmul(Weights_3_to_4_G, activation_3_G_post[k]))
        activation_4_G_pre = LeakyReLu(z_4_G_pre, 0.2)
        (activation_4_G_post, activation_4_G_hat, mu_4_G, sigma_2_4_G) = Batch_Normalization(activation_4_G_pre, theta_4_G, beta_4_G)

        mu_4_G_true = 0.1 * mu_4_G + (1-0.1) * mu_4_G_true
        sigma_2_4_G_true = 0.1 * sigma_2_4_G + (1-0.1) * sigma_2_4_G_true

        z_5_G = []
        for l in range(Batch_Size):
            z_5_G.append(np.matmul(Weights_4_to_5_G, activation_4_G_post[l]) + bias_5_G)
        activation_5_G = np.tanh(z_5_G) 

        noise = np.random.normal(0, 1 - (r-1)/(len(Generation_Training_set) - 1), 784*Batch_Size).reshape(Batch_Size, 784)
        bruh = np.random.binomial(1, 1/2, 1)

        if(bruh == 0):
            activation_5_G = 128.5 * (activation_5_G + np.ones_like(activation_5_G)) - np.ones_like(activation_5_G) + noise
            activation_5_G = (activation_5_G + np.ones_like(activation_5_G))/128.5 - np.ones_like(activation_5_G)  
        else:
            Transformed_Generation_Training_Set = (Generation_Training_set[r-1] + noise + np.ones_like(Generation_Training_set[r-1]))/128.5 - np.ones_like(Generation_Training_set[r-1])

        ##We feed our Generated 5's through the discriminator here 

        z_1_D_G_pre = []
        for i in range(Batch_Size):
            z_1_D_G_pre.append(np.matmul(Weights_0_to_1_D, activation_5_G[i]))
        activation_1_D_G_pre = LeakyReLu(z_1_D_G_pre, 0.2)
        (activation_1_D_G_post, activation_1_D_G_hat, mu_1_D_G, sigma_2_1_D_G) = Batch_Normalization(activation_1_D_G_pre, theta_1_D, beta_1_D)

        mu_1_D_G_true = 0.1 * mu_1_D_G + (1-0.1) * mu_1_D_G_true
        sigma_2_1_D_G_true = 0.1 * sigma_2_1_D_G + (1-0.1) * sigma_2_1_D_G_true

        z_2_D_G_pre = []
        for j in range(Batch_Size):
            z_2_D_G_pre.append(np.matmul(Weights_1_to_2_D, activation_1_D_G_post[j]))
        activation_2_D_G_pre = LeakyReLu(z_2_D_G_pre, 0.2)
        (activation_2_D_G_post, activation_2_D_G_hat, mu_2_D_G, sigma_2_2_D_G) = Batch_Normalization(activation_2_D_G_pre, theta_2_D, beta_2_D)

        mu_2_D_G_true = 0.1 * mu_2_D_G + (1-0.1) * mu_2_D_G_true
        sigma_2_2_D_G_true = 0.1 * sigma_2_2_D_G + (1-0.1) * sigma_2_2_D_G_true

        z_3_D_G_pre = []
        for k in range(Batch_Size):
            z_3_D_G_pre.append(np.matmul(Weights_2_to_3_D, activation_2_D_G_post[k]))
        activation_3_D_G_pre = LeakyReLu(z_3_D_G_pre, 0.2)
        (activation_3_D_G_post, activation_3_D_G_hat, mu_3_D_G, sigma_2_3_D_G) = Batch_Normalization(activation_3_D_G_pre, theta_3_D, beta_3_D)

        mu_3_D_G_true = 0.1 * mu_3_D_G + (1-0.1) * mu_3_D_G_true
        sigma_2_3_D_G_true = 0.1 * sigma_2_3_D_G + (1-0.1) * sigma_2_3_D_G_true  

        z_4_D_G_pre = []
        for k in range(Batch_Size):
            z_4_D_G_pre.append(np.matmul(Weights_3_to_4_D, activation_3_D_G_post[k]))
        activation_4_D_G_pre = LeakyReLu(z_4_D_G_pre, 0.2)
        (activation_4_D_G_post, activation_4_D_G_hat, mu_4_D_G, sigma_2_4_D_G) = Batch_Normalization(activation_4_D_G_pre, theta_4_D, beta_4_D)

        mu_4_D_G_true = 0.1 * mu_4_D_G + (1-0.1) * mu_4_D_G_true
        sigma_2_4_D_G_true = 0.1 * sigma_2_4_D_G + (1-0.1) * sigma_2_4_D_G_true  

        z_5_D_G = []
        for l in range(Batch_Size):
            z_5_D_G.append(np.dot(Weights_4_to_5_D, activation_4_D_G_post[l]) + bias_5_D)
        activation_5_D_G = sigmoid(np.array(z_5_D_G))

        ##We feed our sampled 5's through the discriminator here 

        z_1_D_S_pre = []
        for i in range(Batch_Size):
            z_1_D_S_pre.append(np.matmul(Weights_0_to_1_D, Transformed_Generation_Training_Set[i]))
        activation_1_D_S_pre = LeakyReLu(z_1_D_S_pre, 0.2)
        (activation_1_D_S_post, activation_1_D_S_hat, mu_1_D_S, sigma_2_1_D_S) = Batch_Normalization(activation_1_D_S_pre, theta_1_D, beta_1_D)

        z_2_D_S_pre = []
        for j in range(Batch_Size):
            z_2_D_S_pre.append(np.matmul(Weights_1_to_2_D, activation_1_D_S_post[j]))
        activation_2_D_S_pre = LeakyReLu(z_2_D_S_pre, 0.2)
        (activation_2_D_S_post, activation_2_D_S_hat, mu_2_D_S, sigma_2_2_D_S) = Batch_Normalization(activation_2_D_S_pre, theta_2_D, beta_2_D)

        z_3_D_S_pre = []
        for k in range(Batch_Size):
            z_3_D_S_pre.append(np.matmul(Weights_2_to_3_D, activation_2_D_S_post[k]))
        activation_3_D_S_pre = LeakyReLu(z_3_D_S_pre, 0.2)
        (activation_3_D_S_post, activation_3_D_S_hat, mu_3_D_S, sigma_2_3_D_S) = Batch_Normalization(activation_3_D_S_pre, theta_3_D, beta_3_D)

        z_4_D_S_pre = []
        for k in range(Batch_Size):
            z_4_D_S_pre.append(np.matmul(Weights_3_to_4_D, activation_3_D_S_post[k]))
        activation_4_D_S_pre = LeakyReLu(z_4_D_S_pre, 0.2)
        (activation_4_D_S_post, activation_4_D_S_hat, mu_4_D_S, sigma_2_4_D_S) = Batch_Normalization(activation_4_D_S_pre, theta_4_D, beta_4_D)

        z_5_D_S = []
        for l in range(Batch_Size):
            z_5_D_S.append(np.dot(Weights_4_to_5_D, activation_4_D_S_post[l]) + bias_5_D)
        activation_5_D_S = sigmoid(np.array(z_5_D_S))

        ##We start Backpropagating through the Discriminator's Jensen-Shannon Divergence here 

        dJ_D_activation_5_D_S = [0.0] * Batch_Size 
        dJ_D_activation_5_D_G = [0.0] * Batch_Size 

        for i in range(Batch_Size):
            dJ_D_activation_5_D_S[i] = -1/(Batch_Size * activation_5_D_S[i])
            dJ_D_activation_5_D_G[i] = 1/(Batch_Size * (1-activation_5_D_G[i]))

        delta_5_D_S = np.multiply(dJ_D_activation_5_D_S, derivative_sigmoid(z_5_D_S))
        delta_5_D_G = np.multiply(dJ_D_activation_5_D_G, derivative_sigmoid(z_5_D_G))

        dJ_D_Weights_4_to_5_D = []

        for j in range(len(Weights_4_to_5_D)):
            dJ_D_Weights_4_to_5_D.append(np.dot(delta_5_D_S, activation_4_D_S_post[:,j]) + np.dot(delta_5_D_G, activation_4_D_G_post[:,j]))

        dJ_D_bias_4_to_5_D = np.sum(delta_5_D_G) + np.sum(delta_5_D_S)

        dJ_D_activation_4_D_S_post = np.matmul(delta_5_D_S.reshape(Batch_Size, 1), Weights_4_to_5_D.reshape(157, 1).T)
        dJ_D_activation_4_D_G_post = np.matmul(delta_5_D_G.reshape(Batch_Size, 1), Weights_4_to_5_D.reshape(157, 1).T)

        (dJ_D_activation_4_D_S_pre, dJ_D_theta_4_D_in_S, dJ_D_beta_4_D_in_S) = Batch_Normalization_Derivative(dJ_D_activation_4_D_S_post, activation_4_D_S_hat, activation_4_D_S_pre, mu_4_D_S, sigma_2_4_D_S, theta_4_D, beta_4_D, True)
        (dJ_D_activation_4_D_G_pre, dJ_D_theta_4_D_in_G, dJ_D_beta_4_D_in_G) = Batch_Normalization_Derivative(dJ_D_activation_4_D_G_post, activation_4_D_G_hat, activation_4_D_G_pre, mu_4_D_G, sigma_2_4_D_G, theta_4_D, beta_4_D, True)

        delta_4_D_S = np.multiply(dJ_D_activation_4_D_S_pre, derivative_LeakyReLu(z_4_D_S_pre, 0.2))
        delta_4_D_G = np.multiply(dJ_D_activation_4_D_G_pre, derivative_LeakyReLu(z_4_D_G_pre, 0.2))

        dJ_D_Weights_3_to_4_D = np.zeros_like(Weights_3_to_4_D)

        for j in range(len(Weights_3_to_4_D)):
            for k in range(len(Weights_3_to_4_D[j])):
                dJ_D_Weights_3_to_4_D[j][k] = np.dot(delta_4_D_S[:,j], activation_3_D_S_post[:,k]) + np.dot(delta_4_D_G[:,j], activation_3_D_G_post[:,k])

        dJ_D_activation_3_D_S_post = np.zeros_like(activation_3_D_S_post)

        for i in range(Batch_Size):
            for k in range(len(activation_3_D_S_post[i])):
                dJ_D_activation_3_D_S_post[i][k] = np.dot(delta_4_D_S[i], Weights_3_to_4_D[:,k])

        dJ_D_activation_3_D_G_post = np.zeros_like(activation_3_D_G_post)

        for i in range(Batch_Size):
            for k in range(len(activation_3_D_G_post[i])):
                dJ_D_activation_3_D_G_post[i][k] = np.dot(delta_4_D_G[i], Weights_3_to_4_D[:,k])

        (dJ_D_activation_3_D_S_pre, dJ_D_theta_3_D_in_S, dJ_D_beta_3_D_in_S) = Batch_Normalization_Derivative(dJ_D_activation_3_D_S_post, activation_3_D_S_hat, activation_3_D_S_pre, mu_3_D_S, sigma_2_3_D_S, theta_3_D, beta_3_D, True)
        (dJ_D_activation_3_D_G_pre, dJ_D_theta_3_D_in_G, dJ_D_beta_3_D_in_G) = Batch_Normalization_Derivative(dJ_D_activation_3_D_G_post, activation_3_D_G_hat, activation_3_D_G_pre, mu_3_D_G, sigma_2_3_D_G, theta_3_D, beta_3_D, True)
        
        delta_3_D_S = np.multiply(dJ_D_activation_3_D_S_pre, derivative_LeakyReLu(z_3_D_S_pre, 0.2))
        delta_3_D_G = np.multiply(dJ_D_activation_3_D_G_pre, derivative_LeakyReLu(z_3_D_G_pre, 0.2))

        dJ_D_Weights_2_to_3_D = np.zeros_like(Weights_2_to_3_D)

        for j in range(len(Weights_2_to_3_D)):
            for k in range(len(Weights_2_to_3_D[j])):
                dJ_D_Weights_2_to_3_D[j][k] = np.dot(delta_3_D_S[:,j], activation_2_D_S_post[:,k]) + np.dot(delta_3_D_G[:,j], activation_2_D_G_post[:,k])

        dJ_D_activation_2_D_S_post = np.zeros_like(activation_2_D_S_post)

        for i in range(Batch_Size):
            for k in range(len(activation_2_D_S_post[i])):
                dJ_D_activation_2_D_S_post[i][k] = np.dot(delta_3_D_S[i], Weights_2_to_3_D[:,k])

        dJ_D_activation_2_D_G_post = np.zeros_like(activation_2_D_G_post)

        for i in range(Batch_Size):
            for k in range(len(activation_2_D_G_post[i])):
                dJ_D_activation_2_D_G_post[i][k] = np.dot(delta_3_D_G[i], Weights_2_to_3_D[:,k])

        (dJ_D_activation_2_D_S_pre, dJ_D_theta_2_D_in_S, dJ_D_beta_2_D_in_S) = Batch_Normalization_Derivative(dJ_D_activation_2_D_S_post, activation_2_D_S_hat, activation_2_D_S_pre, mu_2_D_S, sigma_2_2_D_S, theta_2_D, beta_2_D, True)
        (dJ_D_activation_2_D_G_pre, dJ_D_theta_2_D_in_G, dJ_D_beta_2_D_in_G) = Batch_Normalization_Derivative(dJ_D_activation_2_D_G_post, activation_2_D_G_hat, activation_2_D_G_pre, mu_2_D_G, sigma_2_2_D_G, theta_2_D, beta_2_D, True)

        delta_2_D_S = np.multiply(dJ_D_activation_2_D_S_pre, derivative_LeakyReLu(z_2_D_S_pre, 0.2))
        delta_2_D_G = np.multiply(dJ_D_activation_2_D_G_pre, derivative_LeakyReLu(z_2_D_G_pre, 0.2))

        dJ_D_Weights_1_to_2_D = np.zeros_like(Weights_1_to_2_D)

        for j in range(len(Weights_1_to_2_D)):
            for k in range(len(Weights_1_to_2_D[j])):
                dJ_D_Weights_1_to_2_D[j][k] = np.dot(delta_2_D_S[:,j], activation_1_D_S_post[:,k]) + np.dot(delta_2_D_G[:,j], activation_1_D_G_post[:,k])

        dJ_D_activation_1_D_S_post = np.zeros_like(activation_1_D_S_post)

        for i in range(Batch_Size):
            for k in range(len(activation_1_D_S_post[i])):
                dJ_D_activation_1_D_S_post[i][k] = np.dot(delta_2_D_S[i], Weights_1_to_2_D[:,k])

        dJ_D_activation_1_D_G_post = np.zeros_like(activation_1_D_G_post)

        for i in range(Batch_Size):
            for k in range(len(activation_1_D_G_post[i])):
                dJ_D_activation_1_D_G_post[i][k] = np.dot(delta_2_D_G[i], Weights_1_to_2_D[:,k])

        (dJ_D_activation_1_D_S_pre, dJ_D_theta_1_D_in_S, dJ_D_beta_1_D_in_S) = Batch_Normalization_Derivative(dJ_D_activation_1_D_S_post, activation_1_D_S_hat, activation_1_D_S_pre, mu_1_D_S, sigma_2_1_D_S, theta_1_D, beta_1_D, True)
        (dJ_D_activation_1_D_G_pre, dJ_D_theta_1_D_in_G, dJ_D_beta_1_D_in_G) = Batch_Normalization_Derivative(dJ_D_activation_1_D_G_post, activation_1_D_G_hat, activation_1_D_G_pre, mu_1_D_G, sigma_2_1_D_G, theta_1_D, beta_1_D, True)

        delta_1_D_S = np.multiply(dJ_D_activation_1_D_S_pre, derivative_LeakyReLu(z_1_D_S_pre, 0.2))
        delta_1_D_G = np.multiply(dJ_D_activation_1_D_G_pre, derivative_LeakyReLu(z_1_D_G_pre, 0.2))

        dJ_D_Weights_0_to_1_D = np.zeros_like(Weights_0_to_1_D)

        for j in range(len(Weights_0_to_1_D)):
            for k in range(len(Weights_0_to_1_D[j])):
                dJ_D_Weights_0_to_1_D[j][k] = np.dot(delta_1_D_G[:,j], np.array(activation_5_G)[:,k]) + np.dot(delta_1_D_S[:,j], np.array(Transformed_Generation_Training_Set)[:,k])

        ##If we do it, backpropagation through the R1 Regularization term will happen here

        ##OK, this is the backpropagation of the Generator's Jensen Shannon Divergence through the discriminator

        dJ_G_activation_5_D_G = [0.0] * Batch_Size

        for i in range(Batch_Size):
            dJ_G_activation_5_D_G[i] = -1/Batch_Size * 1/activation_5_D_G[i] 

        gamma_5_D = np.multiply(dJ_G_activation_5_D_G, derivative_sigmoid(z_5_D_G))

        dJ_G_activation_4_D_G_post = np.matmul(gamma_5_D.reshape(Batch_Size, 1), Weights_4_to_5_D.reshape(157, 1).T)

        dJ_G_activation_4_D_G_pre = Batch_Normalization_Derivative(dJ_G_activation_4_D_G_post, activation_4_D_G_hat, activation_4_D_G_pre, mu_4_D_G, sigma_2_4_D_G, theta_4_D, beta_4_D, False)

        gamma_4_D = np.multiply(dJ_G_activation_4_D_G_pre, derivative_LeakyReLu(z_4_D_G_pre, 0.2))

        dJ_G_activation_3_D_G_post = np.zeros_like(activation_3_D_G_post)

        for i in range(len(activation_3_D_G_post)):
            for k in range(len(activation_3_D_G_post[i])):
                dJ_G_activation_3_D_G_post[i][k] = np.dot(gamma_4_D[i], Weights_3_to_4_D[:,k])

        dJ_G_activation_3_D_G_pre = Batch_Normalization_Derivative(dJ_G_activation_3_D_G_post, activation_3_D_G_hat, activation_3_D_G_pre, mu_3_D_G, sigma_2_3_D_G, theta_3_D, beta_3_D, False)

        gamma_3_D = np.multiply(dJ_G_activation_3_D_G_pre, derivative_LeakyReLu(z_3_D_G_pre, 0.2))

        dJ_G_activation_2_D_G_post = np.zeros_like(activation_2_D_G_post)

        for i in range(len(activation_2_D_G_post)):
            for k in range(len(activation_2_D_G_post[i])):
                dJ_G_activation_2_D_G_post[i][k] = np.dot(gamma_3_D[i], Weights_2_to_3_D[:,k])

        dJ_G_activation_2_D_G_pre = Batch_Normalization_Derivative(dJ_G_activation_2_D_G_post, activation_2_D_G_hat, activation_2_D_G_pre, mu_2_D_G, sigma_2_2_D_G, theta_2_D, beta_2_D, False)

        gamma_2_D = np.multiply(dJ_G_activation_2_D_G_pre, derivative_LeakyReLu(z_2_D_G_pre, 0.2))

        dJ_G_activation_1_D_G_post = np.zeros_like(activation_1_D_G_post)

        for i in range(len(activation_1_D_G_post)):
            for k in range(len(activation_1_D_G_post[i])):
                dJ_G_activation_1_D_G_post[i][k] = np.dot(gamma_2_D[i], Weights_1_to_2_D[:,k])

        dJ_G_activation_1_D_G_pre = Batch_Normalization_Derivative(dJ_G_activation_1_D_G_post, activation_1_D_G_hat, activation_1_D_G_pre, mu_1_D_G, sigma_2_1_D_G, theta_1_D, beta_1_D, False)

        gamma_1_D = np.multiply(dJ_G_activation_1_D_G_pre, derivative_LeakyReLu(z_1_D_G_pre, 0.2))

        dJ_G_activation_5_G = np.zeros_like(activation_5_G)

        for i in range(len(activation_5_G)):
            for k in range(len(activation_5_G[i])):
                dJ_G_activation_5_G[i][k] = np.dot(gamma_1_D[i], Weights_0_to_1_D[:,k])

        ##Ok, this is the backpropagation of the Generator's Jensen Shannon Divergence through the generator  

        gamma_5_G = np.multiply(dJ_G_activation_5_G, np.ones_like(z_5_G) - np.square(np.tanh((z_5_G))))

        dJ_G_Weights_4_to_5_G = np.zeros_like(Weights_4_to_5_G)

        for j in range(len(Weights_4_to_5_G)):
            for k in range(len(Weights_4_to_5_G[j])):
                dJ_G_Weights_4_to_5_G[j][k] = np.dot(gamma_5_G[:,j], activation_4_G_post[:,k])

        dJ_G_bias_5_G = np.zeros_like(bias_5_G)

        for j in range(len(bias_5_G)):
            dJ_G_bias_5_G[j] = np.sum(gamma_5_G[:,j]) 

        dJ_G_activation_4_G_post = np.zeros_like(activation_4_G_post)

        for i in range(len(activation_4_G_post)):
            for k in range(len(activation_4_G_post[i])):
                dJ_G_activation_4_G_post[i][k] = np.dot(gamma_5_G[i], Weights_4_to_5_G[:,k])

        (dJ_G_activation_4_G_pre, dJ_G_theta_4_G, dJ_G_beta_4_G) = Batch_Normalization_Derivative(dJ_G_activation_4_G_post, activation_4_G_hat, activation_4_G_pre, mu_4_G, sigma_2_4_G, theta_4_G, beta_4_G, True)

        gamma_4_G = np.multiply(dJ_G_activation_4_G_pre, derivative_LeakyReLu(z_4_G_pre, 0.2))

        dJ_G_Weights_3_to_4_G = np.zeros_like(Weights_3_to_4_G)

        for j in range(len(Weights_3_to_4_G)):
            for k in range(len(Weights_3_to_4_G[j])):
                dJ_G_Weights_3_to_4_G[j][k] = np.dot(gamma_4_G[:,j], activation_3_G_post[:,k])

        dJ_G_activation_3_G_post = np.zeros_like(activation_3_G_post)

        for i in range(len(activation_3_G_post)):
            for k in range(len(activation_3_G_post[i])):
                dJ_G_activation_3_G_post[i][k] = np.dot(gamma_4_G[i], Weights_3_to_4_G[:,k])

        (dJ_G_activation_3_G_pre, dJ_G_theta_3_G, dJ_G_beta_3_G) = Batch_Normalization_Derivative(dJ_G_activation_3_G_post, activation_3_G_hat, activation_3_G_pre, mu_3_G, sigma_2_3_G, theta_3_G, beta_3_G, True)

        gamma_3_G = np.multiply(dJ_G_activation_3_G_pre, derivative_LeakyReLu(z_3_G_pre, 0.2))

        dJ_G_Weights_2_to_3_G = np.zeros_like(Weights_2_to_3_G)

        for j in range(len(Weights_2_to_3_G)):
            for k in range(len(Weights_2_to_3_G[j])):
                dJ_G_Weights_2_to_3_G[j][k] = np.dot(gamma_3_G[:,j], activation_2_G_post[:,k])

        dJ_G_activation_2_G_post = np.zeros_like(activation_2_G_post)

        for i in range(len(activation_2_G_post)):
            for k in range(len(activation_2_G_post[i])):
                dJ_G_activation_2_G_post[i][k] = np.dot(gamma_3_G[i], Weights_2_to_3_G[:,k])

        (dJ_G_activation_2_G_pre, dJ_G_theta_2_G, dJ_G_beta_2_G) = Batch_Normalization_Derivative(dJ_G_activation_2_G_post, activation_2_G_hat, activation_2_G_pre, mu_2_G, sigma_2_2_G, theta_2_G, beta_2_G, True)

        gamma_2_G = np.multiply(dJ_G_activation_2_G_pre, derivative_LeakyReLu(z_2_G_pre, 0.2))

        dJ_G_Weights_1_to_2_G = np.zeros_like(Weights_1_to_2_G)

        for j in range(len(Weights_1_to_2_G)):
            for k in range(len(Weights_1_to_2_G[j])):
                dJ_G_Weights_1_to_2_G[j][k] = np.dot(gamma_2_G[:,j], activation_1_G_post[:,k]) 

        dJ_G_activation_1_G_post = np.zeros_like(activation_1_G_post)

        for i in range(len(activation_1_G_post)):
            for k in range(len(activation_1_G_post[i])):
                dJ_G_activation_1_G_post[i][k] = np.dot(gamma_2_G[i], Weights_1_to_2_G[:,k])

        (dJ_G_activation_1_G_pre, dJ_G_theta_1_G, dJ_G_beta_1_G) = Batch_Normalization_Derivative(dJ_G_activation_1_G_post, activation_1_G_hat, activation_1_G_pre, mu_1_G, sigma_2_1_G, theta_1_G, beta_1_G, True)

        gamma_1_G = np.multiply(dJ_G_activation_1_G_pre, derivative_LeakyReLu(z_1_G_pre, 0.2))

        dJ_G_Weights_0_to_1_G = np.zeros_like(Weights_0_to_1_G)

        for j in range(len(Weights_0_to_1_G)):
            for k in range(len(Weights_0_to_1_G[j])):
                dJ_G_Weights_0_to_1_G[j][k] = np.dot(gamma_1_G[:,j], latent_space_random_vector[:,k])

        ##OK, and we update all of our weights below:

        def _adam_step(param, m, v, grad, lr, step, beta1=0.9, beta2=0.999, eps=10e-8):
            """Apply one bias-corrected Adam update and return ``(param, m, v)``.

            param : current parameter value (array or scalar).
            m, v  : running first / second moment estimates for ``param``.
            grad  : gradient of the loss with respect to ``param``.
            lr    : step size.
            step  : update counter used for bias correction.
                    NOTE(review): step == 0 makes both correction denominators zero;
                    presumably the counter starts at 1 -- confirm where ``r`` is set.
            eps   : 10e-8 (== 1e-7), kept exactly as the original code had it.
            """
            m = beta1 * m + (1 - beta1) * np.array(grad)
            v = beta2 * v + (1 - beta2) * np.square(grad)
            m_hat = m / (1 - beta1 ** step)
            v_hat = v / (1 - beta2 ** step)
            # Keep lr * (m_hat / denom) grouping so rounding matches the original update.
            param = param - lr * np.divide(m_hat, np.sqrt(v_hat) + eps)
            return param, m, v

        lr_D = 0.005  # step size used for every discriminator parameter
        lr_G = 0.009  # step size used for every generator parameter

        # ---- Discriminator parameters ----
        # theta/beta gradients are the sum of the "_in_S" and "_in_G" contributions
        # (presumably the real-sample pass and the generated-sample pass -- verify upstream).

        Weights_4_to_5_D, Weights_4_to_5_D_m_matrix, Weights_4_to_5_D_v_matrix = _adam_step(
            Weights_4_to_5_D, Weights_4_to_5_D_m_matrix, Weights_4_to_5_D_v_matrix,
            dJ_D_Weights_4_to_5_D, lr_D, r)

        bias_5_D, bias_5_D_m_matrix, bias_5_D_v_matrix = _adam_step(
            bias_5_D, bias_5_D_m_matrix, bias_5_D_v_matrix,
            dJ_D_bias_4_to_5_D, lr_D, r)

        theta_4_D, theta_4_D_m_matrix, theta_4_D_v_matrix = _adam_step(
            theta_4_D, theta_4_D_m_matrix, theta_4_D_v_matrix,
            dJ_D_theta_4_D_in_S + dJ_D_theta_4_D_in_G, lr_D, r)

        beta_4_D, beta_4_D_m_matrix, beta_4_D_v_matrix = _adam_step(
            beta_4_D, beta_4_D_m_matrix, beta_4_D_v_matrix,
            dJ_D_beta_4_D_in_S + dJ_D_beta_4_D_in_G, lr_D, r)

        Weights_3_to_4_D, Weights_3_to_4_D_m_matrix, Weights_3_to_4_D_v_matrix = _adam_step(
            Weights_3_to_4_D, Weights_3_to_4_D_m_matrix, Weights_3_to_4_D_v_matrix,
            dJ_D_Weights_3_to_4_D, lr_D, r)

        theta_3_D, theta_3_D_m_matrix, theta_3_D_v_matrix = _adam_step(
            theta_3_D, theta_3_D_m_matrix, theta_3_D_v_matrix,
            dJ_D_theta_3_D_in_S + dJ_D_theta_3_D_in_G, lr_D, r)

        beta_3_D, beta_3_D_m_matrix, beta_3_D_v_matrix = _adam_step(
            beta_3_D, beta_3_D_m_matrix, beta_3_D_v_matrix,
            dJ_D_beta_3_D_in_S + dJ_D_beta_3_D_in_G, lr_D, r)

        Weights_2_to_3_D, Weights_2_to_3_D_m_matrix, Weights_2_to_3_D_v_matrix = _adam_step(
            Weights_2_to_3_D, Weights_2_to_3_D_m_matrix, Weights_2_to_3_D_v_matrix,
            dJ_D_Weights_2_to_3_D, lr_D, r)

        theta_2_D, theta_2_D_m_matrix, theta_2_D_v_matrix = _adam_step(
            theta_2_D, theta_2_D_m_matrix, theta_2_D_v_matrix,
            dJ_D_theta_2_D_in_S + dJ_D_theta_2_D_in_G, lr_D, r)

        beta_2_D, beta_2_D_m_matrix, beta_2_D_v_matrix = _adam_step(
            beta_2_D, beta_2_D_m_matrix, beta_2_D_v_matrix,
            dJ_D_beta_2_D_in_S + dJ_D_beta_2_D_in_G, lr_D, r)

        Weights_1_to_2_D, Weights_1_to_2_D_m_matrix, Weights_1_to_2_D_v_matrix = _adam_step(
            Weights_1_to_2_D, Weights_1_to_2_D_m_matrix, Weights_1_to_2_D_v_matrix,
            dJ_D_Weights_1_to_2_D, lr_D, r)

        theta_1_D, theta_1_D_m_matrix, theta_1_D_v_matrix = _adam_step(
            theta_1_D, theta_1_D_m_matrix, theta_1_D_v_matrix,
            dJ_D_theta_1_D_in_S + dJ_D_theta_1_D_in_G, lr_D, r)

        beta_1_D, beta_1_D_m_matrix, beta_1_D_v_matrix = _adam_step(
            beta_1_D, beta_1_D_m_matrix, beta_1_D_v_matrix,
            dJ_D_beta_1_D_in_S + dJ_D_beta_1_D_in_G, lr_D, r)

        Weights_0_to_1_D, Weights_0_to_1_D_m_matrix, Weights_0_to_1_D_v_matrix = _adam_step(
            Weights_0_to_1_D, Weights_0_to_1_D_m_matrix, Weights_0_to_1_D_v_matrix,
            dJ_D_Weights_0_to_1_D, lr_D, r)

        # ---- Generator parameters ----

        Weights_4_to_5_G, Weights_4_to_5_G_m_matrix, Weights_4_to_5_G_v_matrix = _adam_step(
            Weights_4_to_5_G, Weights_4_to_5_G_m_matrix, Weights_4_to_5_G_v_matrix,
            dJ_G_Weights_4_to_5_G, lr_G, r)

        bias_5_G, bias_5_G_m_matrix, bias_5_G_v_matrix = _adam_step(
            bias_5_G, bias_5_G_m_matrix, bias_5_G_v_matrix,
            dJ_G_bias_5_G, lr_G, r)

        theta_4_G, theta_4_G_m_matrix, theta_4_G_v_matrix = _adam_step(
            theta_4_G, theta_4_G_m_matrix, theta_4_G_v_matrix,
            dJ_G_theta_4_G, lr_G, r)

        beta_4_G, beta_4_G_m_matrix, beta_4_G_v_matrix = _adam_step(
            beta_4_G, beta_4_G_m_matrix, beta_4_G_v_matrix,
            dJ_G_beta_4_G, lr_G, r)

        Weights_3_to_4_G, Weights_3_to_4_G_m_matrix, Weights_3_to_4_G_v_matrix = _adam_step(
            Weights_3_to_4_G, Weights_3_to_4_G_m_matrix, Weights_3_to_4_G_v_matrix,
            dJ_G_Weights_3_to_4_G, lr_G, r)

        theta_3_G, theta_3_G_m_matrix, theta_3_G_v_matrix = _adam_step(
            theta_3_G, theta_3_G_m_matrix, theta_3_G_v_matrix,
            dJ_G_theta_3_G, lr_G, r)

        beta_3_G, beta_3_G_m_matrix, beta_3_G_v_matrix = _adam_step(
            beta_3_G, beta_3_G_m_matrix, beta_3_G_v_matrix,
            dJ_G_beta_3_G, lr_G, r)

        Weights_2_to_3_G, Weights_2_to_3_G_m_matrix, Weights_2_to_3_G_v_matrix = _adam_step(
            Weights_2_to_3_G, Weights_2_to_3_G_m_matrix, Weights_2_to_3_G_v_matrix,
            dJ_G_Weights_2_to_3_G, lr_G, r)

        theta_2_G, theta_2_G_m_matrix, theta_2_G_v_matrix = _adam_step(
            theta_2_G, theta_2_G_m_matrix, theta_2_G_v_matrix,
            dJ_G_theta_2_G, lr_G, r)

        beta_2_G, beta_2_G_m_matrix, beta_2_G_v_matrix = _adam_step(
            beta_2_G, beta_2_G_m_matrix, beta_2_G_v_matrix,
            dJ_G_beta_2_G, lr_G, r)

        Weights_1_to_2_G, Weights_1_to_2_G_m_matrix, Weights_1_to_2_G_v_matrix = _adam_step(
            Weights_1_to_2_G, Weights_1_to_2_G_m_matrix, Weights_1_to_2_G_v_matrix,
            dJ_G_Weights_1_to_2_G, lr_G, r)

        theta_1_G, theta_1_G_m_matrix, theta_1_G_v_matrix = _adam_step(
            theta_1_G, theta_1_G_m_matrix, theta_1_G_v_matrix,
            dJ_G_theta_1_G, lr_G, r)

        beta_1_G, beta_1_G_m_matrix, beta_1_G_v_matrix = _adam_step(
            beta_1_G, beta_1_G_m_matrix, beta_1_G_v_matrix,
            dJ_G_beta_1_G, lr_G, r)

        Weights_0_to_1_G, Weights_0_to_1_G_m_matrix, Weights_0_to_1_G_v_matrix = _adam_step(
            Weights_0_to_1_G, Weights_0_to_1_G_m_matrix, Weights_0_to_1_G_v_matrix,
            dJ_G_Weights_0_to_1_G, lr_G, r)

print("Ok, let's see if it works!")

##Our 5's will (hopefully) be generated here

# One standard-normal latent vector of length 100 per sample in the batch.
latent_space_random_vector = np.random.normal(0, 1, 100*Batch_Size).reshape(Batch_Size, 100)

# Generator layer 1: per-sample linear map, LeakyReLU (slope 0.2), then batch norm
# evaluated with the stored mu_*_true / sigma_2_*_true statistics instead of batch statistics.
z_1_G_pre = [
    np.matmul(Weights_0_to_1_G, latent_space_random_vector[n]) for n in range(Batch_Size)
]
activation_1_G_pre = LeakyReLu(z_1_G_pre, 0.2)
activation_1_G_post = test_Batch_Normalization(
    activation_1_G_pre, mu_1_G_true, sigma_2_1_G_true, theta_1_G, beta_1_G)

# Generator layer 2.
z_2_G_pre = [
    np.matmul(Weights_1_to_2_G, activation_1_G_post[n]) for n in range(Batch_Size)
]
activation_2_G_pre = LeakyReLu(z_2_G_pre, 0.2)
activation_2_G_post = test_Batch_Normalization(
    activation_2_G_pre, mu_2_G_true, sigma_2_2_G_true, theta_2_G, beta_2_G)

# Generator layer 3.
z_3_G_pre = [
    np.matmul(Weights_2_to_3_G, activation_2_G_post[n]) for n in range(Batch_Size)
]
activation_3_G_pre = LeakyReLu(z_3_G_pre, 0.2)
activation_3_G_post = test_Batch_Normalization(
    activation_3_G_pre, mu_3_G_true, sigma_2_3_G_true, theta_3_G, beta_3_G)

# Generator layer 4.
z_4_G_pre = [
    np.matmul(Weights_3_to_4_G, activation_3_G_post[n]) for n in range(Batch_Size)
]
activation_4_G_pre = LeakyReLu(z_4_G_pre, 0.2)
activation_4_G_post = test_Batch_Normalization(
    activation_4_G_pre, mu_4_G_true, sigma_2_4_G_true, theta_4_G, beta_4_G)

# Output layer: linear map plus bias, squashed with tanh (no batch norm here).
z_5_G = [
    np.matmul(Weights_4_to_5_G, activation_4_G_post[n]) + bias_5_G for n in range(Batch_Size)
]
activation_5_G = np.tanh(z_5_G)

##We will feed our generated 5's through the discriminator to test its strength

# Discriminator layer 1 on the generated batch: per-sample linear map, LeakyReLU (slope 0.2),
# then batch norm using the stored *_D_G_true statistics.
z_1_D_G_pre = [
    np.matmul(Weights_0_to_1_D, activation_5_G[n]) for n in range(Batch_Size)
]
activation_1_D_G_pre = LeakyReLu(z_1_D_G_pre, 0.2)
activation_1_D_G_post = test_Batch_Normalization(
    activation_1_D_G_pre, mu_1_D_G_true, sigma_2_1_D_G_true, theta_1_D, beta_1_D)

# Discriminator layer 2.
z_2_D_G_pre = [
    np.matmul(Weights_1_to_2_D, activation_1_D_G_post[n]) for n in range(Batch_Size)
]
activation_2_D_G_pre = LeakyReLu(z_2_D_G_pre, 0.2)
activation_2_D_G_post = test_Batch_Normalization(
    activation_2_D_G_pre, mu_2_D_G_true, sigma_2_2_D_G_true, theta_2_D, beta_2_D)

# Discriminator layer 3.
z_3_D_G_pre = [
    np.matmul(Weights_2_to_3_D, activation_2_D_G_post[n]) for n in range(Batch_Size)
]
activation_3_D_G_pre = LeakyReLu(z_3_D_G_pre, 0.2)
activation_3_D_G_post = test_Batch_Normalization(
    activation_3_D_G_pre, mu_3_D_G_true, sigma_2_3_D_G_true, theta_3_D, beta_3_D)

# Discriminator layer 4.
z_4_D_G_pre = [
    np.matmul(Weights_3_to_4_D, activation_3_D_G_post[n]) for n in range(Batch_Size)
]
activation_4_D_G_pre = LeakyReLu(z_4_D_G_pre, 0.2)
activation_4_D_G_post = test_Batch_Normalization(
    activation_4_D_G_pre, mu_4_D_G_true, sigma_2_4_D_G_true, theta_4_D, beta_4_D)

# Output layer: dot product plus bias, passed through a sigmoid.
z_5_D_G = [
    np.dot(Weights_4_to_5_D, activation_4_D_G_post[n]) + bias_5_D for n in range(Batch_Size)
]
activation_5_D_G = sigmoid(np.array(z_5_D_G))

# One sigmoid output per generated sample.
print(activation_5_D_G)

##We will print our 5's here

# Rescale the tanh output with 128.5 * (x + 1) - 1.
# NOTE(review): presumably this inverts the input normalisation used in training -- confirm.
pixel_values = 128.5 * (activation_5_G + np.ones_like(activation_5_G)) - np.ones_like(activation_5_G)

# A fully saturated tanh (exactly -1 or +1) lands on -1 or 256, outside the 8-bit
# pixel range; clamp before the integer cast. In-range values are unaffected.
pixel_values = np.clip(pixel_values, 0, 255)

# One 28x28 integer image per sample (astype(int) truncates the fractional part).
activation_5_G = pixel_values.reshape(Batch_Size, 28, 28).astype(int)

# Dump every image as a grid of right-aligned, 4-character-wide intensities.
for i in range(len(activation_5_G)):
    print('sample number', i)
    print('\n'.join(''.join('{:4}'.format(item) for item in row)
                    for row in activation_5_G[i]))

Ok, let's see if it works!
[0.50037263 0.50239228 0.50306714 0.50284818 0.50020746 0.5062942
 0.4980926  0.49956636 0.49716391 0.49752016 0.49659351 0.50257124
 0.49922807 0.50082847 0.49688394 0.49990534 0.50281993 0.4984096
 0.49884072 0.50286089 0.49799143 0.50267568 0.48790949 0.5027447
 0.49201632 0.50201391 0.49309657 0.49721398 0.49428917 0.49715001
 0.49819584 0.49681485 0.50231043 0.50061516 0.49814084 0.50228673
 0.50165571 0.50543344 0.50259892 0.50279148 0.50457597 0.5030863
 0.50244692 0.49538976 0.49941839 0.49867048 0.50089529 0.49942859]
sample number 0
   5 255 255 255   3  28 255 255 123 255   2 255   2 173   0 255   0 233 255   0 255   0 255 255 138   0 148   0
  31 255 255  97   0   3  34   2 255 252   0 255 255   0  11   0   0 255   0 152 255 241 255   0 255   0   0  23
  60 254 255 211   0  85 103 255 254   0 255 255   0 255 255  17   0   0   0   0 255  57   0 255  12 255   6  71
 255   6   0 255  20 255 255 232 255 234   0 235   0 254   0   0 255   1   0   0   0 

In [34]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.datasets import mnist  
import math
from numpy import random
import sys

# Load the MNIST train/test split through the Keras dataset helper.
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

# Lift NumPy's summarisation threshold so arrays are printed in full (this changes
# the global print options for the rest of the session), then show the first image.
np.set_printoptions(threshold=np.inf)
print(X_train[0])



[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   3  18  18  18 126 136
  175  26 166 255 247 127   0   0   0   0]
 [  0   0   0   0   0   0   0   0  30  36  94 154 170 253 253 253 253 253
  225 172 253 242 195  64   0   0   0   0]
 [  0   0   0   0   0   0   0  49 238 253 253 253 253 253 253 253 253 251
   93  82  82  56  39   0   0   0   0   0]
 [  0   0   0   0   0   0   0  18 219 253 253 253 253 253 198 18