# Neural Network for binary classification

##### The problem consists of predicting acceptance from 2D feature vectors. The data is from the "microchip acceptance" problem.

### We will start by arranging the data and parameters, then present the implemented functions, and finish with the training and conclusions!

#### First, we need to import the used libraries:

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import optimize
import os

#### Next, we have to load and prepare the data:

In [2]:
current_dir = os.getcwd()
data_path = os.path.join(current_dir, "classification2.txt")

# Read the raw comma-separated lines. If the file is missing, report where we
# looked and re-raise: nothing below can run without the data, and continuing
# would only fail later with an unrelated NameError.
try:
    with open(data_path, "r") as input_file:
        training_data = input_file.readlines()
except FileNotFoundError:
    print(f"Error: Could not find {data_path}")
    print(f"Current working directory: {os.getcwd()}")
    raise

# Split every non-empty line into its fields; blank lines (e.g. a trailing
# newline at the end of the file) are skipped instead of breaking float().
# The `with` block above has already closed the file.
training_data = [line.strip().split(",") for line in training_data if line.strip()]

# Columns 0 and 1 are the two input features, column 2 is the label.
input_data = np.array([[float(row[0]), float(row[1])] for row in training_data], dtype=np.float32)
X = input_data
Y = np.array([float(row[2]) for row in training_data], dtype=np.float32).reshape(-1, 1)

#### We can also standardize the X values (zero mean and unit variance per feature):

In [3]:
# Column-wise standardisation: subtract each feature's mean and divide by its
# standard deviation. The 1e-8 keeps the division finite for a constant column.
X_mean, X_std = np.mean(input_data, axis=0), np.std(input_data, axis=0)
X = data = (input_data - X_mean) / (X_std + 1e-8)

#### Hyperparameters and parameters:

In [4]:
def xavier_init(n_out, n_in):
    """Return a random (n_out, n_in) weight matrix.

    Each entry is a standard-normal sample from NumPy's global random state,
    multiplied by sqrt(1 / n_in).
    """
    scale = np.sqrt(1.0 / n_in)
    samples = np.random.standard_normal((n_out, n_in))
    return samples * scale

In [5]:
# Number of training examples (rows of X).
M = len(X)

# Scalar settings defined up front for the rest of the notebook.
init_epsilon = 0.01
learning_rate = 1e-4
epochs = 10_000
lbd = 1e-1  # regularisation strength

In [6]:
# Run this for the first NN.
# One (rows, cols) pair per weight matrix: rows = units in the layer being
# computed, cols = units feeding it plus one bias column.
# Layer widths here are 2 -> 10 -> 7 -> 1.
shapes = [(n_out, n_in + 1) for n_in, n_out in zip((2, 10, 7), (10, 7, 1))]


In [7]:
# Run this for the second NN.
# One (rows, cols) pair per weight matrix: rows = units in the layer being
# computed, cols = units feeding it plus one bias column.
# Layer widths here are 2 -> 6 -> 4 -> 1.
shapes = [(n_out, n_in + 1) for n_in, n_out in zip((2, 6, 4), (6, 4, 1))]

In [8]:
# Draw the three weight matrices in order (first, second, third entry of
# `shapes`), so the random stream is consumed in the same sequence as before.
theta0, theta1, theta2 = (
    xavier_init(shapes[k][0], shapes[k][1]) for k in range(3)
)

W = [theta0, theta1, theta2]
n_layers = 1 + len(W)  # one more layer than there are weight matrices

##### Unrolled thetas will be used for the `scipy.optimize.minimize` implementation

In [9]:
# Flatten every weight matrix in row-major order and join them end to end
# into one 1-D parameter vector.
theta = np.concatenate([w.ravel() for w in W])
# Independent copy of the unrolled vector.
initial_theta = theta.copy()

In [10]:
## initialize theta with the provided function
#TODO

#### Now we can start with the functions. The activation used is the sigmoid; the error is the binary cross-entropy:

In [11]:
def sigmoid(z, derivative=False):
    """Logistic sigmoid, or its derivative written in terms of the activation.

    Parameters
    ----------
    z : scalar or numpy array
        With ``derivative=False`` this is the pre-activation value; the
        function returns ``1 / (1 + exp(-z))``.
        With ``derivative=True`` this must ALREADY be an activation
        ``a = sigmoid(x)``; the function returns ``a * (1 - a)``, which is
        the sigmoid's slope at ``x``. It does not apply the sigmoid first.
    derivative : bool
        Selects which of the two expressions above is returned.
    """
    if derivative:
        # No exponential is needed here; evaluating it on an activation
        # would only waste work and could emit overflow warnings.
        return z * (1 - z)
    return 1 / (1 + np.exp(-z))

def bin_logistic_error(t, y):
    """Mean binary cross-entropy of predictions ``t`` against labels ``y``.

    A small constant is added inside both logarithms so that a prediction of
    exactly 0 or 1 does not evaluate log(0).
    """
    tiny = 1e-8
    log_positive = np.log(t + tiny)
    log_negative = np.log(1 - t + tiny)
    per_sample = y * log_positive + (1 - y) * log_negative
    return -np.mean(per_sample)

###### 

#### The gradient will be checked to ensure it is correct; for simplicity, a modified version of gradient descent was created:

#### Forward propagation:

In [12]:
def forward_prop(X, W, l, activation_func=sigmoid):
    """Propagate one sample through the network, keeping every activation.

    Parameters
    ----------
    X : numpy array
        Features of a single sample, without a bias entry (the callers in
        this file pass one row of the data matrix).
    W : sequence of 2-D arrays
        Weight matrices. ``W[i]`` is multiplied with the bias-augmented
        activation of layer ``i``, so its first column acts on the bias.
    l : int
        Number of layers; ``l - 1`` weight matrices are applied.
    activation_func : callable
        Applied element-wise after every matrix product.

    Returns
    -------
    A : list
        ``l`` arrays: the flattened input followed by each layer's output.
    AWB : list
        The same ``l`` activations, each with a leading 1 inserted.
    """
    A = [X.reshape(-1)]         # input as a flat vector
    AWB = [np.insert(X, 0, 1)]  # np.insert without an axis flattens, then prepends the bias

    for i in range(l - 1):
        a = activation_func(np.dot(W[i], AWB[i]))
        A.append(a)
        AWB.append(np.insert(a, 0, 1))  # bias-augmented copy feeds the next layer

    return A, AWB

#### Backward propagation:

In [13]:
def backward_prop(Y, A, AWB, W, l, activation_derivative=sigmoid):
    """Back-propagate one sample and return the gradient of every weight matrix.

    The output delta assumes a sigmoid output unit trained with binary
    cross-entropy, which is the loss this file uses. For that pairing the
    derivative of the loss with respect to the output pre-activation is
    exactly ``a - y``: the sigmoid-slope factor cancels against the loss
    derivative, so it must not be multiplied in again.

    Parameters
    ----------
    Y : numpy array
        Target for this sample, broadcastable against ``A[-1]``.
    A : list
        Per-layer activations as returned by ``forward_prop``.
    AWB : list
        The same activations with a leading bias entry.
    W : sequence of 2-D arrays
        Weight matrices; column 0 of each holds the bias weights.
    l : int
        Number of layers (``len(W) + 1`` in this file's callers).
    activation_derivative : callable
        Called as ``activation_derivative(a, derivative=True)`` on a hidden
        layer's activation to obtain that layer's slope.

    Returns
    -------
    list
        One gradient per weight matrix, in the same order and shapes as ``W``.
    """
    # Output layer: delta is the plain prediction error (see docstring).
    delta = (A[-1] - Y).reshape(-1, 1)                  # (n_output, 1)
    gradients = [np.dot(delta, AWB[-2].reshape(1, -1))]  # (n_output, n_hidden + 1)
    deltas = [delta]

    # Hidden layers, walking from the last hidden layer back to the first.
    for layer in range(l - 2, 0, -1):
        # The bias column is dropped: the bias unit has no incoming error.
        w_next_no_bias = W[layer][:, 1:]
        delta_next = deltas[0]

        act_deriv = activation_derivative(A[layer], derivative=True).reshape(-1, 1)
        delta = np.dot(w_next_no_bias.T, delta_next) * act_deriv
        deltas.insert(0, delta)

        # Gradient of the matrix feeding this layer uses the bias-augmented
        # activation of the layer before it.
        a_prev = AWB[layer - 1]
        gradients.insert(0, np.dot(delta, a_prev.reshape(1, -1)))

    return gradients

In [14]:
def J(W, X, Y, error_func=bin_logistic_error, shapes=None, lbd=0):
    """Cost of the network described by an unrolled parameter vector.

    Parameters
    ----------
    W : 1-D array-like
        All weight matrices flattened row-major and concatenated in layer order.
    X : indexable of samples
        ``X[i]`` is passed to ``forward_prop`` as one sample.
    Y : array
        Targets; ``len(Y)`` is taken as the number of training examples.
    error_func : callable
        Called as ``error_func(outputs, Y)``; expected to return an already
        averaged loss (as ``bin_logistic_error`` does).
    shapes : list of (rows, cols) tuples, optional
        Shape of each weight matrix. Defaults to the first network's
        architecture ``[(10, 3), (7, 11), (1, 8)]``; pass the matching list
        when evaluating any other architecture.
    lbd : float, optional
        L2 regularisation strength; 0 (the default) disables the penalty.

    Returns
    -------
    float
        ``error_func(outputs, Y)`` plus, when ``lbd > 0``,
        ``lbd / (2 * m)`` times the sum of squared non-bias weights.

    Raises
    ------
    ValueError
        If the length of ``W`` does not match the total size implied by ``shapes``.
    """
    if shapes is None:
        shapes = [(10, 3), (7, 11), (1, 8)]

    m = len(Y)  # number of training examples
    flat = np.asarray(W)

    sizes = [int(np.prod(shape)) for shape in shapes]
    split_points = np.cumsum([0] + sizes)
    if flat.size != split_points[-1]:
        # Without this check a mismatch surfaces as an opaque reshape error.
        raise ValueError(
            f"theta has {flat.size} entries but shapes {shapes} need {split_points[-1]}"
        )
    reshaped_W = [
        flat[split_points[i]:split_points[i + 1]].reshape(shapes[i])
        for i in range(len(shapes))
    ]

    # Output-layer activation for every training example.
    n_layers = len(reshaped_W) + 1
    outputs = np.array(
        [forward_prop(X[i], reshaped_W, n_layers)[0][-1] for i in range(m)]
    )

    # error_func already averages over the examples, so no further division.
    cost = error_func(outputs, Y)

    if lbd > 0:
        # Penalise only the non-bias columns of the reshaped matrices.
        reg_sum = sum(np.sum(w[:, 1:] ** 2) for w in reshaped_W)
        cost += (lbd / (2 * m)) * reg_sum

    return cost

def gradient_validation(theta, X, Y, epsilon=1e-5):
    """Approximate the gradient of ``J`` at ``theta`` by central differences.

    Each component is perturbed by ``+epsilon`` and ``-epsilon`` in turn, and
    the slope ``(J(theta+) - J(theta-)) / (2 * epsilon)`` is stored at that
    index. The result has the same shape and dtype as ``theta``.
    """
    numeric_grad = np.zeros_like(theta)
    step = 2 * epsilon
    for idx in range(len(theta)):
        shifted_up = np.array(theta, copy=True)
        shifted_down = np.array(theta, copy=True)
        shifted_up[idx] += epsilon
        shifted_down[idx] -= epsilon
        cost_up = J(shifted_up, X, Y)
        cost_down = J(shifted_down, X, Y)
        numeric_grad[idx] = (cost_up - cost_down) / step
    return numeric_grad

def gradient_descent_for_grad_check(X, Y, w: list, lbd=0):
    """Average the back-propagated gradient over all samples (no weight update).

    Intended to be compared against a numerical gradient of ``J``.

    Parameters
    ----------
    X : indexable of samples
        ``X[i]`` is fed to ``forward_prop``.
    Y : indexable of targets
        ``Y[i]`` is the target passed to ``backward_prop`` for sample ``i``.
    w : list of 2-D arrays
        Weight matrices; they are read but never modified.
    lbd : float, optional
        L2 regularisation strength. The added term is ``(lbd / M) * w`` on the
        non-bias columns, the derivative of the ``lbd / (2 * M) * sum(w**2)``
        penalty used by ``J``.

    Returns
    -------
    list
        One averaged gradient per weight matrix, same shapes as ``w``.

    Raises
    ------
    ValueError
        If ``X`` contains no samples.
    """
    M = len(X)
    if M == 0:
        raise ValueError("cannot average gradients over an empty data set")
    n_layers = len(w) + 1

    # Accumulate the per-sample gradients of every weight matrix.
    dW_total = [np.zeros_like(wi) for wi in w]
    for inp in range(M):
        A, AWB = forward_prop(X[inp], w, n_layers)
        dW = backward_prop(Y[inp], A, AWB, w, n_layers)
        for total, sample_grad in zip(dW_total, dW):
            total += sample_grad

    grad_list = []
    for wi, total in zip(w, dW_total):
        grad = total / M
        grad[:, 1:] += (lbd / M) * wi[:, 1:]  # regularise only non-bias columns
        grad_list.append(grad)

    return grad_list
