In [176]:
# In this notebook, you will learn:
#
# 1) How to train a simple neural network on a toy dataset.
# 2) How the performance of the neural network varies with the complexity of the dataset.

In [177]:
import math
import random

from graphviz import Digraph
from queue import Queue
from typing import Any, List, Set, Tuple, Union

In [178]:
def topological_sort(root: Any) -> List[Any]:
    """Returns the nodes reachable from 'root' in topological order (root first).

    Every node appears before all of the nodes listed in its 'children' attribute, which is 
    the order needed for back propagation. The traversal is an iterative depth-first search 
    so that very deep expression graphs do not hit Python's recursion limit.

    Args:
        root (Any): The node to start from. Each node must be hashable and expose a 
                    'children' attribute (an iterable of nodes, or None / empty for leaves).

    Returns:
        List[Any]: The reachable nodes, starting with 'root'; each node appears exactly once.
    """
    visited: Set[Any] = {root}
    post_order: List[Any] = []
    # Each stack entry is a node together with an iterator over the children that have not
    # been explored yet. This mirrors the call stack of a recursive depth-first search.
    stack = [(root, iter(root.children or ()))]
    while stack:
        node, pending_children = stack[-1]
        for child in pending_children:
            if child in visited:
                continue
            visited.add(child)
            stack.append((child, iter(child.children or ())))
            # Descend into this child first; the remaining children are handled when we come back.
            break
        else:
            # All the children are processed, so the node can be emitted (post-order).
            post_order.append(node)
            stack.pop()
    # Reversing the post-order gives a parents-before-children ordering.
    post_order.reverse()
    return post_order

In [179]:
class Value:
    """A scalar node in an expression graph that supports reverse-mode automatic differentiation.

    Every arithmetic operation on Value objects returns a new Value that remembers its operands
    ('children') and a closure ('grad_propagator') that pushes the gradient of the result back 
    into the operands. Calling 'backward' on the final result fills in 'grad' for every node.
    """

    def __init__(self, data: float, label: str="", operation: Union[str, None]=None, children: Union[Tuple, None]=None):
        # The number held by this node.
        self.data = data
        # Optional name, only used for printing and visualization.
        self.label = label
        # Text describing the operation that produced this node (None for leaf nodes).
        self.operation = operation
        # The operands this node was computed from (None for leaf nodes).
        self.children = children
        # Gradient of the node on which 'backward' was called with respect to this node.
        self.grad = 0.0
        # Closure that adds this node's contribution to the gradients of its children.
        self.grad_propagator = None
    
    def __add__(self, other: Union['Value', float, int]) -> 'Value':
        """Returns self + other. Plain numbers are wrapped into constant Value nodes."""
        if isinstance(other, (float, int)):
            other = Value(other)
        result = Value(self.data + other.data, operation="+", children=(self, other))
        def grad_propagator():
            self.grad += result.grad
            other.grad += result.grad
        result.grad_propagator = grad_propagator
        return result
    
    def __radd__(self, other: Union[float, int]) -> 'Value':
        """Returns other + self for a plain number on the left hand side."""
        return self.__add__(Value(other))
    
    def __mul__(self, other: Union['Value', float, int]) -> 'Value':
        """Returns self * other. Plain numbers are wrapped into constant Value nodes."""
        if isinstance(other, (float, int)):
            other = Value(other)
        result = Value(self.data * other.data, operation="*", children=(self, other)) 
        def grad_propagator():
            self.grad += result.grad * other.data
            other.grad += result.grad * self.data
        result.grad_propagator = grad_propagator
        return result
    
    def __rmul__(self, other: Union[float, int]) -> 'Value':
        """Returns other * self for a plain number on the left hand side."""
        return self.__mul__(Value(other))

    def __neg__(self) -> 'Value':
        """Returns -self, implemented as a multiplication by -1."""
        return self * -1

    def __sub__(self, other: Union['Value', float, int]) -> 'Value':
        """Returns self - other. Plain numbers are wrapped into constant Value nodes."""
        if isinstance(other, (float, int)):
            other = Value(other)
        result = Value(self.data - other.data, operation=f"{self.data}-{other.data}", children=(self, other))
        def grad_propagator():
            self.grad += result.grad
            other.grad -= result.grad
        result.grad_propagator = grad_propagator
        return result

    def __rsub__(self, other: Union[float, int]) -> 'Value':
        """Returns other - self for a plain number on the left hand side."""
        return Value(other).__sub__(self)

    def __truediv__(self, other: Union['Value', float, int]) -> 'Value':
        """Returns self / other. Plain numbers are wrapped into constant Value nodes.

        Raises:
            ZeroDivisionError: If the data of 'other' is zero.
        """
        if isinstance(other, (float, int)):
            other = Value(other)
        result = Value(self.data / other.data, operation=f"{self.data}/{other.data}", children=(self, other))
        def grad_propagator():
            self.grad += result.grad / other.data
            other.grad -= ((result.grad * self.data) / other.data**2)
        result.grad_propagator = grad_propagator
        return result

    def __rtruediv__(self, other: Union[float, int]) -> 'Value':
        """Returns other / self for a plain number on the left hand side."""
        return Value(other).__truediv__(self)

    def __pow__(self, other: Union['Value', float]) -> 'Value':
        """Returns self ** other. Plain numbers are wrapped into constant Value nodes.

        The gradient with respect to the exponent is 'result * ln(base)', which only exists
        for a positive base. For a zero or negative base that contribution is skipped (the 
        exponent's gradient is left untouched) instead of raising a math domain error, so 
        that expressions such as 'x ** 2' can be differentiated for any x.
        """
        if isinstance(other, (float, int)):
            other = Value(other)
        result = Value(self.data ** other.data, operation=f"{self.data}^{other.data}", children=(self, other))
        def grad_propagator():
            self.grad += result.grad * other.data * self.data**(other.data - 1)
            if self.data > 0:
                other.grad += result.grad * result.data * math.log(self.data)
        result.grad_propagator = grad_propagator
        return result

    def __rpow__(self, other: Union[int, float]) -> 'Value':
        """Returns other ** self for a plain number on the left hand side."""
        return Value(other).__pow__(self)

    def tanh(self) -> 'Value':
        """Returns tanh(self)."""
        # math.tanh saturates to +/-1 for large inputs whereas an explicit exp based formula 
        # overflows once the magnitude of the input exceeds roughly 709.
        computed_data = math.tanh(self.data)
        result = Value(data=computed_data, operation="tanh", children=(self,))
        def grad_propagator():
            # The derivative of tanh is (1 - tanh^2). This is used to calculate the gradients of the children nodes.
            self.grad += (1.0 - computed_data**2) * result.grad
        result.grad_propagator = grad_propagator
        return result

    def backward(self):
        """Back propagates gradients from this node to every node it was computed from.

        Sets the gradient of this node to 1.0 and then runs the gradient propagators in 
        topological order. Gradients are accumulated (+=) into the nodes, so the gradients 
        of nodes that are reused across multiple 'backward' calls must be zeroed by the caller.
        """
        topo_order: List[Value] = topological_sort(self)
        self.grad = 1.0
        for node in topo_order:
            # Leaf nodes have no operands and hence nothing to propagate.
            if node.grad_propagator is None:
                continue
            node.grad_propagator()

    def __repr__(self) -> str:
        return f"label: {self.label} | data: {self.data} | operation: {self.operation} | grad: {self.grad}"

In [180]:
def get_nodes_and_edges(root: Value) -> Tuple[Set[Value], Set[Tuple[Value, Value]]]:
    """Collects every node and every edge of the expression graph that ends in 'root'.

    Only the Value objects are collected; no separate nodes are created for the operations.

    Args:
        root (Value): The root node (the final output object) of the expression tree.

    Returns:
        Tuple[Set[Value], Set[Tuple[Value, Value]]]: The set of nodes and the set of edges. 
                                                     Each edge is a (child, parent) pair.
    """
    nodes = set()
    edges = set()
    pending = Queue()
    pending.put(root)
    # Breadth-first walk; 'nodes' doubles as the record of what has been expanded already.
    while not pending.empty():
        current = pending.get()
        if current in nodes:
            continue
        nodes.add(current)
        for child in (current.children or ()):
            edges.add((child, current))
            pending.put(child)
    return nodes, edges

def get_expression_graph(root: Value) -> Digraph:
    """Builds a graph that visualizes the expression created using the Value objects.

    Every Value becomes a record shaped node showing its label, data and gradient. A Value 
    that was produced by an operation additionally gets a small operation node that points 
    to it, and the operands are connected to that operation node.

    Args:
        root (Value): The root node (the final output object) of the expression tree.

    Returns:
        Digraph: DOT language graph that visualizes the expression created using the Value objects.
    """
    graph = Digraph(name="ExpressionGraph", 
                    comment="Constructs the expression graph using the Value objects.",
                    format="png",
                    graph_attr={"rankdir": "LR"})
    all_nodes, all_edges = get_nodes_and_edges(root)
    for value in all_nodes:
        # id() is unique per live object and therefore usable as the graph node name.
        value_id: str = str(id(value))
        graph.node(name=value_id, label=f"{value.label} | data {value.data:.4f} | grad {value.grad:.4f}", shape="record")
        if not value.operation:
            continue
        operation_id = f"{value_id}_{value.operation}"
        graph.node(name=operation_id, label=value.operation)
        graph.edge(tail_name=operation_id, head_name=value_id)
    for source, target in all_edges:
        # Operands connect to the operation node of the result, not to the result itself.
        graph.edge(tail_name=str(id(source)), head_name=f"{id(target)}_{target.operation}")
    return graph

### Trainable Parameters

In [181]:
# We will use back propagation to calculate the gradients for all the nodes in the expression graph.
# However, during training, we will only update the weights, biases, etc. that are created within the neural network.
# The input data should not be updated during training. So, we need to know all the learnable parameters in the 
# expression graph. Let's add a 'get_parameters' method to the Perceptron, Layer, and MultiLayerPerceptron classes to 
# get all the learnable parameters in the model.
#
# Note that we don't need to update the 'backward' method in the Value class. The gradients are calculated for the 
# inputs as well, but these gradients are not used to update the input data. This only incurs some extra 
# computation and does not affect the training process.

In [182]:
class Perceptron:
    """A single neuron: tanh of the weighted sum of the inputs plus a bias."""

    def __init__(self, num_inputs: int):
        # One weight per input, drawn uniformly from [0, 1]; the bias starts at zero.
        self.weights = [Value(data=random.uniform(a=0, b=1), label="weight") for _ in range(num_inputs)]
        self.bias = Value(data=0.0, label="bias")

    def __call__(self, inputs: List[Union[Value, float, int]]) -> Value:
        """Returns tanh(bias + sum of input * weight) as a Value."""
        pre_activation = self.bias
        for feature, weight in zip(inputs, self.weights):
            pre_activation = pre_activation + feature * weight
        return pre_activation.tanh()

    def get_parameters(self) -> List[Value]:
        """Returns the learnable parameters: all the weights followed by the bias."""
        return [*self.weights, self.bias]

In [183]:
class Layer:
    """A group of perceptrons that all receive the same inputs."""

    def __init__(self, num_inputs: int, num_neurons: int):
        self.neurons = [Perceptron(num_inputs=num_inputs) for _ in range(num_neurons)]
    
    def __call__(self, inputs: List[Union[Value, float, int]]) -> List[Value]:
        """Returns the output of every neuron in the layer, in neuron order."""
        return [neuron(inputs) for neuron in self.neurons]

    def get_parameters(self) -> List[Value]:
        """Returns the learnable parameters of all the neurons, neuron after neuron."""
        return [parameter for neuron in self.neurons for parameter in neuron.get_parameters()]

In [184]:
class MultiLayerPerceptron:
    """A stack of fully connected layers where each layer feeds the next one."""

    def __init__(self, num_inputs: int, num_neurons_per_layer: List[int]):
        # The first layer consumes the raw inputs; every later layer consumes the outputs 
        # of the layer before it.
        layer_stack = [Layer(num_inputs=num_inputs, num_neurons=num_neurons_per_layer[0])]
        for fan_in, fan_out in zip(num_neurons_per_layer, num_neurons_per_layer[1:]):
            layer_stack.append(Layer(num_inputs=fan_in, num_neurons=fan_out))
        self.layers = layer_stack
        
    def __call__(self, inputs: List[Union[Value, float, int]]) -> List[Value]:
        """Runs the inputs through every layer and returns the outputs of the last layer."""
        activations = inputs
        for layer in self.layers:
            activations = layer(activations)
        return activations

    def get_parameters(self) -> List[Value]:
        """Returns the learnable parameters of all the layers, layer after layer."""
        return [parameter for layer in self.layers for parameter in layer.get_parameters()]

In [185]:
# Let's create a small dummy neural network and verify that the parameters are being returned correctly.
# Expected count: 3 neurons * (2 weights + 1 bias) + 2 neurons * (3 weights + 1 bias) = 17 parameters.
# The weights come from random.uniform and no seed is set in the cells shown here, so the exact 
# values differ from run to run.
dummy_model = MultiLayerPerceptron(num_inputs=2, num_neurons_per_layer=[3, 2])
dummy_model.get_parameters()

[label: weight | data: 0.21963017343039903 | operation: None | grad: 0.0,
 label: weight | data: 0.8568035649102921 | operation: None | grad: 0.0,
 label: bias | data: 0.0 | operation: None | grad: 0.0,
 label: weight | data: 0.9597246121444907 | operation: None | grad: 0.0,
 label: weight | data: 0.5246267035787949 | operation: None | grad: 0.0,
 label: bias | data: 0.0 | operation: None | grad: 0.0,
 label: weight | data: 0.9137742356873361 | operation: None | grad: 0.0,
 label: weight | data: 0.5920915579902988 | operation: None | grad: 0.0,
 label: bias | data: 0.0 | operation: None | grad: 0.0,
 label: weight | data: 0.004256141629178423 | operation: None | grad: 0.0,
 label: weight | data: 0.8448368796571778 | operation: None | grad: 0.0,
 label: weight | data: 0.6856695584658127 | operation: None | grad: 0.0,
 label: bias | data: 0.0 | operation: None | grad: 0.0,
 label: weight | data: 0.28195287285629833 | operation: None | grad: 0.0,
 label: weight | data: 0.36625420192198876

## Training a Neural Network

In [186]:
# We need a dataset to train our neural network. Let's use tiny datasets and train a tiny neural network.
# We don't care if the neural network overfits the data. We just want to make sure that the neural network
# is able to learn the patterns in the data.
#
# About the dataset:
# Let's have 4 data points. Each data point will have 3 features and 1 target.
# 
# About the data generation process: we want to generate a very simple dataset which can be easily 
# learned by a neural network. We can use different kinds of datasets. Let's generate multiple datasets 
# using different functions and see if the neural network can learn the function that generated the 
# data.

In [187]:
# We will use a neural network with 2 hidden layers of 4 neurons each, followed by an output layer with 1 neuron.
# The neurons will use the tanh activation function. We will use mean squared error as the loss function.
# First, let's create the training loop and use it with all the datasets.

In [188]:
def mean_squared_error(predictions: List[Value], targets: List[int]) -> Value:
    """Returns the mean of the squared differences between the predictions and the targets.

    Args:
        predictions (List[Value]): The outputs of the model, one per data point.
        targets (List[int]): The expected outputs, in the same order as the predictions.

    Returns:
        Value: The loss node. Calling 'backward' on it back propagates through all the predictions.

    Raises:
        ValueError: If the number of predictions differs from the number of targets. Without 
                    this check the extra entries would be dropped silently while the sum is 
                    still divided by len(targets), which gives a wrong mean.
        ZeroDivisionError: If both the lists are empty.
    """
    if len(predictions) != len(targets):
        raise ValueError(f"Expected the same number of predictions and targets but got "
                         f"{len(predictions)} predictions and {len(targets)} targets.")
    loss = Value(0.0)
    for prediction, target in zip(predictions, targets):
        # Multiply the difference by itself instead of using the power operator. The power
        # operation is also differentiated with respect to its exponent, which involves
        # log(base) and is not defined when the difference is zero or negative.
        # Full explanation can be found in the next cell.
        # The difference is computed once and reused so that the expression graph holds one
        # subtraction node per data point instead of two.
        difference = prediction - target
        loss += difference * difference
    return loss / len(targets)

In [189]:
# During the calculation of mean squared error, I noticed an interesting issue that we might face when
# using exponentiation (pow) in gradient calculation. There are a few issues with this operation.
# I will highlight two of them:
#
# 1) a^x is not always a real number. For example, (-4)^0.5 is not a real number. This will result in
#    a complex number in the model calculation which is not desired and can lead to unexpected results.
#    Moreover, it can be extremely difficult to debug such issues.
# 2) The gradient of a^x with respect to x is 'a^x * ln(a)'. Such derivatives will come up in the gradient
#    calculation when we try to compute the partial derivatives with respect to the learnable parameters.
#    We assume a is constant and x is the variable. However, as you see, the derivative is 'a^x * ln(a)'.
#    This means it is not defined (at least not in real numbers) when a is zero or negative. This can lead
#    to issues in the model training.
#
# Hence, it is good to be cautious when using the power operation in the neural network models.

In [190]:
def zero_gradients(model: MultiLayerPerceptron):
    """Resets the gradient of every learnable parameter of the model to 0.0.

    Only 'model.get_parameters()' is used, so any object exposing that method works.
    """
    learnable_parameters = model.get_parameters()
    for learnable in learnable_parameters:
        learnable.grad = 0.0
    
def update_parameters(model: MultiLayerPerceptron, learning_rate: float):
    """Performs one gradient descent step on every learnable parameter of the model.

    Each parameter is moved against its gradient by 'learning_rate * grad'. Only 
    'model.get_parameters()' is used, so any object exposing that method works.
    """
    learnable_parameters = model.get_parameters()
    for learnable in learnable_parameters:
        step = learning_rate * learnable.grad
        learnable.data -= step

In [191]:
def train_model(model: MultiLayerPerceptron, inputs: List[List[float]], targets: List[int], num_epochs: int, learning_rate: float):
    """Trains the model with full-batch gradient descent and prints the progress of every epoch.

    Args:
        model (MultiLayerPerceptron): The model to train. Only its first output is used as the prediction.
        inputs (List[List[float]]): The data points, one list of features per data point.
        targets (List[int]): The expected output for each data point.
        num_epochs (int): Number of passes over the whole dataset.
        learning_rate (float): Step size used for the parameter update.
    """
    separator = "-" * 150
    for epoch in range(num_epochs):
        print(f"epoch: {epoch}")
        # Forward pass over the whole dataset; the model returns a list and we keep its first output.
        predictions = [model(data_point)[0] for data_point in inputs]
        print("predictions:")
        for prediction in predictions:
            print(prediction)
        epoch_loss = mean_squared_error(predictions, targets)

        # Parameter gradients accumulate across backward calls, so they have to be reset 
        # before back propagating the loss of this epoch.
        zero_gradients(model=model)
        epoch_loss.backward()
        update_parameters(model=model, learning_rate=learning_rate)
        print(f"epoch_loss: {epoch_loss.data}")
        print(separator)

### Even-Odd function

In [192]:
# Network for the even-odd dataset: 3 inputs -> 4 neurons -> 4 neurons -> 1 output neuron.
# The parameters are listed to show the initial (random) state before training.
neural_network1 = MultiLayerPerceptron(num_inputs=3, num_neurons_per_layer=[4, 4, 1])
neural_network1.get_parameters()

[label: weight | data: 0.9166153880414356 | operation: None | grad: 0.0,
 label: weight | data: 0.44800305292484743 | operation: None | grad: 0.0,
 label: weight | data: 0.33701992037806805 | operation: None | grad: 0.0,
 label: bias | data: 0.0 | operation: None | grad: 0.0,
 label: weight | data: 0.7751071178425 | operation: None | grad: 0.0,
 label: weight | data: 0.7831780403897577 | operation: None | grad: 0.0,
 label: weight | data: 0.42712410134571444 | operation: None | grad: 0.0,
 label: bias | data: 0.0 | operation: None | grad: 0.0,
 label: weight | data: 0.4614718421959383 | operation: None | grad: 0.0,
 label: weight | data: 0.49435899515868453 | operation: None | grad: 0.0,
 label: weight | data: 0.763503834968903 | operation: None | grad: 0.0,
 label: bias | data: 0.0 | operation: None | grad: 0.0,
 label: weight | data: 0.1399716942300222 | operation: None | grad: 0.0,
 label: weight | data: 0.5280850057552526 | operation: None | grad: 0.0,
 label: weight | data: 0.1701

In [193]:
# Let's use a function that predicts 1 if the sum of the features is even and -1 if the sum of the features is odd.
# Let's see if the neural network can learn this function.

In [194]:
# Even-odd dataset: 4 data points with 3 features each; the target is 1 for an even sum and -1 for an odd sum.
# data_point1 = [1, 1, 1]  => sum = 3 (odd)  => target = -1
# data_point2 = [0, 2, 4]  => sum = 6 (even) => target = 1
# data_point3 = [-2, 4, 2] => sum = 4 (even) => target = 1
# data_point4 = [3, 1, 3]  => sum = 7 (odd)  => target = -1
inputs1 = [[1, 1, 1], [0, 2, 4], [-2, 4, 2], [3, 1, 3]]
targets1 = [-1, 1, 1, -1]

In [195]:
# Predictions of the untrained network. The model returns a list with one Value per output neuron,
# so [0] picks the single output.
initial_predictions1 = [neural_network1(input)[0] for input in inputs1]
initial_predictions1

[label:  | data: 0.9725899316515488 | operation: tanh | grad: 0.0,
 label:  | data: 0.9753202140383759 | operation: tanh | grad: 0.0,
 label:  | data: 0.9663047170258061 | operation: tanh | grad: 0.0,
 label:  | data: 0.9755487777739069 | operation: tanh | grad: 0.0]

In [196]:
# Note that the predictions at the end of 500 epochs are almost the same as the targets. This means that the neural
# network has learned the function that generated the data. This is a very simple function but it is good to see that
# the neural network is able to learn this function.
train_model(model=neural_network1, inputs=inputs1, targets=targets1, num_epochs=500, learning_rate=0.1)

epoch: 0
predictions:
label:  | data: 0.9725899316515488 | operation: tanh | grad: 0.0
label:  | data: 0.9753202140383759 | operation: tanh | grad: 0.0
label:  | data: 0.9663047170258061 | operation: tanh | grad: 0.0
label:  | data: 0.9755487777739069 | operation: tanh | grad: 0.0
epoch_loss: 1.9489121189367156
------------------------------------------------------------------------------------------------------------------------------------------------------
epoch: 1
predictions:
label:  | data: 0.9697381733178587 | operation: tanh | grad: 0.0
label:  | data: 0.9727190984979154 | operation: tanh | grad: 0.0
label:  | data: 0.9627824261660209 | operation: tanh | grad: 0.0
label:  | data: 0.9729767518969389 | operation: tanh | grad: 0.0
epoch_loss: 1.943658782585056
------------------------------------------------------------------------------------------------------------------------------------------------------
epoch: 2
predictions:
label:  | data: 0.9662498973971859 | operation: tan

#### Even-Odd function using a Perceptron

In [215]:
# A single neuron with 3 inputs (3 weights + 1 bias) to compare against the multi-layer network.
perceptron = Perceptron(num_inputs=3)

In [216]:
# data_point1 = [1, 1, 1]  => sum = 3 (odd)  => target = -1
# data_point2 = [0, 2, 4]  => sum = 6 (even) => target = 1
# data_point3 = [-2, 4, 2] => sum = 4 (even) => target = 1
# data_point4 = [3, 1, 5]  => sum = 9 (odd)  => target = -1
# NOTE(review): the last data point here is [3, 1, 5] whereas inputs1 uses [3, 1, 3]. The target is the
# same (-1) but the two models are not trained on identical data - confirm whether this is intended.
inputs1_copy = [[1, 1, 1], [0, 2, 4], [-2, 4, 2], [3, 1, 5]]
targets1_copy = [-1, 1, 1, -1]

In [217]:
# Predictions of the untrained perceptron. A Perceptron returns a single Value (not a list), so no [0] is needed.
initial_predictions1_copy = [perceptron(input) for input in inputs1_copy]
initial_predictions1_copy

[label:  | data: 0.8560599376576918 | operation: tanh | grad: 0.0,
 label:  | data: 0.9929215849128291 | operation: tanh | grad: 0.0,
 label:  | data: 0.9991924732353679 | operation: tanh | grad: 0.0,
 label:  | data: 0.9869412952746284 | operation: tanh | grad: 0.0]

In [None]:
# The perceptron also performs well on this simple dataset. The predictions are very close to the targets.
# However, the previous model was slightly better than the perceptron model. This is expected as the previous
# model was a neural network with 2 hidden layers. The perceptron model is a single neuron model.
#
# This loop repeats the steps of 'train_model'. It is written out here because a Perceptron returns a single
# Value, whereas 'train_model' indexes the model output with [0]. 'zero_gradients' and 'update_parameters'
# only call 'get_parameters()', so they also work with a Perceptron.
num_epochs = 500
for epoch in range(num_epochs):
    print(f"epoch: {epoch}")
    predictions = []
    for input in inputs1_copy:
        predictions.append(perceptron(input))
    print("predictions:")
    for prediction in predictions:
        print(prediction)
    epoch_loss = mean_squared_error(predictions, targets1_copy)
    
    # The gradients should always be zeroed out before each backpropagation step.
    # Otherwise, the gradients from the previous epoch will accumulate.
    zero_gradients(model=perceptron)
    epoch_loss.backward()
    update_parameters(model=perceptron, learning_rate=0.1)
    print(f"epoch_loss: {epoch_loss.data}")
    print("-" * 150)

epoch: 0
predictions:
label:  | data: 0.8560599376576918 | operation: tanh | grad: 0.0
label:  | data: 0.9929215849128291 | operation: tanh | grad: 0.0
label:  | data: 0.9991924732353679 | operation: tanh | grad: 0.0
label:  | data: 0.9869412952746284 | operation: tanh | grad: 0.0
epoch_loss: 1.8482362397762788
------------------------------------------------------------------------------------------------------------------------------------------------------
epoch: 1
predictions:
label:  | data: 0.8189074109515043 | operation: tanh | grad: 0.0
label:  | data: 0.9887493488787276 | operation: tanh | grad: 0.0
label:  | data: 0.9989164496204915 | operation: tanh | grad: 0.0
label:  | data: 0.9743553517948288 | operation: tanh | grad: 0.0
epoch_loss: 1.8016577440018162
------------------------------------------------------------------------------------------------------------------------------------------------------
epoch: 2
predictions:
label:  | data: 0.7546939057774749 | operation: ta

Now, let's try different functions of varying complexity and see how the same Neural Network architecture performs.

### Simple Function

$$f(x, y, z) = x + y - z$$

In [197]:
# Same architecture as before (3 inputs -> 4 -> 4 -> 1), with freshly initialized parameters.
neural_network2 = MultiLayerPerceptron(num_inputs=3, num_neurons_per_layer=[4, 4, 1])

In [198]:
# Dataset for f(x, y, z) = x + y - z.
# data_point1 = [1, 1, 1]  => x + y - z = 1 + 1 - 1  => target = 1
# data_point2 = [0, 2, 4]  => x + y - z = 0 + 2 - 4  => target = -2
# data_point3 = [-2, 4, 2] => x + y - z = -2 + 4 - 2 => target = 0
# data_point4 = [3, 1, 5]  => x + y - z = 3 + 1 - 5  => target = -1
inputs2 = [[1, 1, 1], [0, 2, 4], [-2, 4, 2], [3, 1, 5]]
targets2 = [1, -2, 0, -1]

In [199]:
# Predictions of the untrained network; [0] picks the single output of the model.
initial_predictions2 = [neural_network2(input)[0] for input in inputs2]
initial_predictions2

[label:  | data: 0.9678275656684154 | operation: tanh | grad: 0.0,
 label:  | data: 0.9721701909349519 | operation: tanh | grad: 0.0,
 label:  | data: 0.9715426531056797 | operation: tanh | grad: 0.0,
 label:  | data: 0.9722274839234522 | operation: tanh | grad: 0.0]

In [200]:
# Note that the loss does not decrease beyond 0.25 in this case. The neural network is able to learn the function
# to some extent but it is not able to learn the function completely. There could be multiple reasons for this:
#
# 1) This function is not as simple as the previous function. The neural network might need more neurons or more
#    layers to learn this function.
# 2) The data is not enough. The neural network might need more data to learn this particular function.
# 3) The output neuron applies tanh, so every prediction lies in [-1, 1], but one of the targets in targets2 
#    is -2. The closest possible prediction (-1) still leaves a squared error of 1 on that data point, so the 
#    mean over the 4 data points is at least 1 / 4 = 0.25. This alone explains the 0.25 floor; more neurons, 
#    layers or data cannot remove it as long as the output passes through tanh.
train_model(model=neural_network2, inputs=inputs2, targets=targets2, num_epochs=500, learning_rate=0.1)

epoch: 0
predictions:
label:  | data: 0.9678275656684154 | operation: tanh | grad: 0.0
label:  | data: 0.9721701909349519 | operation: tanh | grad: 0.0
label:  | data: 0.9715426531056797 | operation: tanh | grad: 0.0
label:  | data: 0.9722274839234522 | operation: tanh | grad: 0.0
epoch_loss: 3.417101771139946
------------------------------------------------------------------------------------------------------------------------------------------------------
epoch: 1
predictions:
label:  | data: 0.962736033935928 | operation: tanh | grad: 0.0
label:  | data: 0.967703351465742 | operation: tanh | grad: 0.0
label:  | data: 0.9669840552610888 | operation: tanh | grad: 0.0
label:  | data: 0.9677684418399695 | operation: tanh | grad: 0.0
epoch_loss: 3.403955647324576
------------------------------------------------------------------------------------------------------------------------------------------------------
epoch: 2
predictions:
label:  | data: 0.95589358693808 | operation: tanh | g

### Slightly Complex Function

$$f(x, y, z) = (x^2 + y) * (z + 1)$$

In [201]:
# Same architecture as before (3 inputs -> 4 -> 4 -> 1), with freshly initialized parameters.
neural_network3 = MultiLayerPerceptron(num_inputs=3, num_neurons_per_layer=[4, 4, 1])

In [202]:
# Dataset for f(x, y, z) = (x^2 + y) * (z + 1).
# data_point1 = [1, 1, 1]  => (x^2 + y) * (z + 1) = (1 + 1) * (1 + 1) => target = 4
# data_point2 = [0, 2, 4]  => (x^2 + y) * (z + 1) = (0 + 2) * (4 + 1) => target = 10
# data_point3 = [2, -4, 2] => (x^2 + y) * (z + 1) = (4 - 4) * (2 + 1) => target = 0
# data_point4 = [3, 1, 2]  => (x^2 + y) * (z + 1) = (9 + 1) * (2 + 1) => target = 30
inputs3 = [[1, 1, 1], [0, 2, 4], [2, -4, 2], [3, 1, 2]]
targets3 = [4, 10, 0, 30]

In [203]:
# Predictions of the untrained network; [0] picks the single output of the model.
initial_predictions3 = [neural_network3(input)[0] for input in inputs3]
initial_predictions3

[label:  | data: 0.9628255159712698 | operation: tanh | grad: 0.0,
 label:  | data: 0.9684080085104759 | operation: tanh | grad: 0.0,
 label:  | data: -0.9501289670734312 | operation: tanh | grad: 0.0,
 label:  | data: 0.9666568915540642 | operation: tanh | grad: 0.0]

In [204]:
# Notice that the neural network is not able to learn the function at all. This function might be way too
# complex for this neural network to learn at all.
# Note also that the output neuron applies tanh, so the predictions are limited to [-1, 1], while targets3 
# contains 4, 10 and 30. Even with the best possible predictions (1, 1, 0, 1) the mean squared error is 
# (3^2 + 9^2 + 0^2 + 29^2) / 4 = 232.75, so the loss cannot drop below that value with this architecture.
train_model(model=neural_network3, inputs=inputs3, targets=targets3, num_epochs=100, learning_rate=1)

epoch: 0
predictions:
label:  | data: 0.9628255159712698 | operation: tanh | grad: 0.0
label:  | data: 0.9684080085104759 | operation: tanh | grad: 0.0
label:  | data: -0.9501289670734312 | operation: tanh | grad: 0.0
label:  | data: 0.9666568915540642 | operation: tanh | grad: 0.0
epoch_loss: 233.65795996349752
------------------------------------------------------------------------------------------------------------------------------------------------------
epoch: 1
predictions:
label:  | data: 0.9999998874667465 | operation: tanh | grad: 0.0
label:  | data: 0.9999998960044133 | operation: tanh | grad: 0.0
label:  | data: -0.9995868347154776 | operation: tanh | grad: 0.0
label:  | data: 0.9999998953813926 | operation: tanh | grad: 0.0
epoch_loss: 232.99979561378396
------------------------------------------------------------------------------------------------------------------------------------------------------
epoch: 2
predictions:
label:  | data: 0.9999998872690272 | operation: 