In [None]:
import torch
import torch.nn as nn

## Model: PyTorch MLP

In [None]:
class MLP(nn.Module):
    """Configurable multi-layer perceptron.

    Each hidden block is ``Linear -> [BatchNorm1d | LayerNorm] -> activation
    -> [Dropout]``; a final ``Linear`` layer maps the last hidden width to
    ``output_size``.

    Args:
        input_size: Number of input features.
        hidden_layers: Width of each hidden layer, in order. ``None`` (the
            default) means ``[128, 64]``.
        output_size: Number of output features.
        activation: One of ``"relu"``, ``"tanh"``, ``"leakyrelu"``, ``"gelu"``
            (case-insensitive). Any other name falls back to ReLU.
        dropout: Dropout probability; a ``Dropout`` layer is added to each
            block only when this is greater than 0.
        normalization: ``"batch"`` for ``BatchNorm1d``, ``"layer"`` for
            ``LayerNorm``; any other value (including ``None``) adds no
            normalization layer.
        skip_connections: When True, a block's output is summed with the
            previous block's output whenever both have the same shape.
    """

    def __init__(self,
                 input_size: int,
                 hidden_layers: list = None,
                 output_size: int = 2,
                 activation: str = "relu",
                 dropout: float = 0.0,
                 normalization: str = None,   # "batch", "layer" or None
                 skip_connections: bool = False):
        super().__init__()

        # Resolve the default inside the call and copy the caller's sequence:
        # a list literal in the signature is shared by every instance, so
        # mutating ``hidden_dims`` on one model would silently change the
        # default architecture of all models created afterwards.
        if hidden_layers is None:
            hidden_layers = [128, 64]
        else:
            hidden_layers = list(hidden_layers)

        # Activation selection (unknown names fall back to ReLU)
        activations = {
            "relu": nn.ReLU,
            "tanh": nn.Tanh,
            "leakyrelu": nn.LeakyReLU,
            "gelu": nn.GELU
        }
        act_fn = activations.get(activation.lower(), nn.ReLU)

        layers = []
        in_features = input_size
        self.skip_connections = skip_connections
        self.hidden_dims = hidden_layers

        # Build the hidden blocks dynamically
        for hidden_dim in hidden_layers:
            block = [nn.Linear(in_features, hidden_dim)]

            # Optional normalization, placed between the linear layer and
            # the activation
            if normalization == "batch":
                block.append(nn.BatchNorm1d(hidden_dim))
            elif normalization == "layer":
                block.append(nn.LayerNorm(hidden_dim))

            block.append(act_fn())

            if dropout > 0:
                block.append(nn.Dropout(dropout))

            layers.append(nn.Sequential(*block))
            in_features = hidden_dim

        self.hidden_blocks = nn.ModuleList(layers)
        self.output_layer = nn.Linear(in_features, output_size)

    def forward(self, x):
        """Run ``x`` through every hidden block and the output layer.

        With ``skip_connections`` enabled, the output of a block is added to
        the output of the previous block when their shapes are equal; the
        first block never receives a skip connection.
        """
        out = x
        prev_out = None

        for block in self.hidden_blocks:
            new_out = block(out)
            if self.skip_connections and prev_out is not None and new_out.shape == prev_out.shape:
                out = new_out + prev_out  # skip connection
            else:
                out = new_out
            prev_out = out

        return self.output_layer(out)


# Model: NumPy Linear layer and sequential container

In [None]:
#------------------------------------------------------------------------------
#   Linear layer (Dense, Fully connected, Single Layer Perceptron)
#------------------------------------------------------------------------------
class Linear(Module):
    """Affine layer computing ``W @ input + b`` with NumPy arrays.

    ``W`` has shape ``(out_features, in_features)`` and is drawn from a
    standard normal distribution; ``b`` has shape ``(out_features, 1)`` and
    starts at zero. ``dW`` / ``db`` hold the most recent gradients and always
    share the shape of ``W`` / ``b``.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        weight_shape = (out_features, in_features)
        self.W = np.random.randn(*weight_shape)
        self.dW = np.zeros_like(self.W)  # gradient buffer, same shape as W
        self.b = np.zeros((out_features, 1))
        self.db = np.zeros_like(self.b)  # gradient buffer, same shape as b

    def forward(self, input: np.ndarray) -> np.ndarray:
        """Cache ``input`` for the backward pass and return ``W @ input + b``.

        The size of axis 1 of ``input`` is remembered as ``self.m`` and used
        to scale the gradients (presumably the number of samples — confirm
        against the caller).
        """
        self.fw_inputs = input
        self.m = self.fw_inputs.shape[1]
        return self.W @ input + self.b

    def backward(self, dz: np.ndarray) -> np.ndarray:
        """Store the gradients for ``W`` and ``b`` and return ``W.T @ dz``."""
        scale = 1.0 / self.m
        self.dW = scale * (dz @ self.fw_inputs.T)
        self.db = scale * dz.sum(axis=1, keepdims=True)
        return self.W.T @ dz

    def get_optimizer_context(self):
        """Return ``[[W, dW], [b, db]]`` for consumption by an optimizer."""
        weight_pair = [self.W, self.dW]
        bias_pair = [self.b, self.db]
        return [weight_pair, bias_pair]

    def set_optimizer_context(self, params):
        """Replace ``W`` and ``b`` with the two entries of ``params``."""
        new_weights, new_bias = params
        self.W = new_weights
        self.b = new_bias

#------------------------------------------------------------------------------
#   Model class
#------------------------------------------------------------------------------
class Model(Module):
    """Sequential container that chains the entries of ``self.modules``.

    ``self.modules`` is not defined in this class; it is read as a mapping
    of name -> module (presumably maintained by the ``Module`` base class —
    confirm there).
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, input) -> np.ndarray:
        """Feed ``input`` through every module in iteration order."""
        for _name, module in self.modules.items():
            input = module(input)
        return input

    def backward(self, dz: np.ndarray) -> np.ndarray:
        """Back-propagate ``dz`` through the modules in reverse order.

        Returns the gradient produced by the first module's ``backward``
        (the gradient with respect to the model input), so callers can
        inspect it; callers that ignore the return value are unaffected.
        """
        # Materialise the items before reversing: dict views only became
        # reversible in Python 3.8, so reversed(d.items()) raises TypeError
        # on older interpreters when ``modules`` is a plain dict.
        for _name, module in reversed(list(self.modules.items())):
            dz = module.backward(dz)
        return dz

# Activation Functions

In [None]:
#------------------------------------------------------------------------------
#   SigmoidActivationFunction class
#------------------------------------------------------------------------------
class Sigmoid(Module):
    """Element-wise logistic sigmoid, ``1 / (1 + exp(-x))``."""

    def __init__(self):
        super().__init__()

    def forward(self, input: np.ndarray) -> np.ndarray:
        """Cache ``input`` for the backward pass and return its sigmoid."""
        self.fw_input = input
        denominator = 1.0 + np.exp(-input)
        return 1.0 / denominator

    def backward(self, da) -> np.ndarray:
        """Return ``da`` multiplied element-wise by ``a * (1 - a)``.

        ``a`` is obtained by calling the module again on the cached forward
        input.
        """
        activation = self(self.fw_input)
        local_gradient = activation * (1 - activation)
        return da * local_gradient

#------------------------------------------------------------------------------
#   HyperbolicTangentActivationFunction class
#------------------------------------------------------------------------------
class Tanh(Module):
    """Element-wise hyperbolic tangent activation."""

    def __init__(self):
        super(Tanh, self).__init__()

    def forward(self, input: np.ndarray) -> np.ndarray:
        """Cache ``input`` for the backward pass and return ``tanh(input)``.

        ``np.tanh`` is used instead of the explicit
        ``(exp(2x) - 1) / (exp(2x) + 1)`` formula: for large positive inputs
        ``exp(2x)`` overflows to ``inf`` and the quotient ``inf / inf``
        evaluates to ``nan`` instead of 1.
        """
        self.fw_input = input
        return np.tanh(input)

    def backward(self, da) -> np.ndarray:
        """Return ``da`` multiplied element-wise by ``1 - tanh(x)**2``.

        The activation is obtained by calling the module again on the cached
        forward input.
        """
        a = self(self.fw_input)
        return np.multiply(da, 1 - np.square(a))

# Loss Functions

In [None]:
#------------------------------------------------------------------------------
#   MeanSquareErrorLossFunction class
#------------------------------------------------------------------------------
class MSELoss(Module):
    """Squared-error loss with a selectable reduction.

    ``reduce`` may be ``"mean"`` (``np.mean``), ``"sum"`` (``np.sum``) or
    ``None`` (element-wise losses are returned unreduced). Any other value
    raises ``AttributeError``.
    """

    def __init__(self, reduce="mean"):
        super().__init__()
        if reduce == "mean":
            reducer = np.mean
        elif reduce == "sum":
            reducer = np.sum
        elif reduce is None:
            # No reduction requested: hand the element-wise losses back as-is.
            def reducer(x):
                return x
        else:
            raise AttributeError
        self.reduce_fn = reducer

    def forward(self, input: np.ndarray, target: np.ndarray) -> np.ndarray:
        """Return the reduced value of ``(target - input) ** 2``."""
        squared_error = np.power(target - input, 2)
        return self.reduce_fn(squared_error)

    def backward(self, input: np.ndarray, target: np.ndarray) -> np.ndarray:
        """Return ``-2 * (target - input)``, independent of the reduction."""
        residual = target - input
        return -2 * residual


#------------------------------------------------------------------------------
#   BinaryCrossEntropyLossFunction class
#------------------------------------------------------------------------------
class BCELoss(Module):
    """Binary cross-entropy loss with a selectable reduction.

    Args:
        reduce: ``"mean"`` (``np.mean``), ``"sum"`` (``np.sum``) or ``None``
            (element-wise losses are returned unreduced). Any other value
            raises ``AttributeError``.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` before taking
            logarithms or dividing. Without the clip, a prediction of
            exactly 0 or 1 produces ``log(0) = -inf`` (and ``0 * -inf = nan``)
            in ``forward`` and a division by zero in ``backward``. The
            default is small enough to leave ordinary predictions untouched
            while ``1 - eps`` is still distinguishable from 1 in float32.
    """

    def __init__(self, reduce="mean", eps=1e-7):
        super(BCELoss, self).__init__()
        if reduce == "mean":
            self.reduce_fn = np.mean
        elif reduce == "sum":
            self.reduce_fn = np.sum
        elif reduce is None:
            # return identity (do nothing)
            self.reduce_fn = lambda x : x
        else:
            raise AttributeError
        self.eps = eps

    def _clip_probabilities(self, input):
        """Keep predictions strictly inside (0, 1) so log/divide stay finite."""
        return np.clip(input, self.eps, 1.0 - self.eps)

    def forward(self, input: np.ndarray, target: np.ndarray) -> np.ndarray:
        """Return the reduced ``-(t * log(p) + (1 - t) * log(1 - p))``."""
        p = self._clip_probabilities(input)
        return self.reduce_fn(-(target * np.log(p) + np.multiply((1 - target), np.log(1 - p))))

    def backward(self, input: np.ndarray, target: np.ndarray) -> np.ndarray:
        """Return ``-t / p + (1 - t) / (1 - p)`` using the clipped ``p``."""
        p = self._clip_probabilities(input)
        return -np.divide(target, p) + np.divide(1 - target, 1 - p)

# Optimizers

In [None]:
#------------------------------------------------------------------------------
#   SGDMomentumOptimizer class
#------------------------------------------------------------------------------
class SGDMomentum(Optimizer):
    """Gradient descent on an exponential moving average of the gradients.

    ``lr`` is the step size and ``beta`` the decay rate of the moving
    average. Per-layer averages are kept in ``self.context`` keyed by the
    layer's name in ``model.modules``.
    """

    def __init__(self, lr, beta):
        super().__init__()
        self.context = {}
        # >>>> start_solution
        self.lr = lr
        self.beta = beta
        # <<<< end_solution

    def step(self, model):
        """Update every layer of ``model`` that exposes an optimizer context.

        Layers without ``get_optimizer_context``, or whose context is
        ``None``, are skipped.
        """
        for name, layer in model.modules.items():
            if not hasattr(layer, 'get_optimizer_context'):
                continue
            params = layer.get_optimizer_context()
            if params is None:
                continue
            (W, dW), (b, db) = params

            # >>>> start_solution

            # First visit of this layer: averages start at scalar zero and
            # take the gradients' shape through broadcasting.
            state = self.context.setdefault(name, {'V': [0, 0]})
            prev_VdW, prev_Vdb = state['V']

            # Exponential moving averages of the gradients
            VdW = (1 - self.beta)*dW + self.beta*prev_VdW
            Vdb = (1 - self.beta)*db + self.beta*prev_Vdb
            state['V'] = [VdW, Vdb]

            # Step along the averaged gradients
            new_W = W - self.lr * VdW
            new_b = b - self.lr * Vdb

            # <<<< end_solution
            layer.set_optimizer_context([new_W, new_b])

In [None]:
#------------------------------------------------------------------------------
#   RMSpropOptimizer class
#------------------------------------------------------------------------------
class RMSprop(Optimizer):
    """RMSprop: scale each gradient by a running RMS of its recent values.

    Args:
        lr: Step size.
        beta: Decay rate of the moving average of squared gradients.
        epsilon: Small constant added to the denominator. Without it, any
            parameter whose gradient has been exactly zero so far is updated
            with ``0 / 0 = nan``, which permanently corrupts the weights.

    Per-layer averages are kept in ``self.context`` keyed by the layer's name
    in ``model.modules``.
    """

    def __init__(self, lr, beta, epsilon=1e-8):
        super(RMSprop, self).__init__()
        self.context = {}
        # >>>> start_solution
        self.lr = lr
        self.beta = beta
        self.epsilon = epsilon
        # <<<< end_solution

    def step(self, model):
        """Update every layer of ``model`` that exposes an optimizer context.

        Layers without ``get_optimizer_context``, or whose context is
        ``None``, are skipped.
        """
        for name, layer in model.modules.items():
            if not hasattr(layer, 'get_optimizer_context'):
                continue
            params = layer.get_optimizer_context()
            if params is None:
                continue
            [[W, dW], [b, db]] = params

            # >>>> start_solution

            # First visit of this layer: averages start at scalar zero and
            # take the gradients' shape through broadcasting.
            if name not in self.context:
                self.context[name] = {'S': [0, 0]}

            SdW, Sdb = self.context[name]['S']

            # Exponential moving averages of the squared gradients
            SdW = (1 - self.beta)*(dW**2) + self.beta*SdW
            Sdb = (1 - self.beta)*(db**2) + self.beta*Sdb
            self.context[name]['S'] = [SdW, Sdb]

            # epsilon keeps the division finite when the average is zero
            W = W - self.lr * (dW/(SdW**0.5 + self.epsilon))
            b = b - self.lr * (db/(Sdb**0.5 + self.epsilon))

            # <<<< end_solution
            layer.set_optimizer_context([W, b])


In [None]:
#------------------------------------------------------------------------------
#   AdamOptimizer class
#------------------------------------------------------------------------------
class Adam(Optimizer):
    """Adam: bias-corrected moving averages of gradients and squared gradients.

    Args:
        lr: Step size.
        betaV: Decay rate of the moving average of the gradients.
        betaS: Decay rate of the moving average of the squared gradients.
        epsilon: Small constant added to the denominator of the update to
            keep the division finite.

    ``self.t`` counts calls to ``step`` and drives the bias correction.
    Per-layer averages are kept in ``self.context`` keyed by the layer's name
    in ``model.modules``.
    """

    def __init__(self, lr, betaV, betaS, epsilon=1e-8):
        super(Adam, self).__init__()
        self.context = {}
        # >>>> start_solution
        self.lr = lr
        self.betaV = betaV
        self.betaS = betaS
        self.epsilon = epsilon
        self.t = 0
        # <<<< end_solution

    def step(self, model):
        """Update every layer of ``model`` that exposes an optimizer context.

        Layers without ``get_optimizer_context``, or whose context is
        ``None``, are skipped.
        """
        # The time step advances once per optimisation step, not once per
        # layer: incrementing it inside the loop would give each layer a
        # different (and too large) t, under-correcting the bias of the
        # moving averages for every layer after the first.
        self.t += 1
        v_correction = 1 - self.betaV**self.t
        s_correction = 1 - self.betaS**self.t

        for name, layer in model.modules.items():
            if not hasattr(layer, 'get_optimizer_context'):
                continue
            params = layer.get_optimizer_context()
            if params is None:
                continue
            [[W, dW], [b, db]] = params

            # >>>> start_solution

            # First visit of this layer: averages start at scalar zero and
            # take the gradients' shape through broadcasting.
            if name not in self.context:
                self.context[name] = {
                    'V': [0, 0],
                    'S': [0, 0]
                }

            VdW, Vdb = self.context[name]['V']
            SdW, Sdb = self.context[name]['S']

            # Exponential moving averages of gradients and squared gradients
            VdW = (1 - self.betaV)*dW + self.betaV*VdW
            Vdb = (1 - self.betaV)*db + self.betaV*Vdb
            SdW = (1 - self.betaS)*(dW**2) + self.betaS*SdW
            Sdb = (1 - self.betaS)*(db**2) + self.betaS*Sdb

            self.context[name]['V'] = [VdW, Vdb]
            self.context[name]['S'] = [SdW, Sdb]

            # Bias-corrected averages (the stored ones stay uncorrected)
            VdW_ = VdW/v_correction
            SdW_ = SdW/s_correction
            Vdb_ = Vdb/v_correction
            Sdb_ = Sdb/s_correction

            # Update parameters
            W = W - self.lr * (VdW_/(SdW_**0.5 + self.epsilon))
            b = b - self.lr * (Vdb_/(Sdb_**0.5 + self.epsilon))

            # <<<< end_solution
            layer.set_optimizer_context([W, b])
