In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## 1.	Apply the logistic regression method using the functions in the notebook «Logistic Regression as a Neural Network – BP alg.ipynb” to predict the biological response of a molecule 

> Data: bioresponse.csv
> Description from Kaggle: “The data is in the comma separated values (CSV) format. Each row in this data set represents a molecule. The first column contains experimental data describing a real biological response; the molecule was seen to elicit this response (1), or not (0). The remaining columns represent molecular descriptors (d1 through d1776), these are caclulated properties that can capture some of the characteristics of the molecule - for example size, shape, or elemental constitution. The descriptor matrix has been normalized.”).

Use 75% of the dataset to train the model, and the rest of the data to estimate its accuracy.

In [2]:
bio_df = pd.read_csv("/data/notebook_files/bioresponse.csv")
bio_df.columns

Index(['Activity', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9',
       ...
       'D1767', 'D1768', 'D1769', 'D1770', 'D1771', 'D1772', 'D1773', 'D1774',
       'D1775', 'D1776'],
      dtype='object', length=1777)

In [3]:
X = np.array(bio_df.drop(columns=["Activity"]))
y = np.array(bio_df["Activity"])

print(f"X shape:\t{X.shape}\nY shape:\t{y.shape}")

X shape:	(3751, 1776)
Y shape:	(3751,)


In [4]:
# Use 75% of the dataset to train the mode
calc_test_size = (100 - 75) / 100
calc_test_size

0.25

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=calc_test_size, 
    random_state=42
)

In [6]:
print(f"X_train shape:\t{X_train.shape}\nY_train shape:\t{y_train.shape}")
print(f"X_test shape:\t{X_test.shape}\nY_test shape:\t{y_test.shape}")

X_train shape:	(2813, 1776)
Y_train shape:	(2813,)
X_test shape:	(938, 1776)
Y_test shape:	(938,)


In [7]:
X_train = X_train.T
y_train = y_train.reshape((1, -1))

X_test = X_test.T
y_test = y_test.reshape((1, -1))

In [8]:
print(f"X_train shape:\t{X_train.shape}\nY_train shape:\t{y_train.shape}")
print(f"X_test shape:\t{X_test.shape}\nY_test shape:\t{y_test.shape}")

X_train shape:	(1776, 2813)
Y_train shape:	(1, 2813)
X_test shape:	(1776, 938)
Y_test shape:	(1, 938)


In [9]:
# Functions from Logistic Regression as a Neural Network – BP alg.ipynb
def sigmoid(z):
    s = 1./(1.+np.exp(-z))
    
    return s


def initialize_with_zeros(dim):
    w = np.zeros((dim,1))
    b = 0.
    
    return w, b


def propagate(w, b, X, Y):
    m = X.shape[1]
    #print('number of objects = ',len(X))
    
    # FORWARD PROPAGATION (FROM X TO COST)
    A = sigmoid(np.dot(w.T,X)+b )                                 # compute activation
    cost = -(1./m)*np.sum(Y*np.log(A)+(1-Y)*np.log(1-A),axis=1)   # compute cost
    
    # BACKWARD PROPAGATION (TO FIND GRAD)
    dw = (1./m)*np.dot(X,(A-Y).T)
    db = (1./m)*np.sum(A-Y,axis=1)

    grads = {"dw": dw,
             "db": db}
    
    return grads, cost


def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost = False):
    costs = []
    
    for i in range(num_iterations):
                
        # Cost and gradient calculation 
        grads, cost = propagate(w,b,X,Y)
        
        # Retrieve derivatives from grads
        dw = grads["dw"]
        db = grads["db"]
        
        # update rule
        w -=learning_rate*dw
        b -=learning_rate*db
        
        # Record the costs
        if i % 100 == 0:
            costs.append(cost[0])
        
        # Print the cost every 100 training iterations
        if print_cost and i % 100 == 0:
            print ("Cost after iteration %i: %f" %(i, cost[0]))
    
    params = {"w": w,
              "b": b}
    
    grads = {"dw": dw,
             "db": db}
    
    return params, grads, costs


def predict(w, b, X):
    m = X.shape[1]
    Y_prediction = np.zeros((1,m))
    w = w.reshape(X.shape[0], 1)
    
    # Compute vector "A" predicting the probabilities 
    A = sigmoid(np.dot(w.T,X)+b)
    
    for i in range(A.shape[1]):     
        # Convert probabilities A[0,i] to actual predictions p[0,i]
        if (A[0,i]<=0.5):
            Y_prediction[0][i]=0
        else:
            Y_prediction[0][i]=1
    
    return Y_prediction


def model(X_train, Y_train, X_test, Y_test, num_iterations = 2000, learning_rate = 0.5, print_cost = False, optimization_algorithm=optimize):   
    # initialize parameters with zeros 
    w, b = initialize_with_zeros(X_train.shape[0])

    # Gradient descent
    parameters, grads, costs = optimization_algorithm(w, b, X_train, Y_train, num_iterations, learning_rate, print_cost)
    
    # Retrieve parameters w and b from dictionary "parameters"
    w = parameters["w"]
    b = parameters["b"]
    
    # Predict test/train set examples
    Y_prediction_test = predict(w, b, X_test)
    Y_prediction_train = predict(w, b, X_train)

    # Print train/test Errors
    print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100))
    print("test accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100))

    d = {"costs": costs,
         "Y_prediction_test": Y_prediction_test, 
         "Y_prediction_train" : Y_prediction_train, 
         "w" : w, 
         "b" : b,
         "learning_rate" : learning_rate,
         "num_iterations": num_iterations}
    
    return d

In [10]:
# From the Lecture
w, b, X, Y = np.array([[1.],[-1.]]),\
             4.,\
             np.array([[1.,5.,-1.],[10.,0.,-3.2]]),\
             np.array([[0,1,1]])

grads, cost = propagate(w, b, X, Y)
print ("dw = " + str(grads["dw"]))
print ("db = " + str(grads["db"]))
print ("cost = " + str(cost))

dw = [[0.0027004 ]
 [0.02446984]]
db = [0.00151471]
cost = [0.00295537]


2.	Modify `optimize()` function to implement the stochastic gradient descent (SGD) method and Adam optimization method using the numpy library. Apply them to solve the problem from p.1.

### Stochastic Gradient Descent

Stochastic gradient descent (SGD) performs a parameters (weights) update for each training example 
$(i)$ and label $y^{(i)}$:

$θ_t := θ_{t − 1} − η∇_θ J (θ^{( i )}_{t − 1})$,

where $J(θ)$ - objective (loss) function, $θ$ - model's parameters, $η$ - learning rate.

In [17]:
def SGD(w, b, X, Y, epochs, learning_rate=1e-1, print_cost=True):
    costs = []
    batch_size = 64   
    # batch_size = int(Y.shape[1] / 10)

    for epoch in range(epochs):
        for start in range(0, X.shape[1], batch_size):
            stop = start + batch_size
            
            X_batch = X[:, start:stop]
            y_batch = Y[:, start:stop]
        
            # Cost and gradient calculation 
            grads, cost = propagate(w,b,X_batch,y_batch)
            
            # Retrieve derivatives from grads
            dw = grads["dw"]
            db = grads["db"]
            
            # update rule
            w -= learning_rate*dw
            b -= learning_rate*db
            
            # Record the costs
            if epoch % 100 == 0:
                costs.append(cost)
            
            # Print the cost every 100 training iterations
            if print_cost and epochs % 100 == 0:
                print ("Cost after epoch %i: %f" %(epoch, cost))
    
    params = {
        "w": w,
        "b": b,
    }
    
    grads = {
        "dw": dw,
        "db": db,
    }
    
    return params, grads, costs

In [24]:
params, grads, costs = SGD(w, b, X, Y,
    epochs=5000,
    batch_size=1,
)

print("w = " + str(params["w"]))
print("b = " + str(params["b"]))
print("dw = " + str(grads["dw"]))
print("db = " + str(grads["db"]))

Cost after epoch 0: 0.000213
Cost after epoch 0: 0.000213
Cost after epoch 0: 0.000213
Cost after epoch 1: 0.000213
Cost after epoch 1: 0.000213
Cost after epoch 1: 0.000213
Cost after epoch 2: 0.000213
Cost after epoch 2: 0.000213
Cost after epoch 2: 0.000213
Cost after epoch 3: 0.000213
Cost after epoch 3: 0.000212
Cost after epoch 3: 0.000212
Cost after epoch 4: 0.000212
Cost after epoch 4: 0.000212
Cost after epoch 4: 0.000212
Cost after epoch 5: 0.000212
Cost after epoch 5: 0.000212
Cost after epoch 5: 0.000212
Cost after epoch 6: 0.000212
Cost after epoch 6: 0.000212
Cost after epoch 6: 0.000212
Cost after epoch 7: 0.000212
Cost after epoch 7: 0.000212
Cost after epoch 7: 0.000212
Cost after epoch 8: 0.000212
Cost after epoch 8: 0.000212
Cost after epoch 8: 0.000212
Cost after epoch 9: 0.000212
Cost after epoch 9: 0.000212
Cost after epoch 9: 0.000212
Cost after epoch 10: 0.000212
Cost after epoch 10: 0.000212
Cost after epoch 10: 0.000211
Cost after epoch 11: 0.000211
Cost after

### Adam
Adam (Adaptive momentum algorithm) combines momentum and RMSprop approaches:
- computes adaptive learning rates;
- storing an exponentially moving average of past squared gradients
- and uses scaling of the gradients:
$V^{corr}_t = \dfrac{V_t}{1 − β^t}$

Parameters:
- $η$ - learning rate 
- β_1 = 0.9
- β_2 = 0.999
- ϵ = 10−8

The moving averages of past and past squared gradients $V_t$ and $S_t$ respectively are computed as follows:

$V_t = β_1 V_{t − 1} + ( 1 − β_1 ) g_t \\ S_t = β_2 S_{t − 1} + ( 1 − β_2 ) g^2_t$

Adam update rule for the model weights:

$θ_t + 1 = θ_t − \dfrac{η}{\sqrt{S^{corr}_t}+ ϵ}  V_t$

> Ref: [Адам - ​​последние тенденции в глубокой оптимизации обучения.
](https://machinelearningmastery.ru/adam-latest-trends-in-deep-learning-optimization-6be9a291375c/)
>
> $m_t = β_1 m_{t − 1} + (1 − β_1) g_t \\ v_t = β_2 v_{t − 1} + ( 1 − β_2 ) g^2_t$
> The shift corrected the estimates for the first and second moments.
>
> $\hat{V_t}=\dfrac{V_t}{1-β_1^t} \\ \hat{S_t}=\dfrac{S_t}{1-β_2^t}$

In [39]:
def ADAM(w, b, X, Y, epochs, learning_rate=1e-1, print_cost=False, ksi=1e-8, beta_1=0.9, beta_2=0.999):
    # print(f"ADAM curr LR", learning_rate)
    costs = []
    V_dw, V_db, S_dw, S_db = 0, 0, 0, 0
    # Avoiding division by zero
    for epoch in range(1, epochs + 1):
        # Cost and gradient calculation 
        grads, cost = propagate(w, b, X, Y)
        
        # Retrieve derivatives from grads
        dw = grads["dw"]
        db = grads["db"]
        
        # V_dw, V_db, S_dw, S_db = 0, 0, 0, 0
        
        # Formula from the 5 Lecture
        V_dw = beta_1 * V_dw + (1 - beta_1) * dw 
        V_db = beta_1 * V_db + (1 - beta_1) * db 
        
        S_dw = beta_2 * S_dw + (1 - beta_2) * np.power(dw, 2)
        S_db = beta_2 * S_db + (1 - beta_2) * np.power(db, 2)
        # + ksi also 4 Avoiding division by zero
        V_dw_hat = V_dw / (1 - (beta_1 ** epoch))
        V_db_hat = V_db / (1 - (beta_1 ** epoch))
        S_dw_hat = S_dw / (1 - (beta_2 ** epoch))
        S_db_hat = S_db / (1 - (beta_2 ** epoch))

        # Update rule from Lecture and MachineLearningMastery
        w -= learning_rate * (V_dw_hat / (np.sqrt(S_dw_hat) + ksi))
        b -= learning_rate * (V_db_hat / (np.sqrt(S_db_hat) + ksi))
 
        # Record the costs
        if epoch % 100 == 0:
            costs.append(cost)
        
        # Print the cost every 100 training iterations
        if print_cost and epoch % 100 == 0:
            print ("Cost after iteration %i: %f" %(epoch, cost))
    
    params = {
        "w": w,
        "b": b,
    }
    
    grads = {
        "dw": dw,
        "db": db,
    }
    
    return params, grads, costs

In [12]:
params, grads, costs = ADAM(w, b, X, Y,
    epochs=5000,
)

print("w = " + str(params["w"]))
print("b = " + str(params["b"]))
print("dw = " + str(grads["dw"]))
print("db = " + str(grads["db"]))

Cost after iteration 100: 1.027078
Cost after iteration 200: 1.025659
Cost after iteration 300: 1.025285
Cost after iteration 400: 1.025122
Cost after iteration 500: 1.025033
Cost after iteration 600: 1.024979
Cost after iteration 700: 1.024944
Cost after iteration 800: 1.024918
Cost after iteration 900: 1.024900
Cost after iteration 1000: 1.024886
Cost after iteration 1100: 1.024876
Cost after iteration 1200: 1.024868
Cost after iteration 1300: 1.024861
Cost after iteration 1400: 1.024856
Cost after iteration 1500: 1.024851
Cost after iteration 1600: 1.024848
Cost after iteration 1700: 1.024845
Cost after iteration 1800: 1.024842
Cost after iteration 1900: 1.024840
Cost after iteration 2000: 1.024838
Cost after iteration 2100: 1.024837
Cost after iteration 2200: 1.024836
Cost after iteration 2300: 1.024834
Cost after iteration 2400: 1.024833
Cost after iteration 2500: 1.024833
Cost after iteration 2600: 1.024832
Cost after iteration 2700: 1.024831
Cost after iteration 2800: 1.024831
C

### 3.	For three modifications of gradient descent (GD, SGD and Adam) plot the learning curves (dependence of the value of the loss function on the iteration number), apply models with different values of the learning rate (at least 5 different learning rates). How does it affect the accuracy of the model? 

In [21]:
learning_rates = [0.001, 0.005, 0.01, 0.05, 0.1]

In [87]:
GD_trained = []

for currLR in learning_rates:
    print(f"Current learning rate {currLR}")
    gd_with_LR = model(
        X_train, 
        y_train, 
        X_test, 
        y_test, 
        optimization_algorithm=optimize, 
        learning_rate=currLR, 
        print_cost=False,
    )
    GD_trained.append(gd_with_LR)

Current learning rate 0.001
train accuracy: 72.52044081052257 %
test accuracy: 72.70788912579957 %
Current learning rate 0.005
train accuracy: 77.10629221471739 %
test accuracy: 74.73347547974413 %
Current learning rate 0.01
train accuracy: 78.35051546391753 %
test accuracy: 75.5863539445629 %
Current learning rate 0.05
train accuracy: 82.26093138997511 %
test accuracy: 76.97228144989339 %
Current learning rate 0.1
train accuracy: 83.32740846071809 %
test accuracy: 76.43923240938166 %


In [19]:
SGD_trained = []

for currLR in learning_rates:
    print(f"Current learning rate {currLR}")
    sgd_with_LR = model(
        X_train, 
        y_train, 
        X_test, 
        y_test, 
        optimization_algorithm=SGD, 
        learning_rate=currLR, 
        print_cost=False,
    )
    SGD_trained.append(sgd_with_LR)

Current learning rate 0.001
train accuracy: 81.86988979736935 %
test accuracy: 77.50533049040511 %
Current learning rate 0.005
train accuracy: 85.24706718805545 %
test accuracy: 75.90618336886993 %
Current learning rate 0.01
train accuracy: 86.84678279416993 %
test accuracy: 75.5863539445629 %
Current learning rate 0.05
train accuracy: 90.36615712762176 %
test accuracy: 75.47974413646055 %
Current learning rate 0.1
train accuracy: 91.7170280838962 %
test accuracy: 74.84008528784648 %


In [11]:
def AdamMa(w, b, X, Y, num_iterations, learning_rate, print_cost=False, beta1=0.9, beta2=0.999, constant=1e-8):
    costs = []
    v_dw, v_db = 0,0
    m_dw, m_db = 0,0
    
    for iteration in range(num_iterations):        
        
        grads, cost = propagate(w,b,X,Y)
        
        # Retrieve derivatives from grads
        dw = grads["dw"]
        db = grads["db"]
        
        
        m_dw = beta1*m_dw + (1-beta1)*dw
        # *** biases *** #
        m_db = beta1*m_db + (1-beta1)*db

        # beta 2
        # weight
        v_dw = beta2*v_dw + (1-beta2)*np.power(dw, 2)
        # bias
        v_db = beta2*v_db + (1-beta2)*np.power(db, 2)

        # bias correction
        m_dw_corr = m_dw/(1-beta1**iteration+1)
        m_db_corr = m_db/(1-beta1**iteration+1)
        v_dw_corr = v_dw/(1-beta2**iteration+1)
        v_db_corr = v_db/(1-beta2**iteration+1)

        # update weights and biases
        w -=  learning_rate*(m_dw_corr/(np.sqrt(v_dw_corr)+constant))
        b -=  learning_rate*(m_db_corr/(np.sqrt(v_db_corr)+constant))
        
        # Record the costs
        if iteration % 100 == 0:
            costs.append(cost)
        
        # Print the cost every 100 training iterations
        if print_cost and iteration % 100 == 0:
            print ("Cost after iteration %i: %f" %(iteration, cost))
        
    params = {"w": w,
              "b": b}
    grads = {"dw": dw,
             "db": db}
    return params, grads, costs

In [14]:
ADAM_MA_trained = []

for currLR in learning_rates:
    print(f"Current learning rate {currLR}")
    adam_with_LR = model(
        X_train, 
        y_train, 
        X_test, 
        y_test, 
        optimization_algorithm=AdamMa, 
        num_iterations=2000, 
        learning_rate=1e-3, 
        print_cost=True
    )
    ADAM_MA_trained.append(adam_with_LR)

Current learning rate 0.001
Cost after iteration 0: 0.693147
Cost after iteration 100: 0.431998
Cost after iteration 200: 0.393599
Cost after iteration 300: 0.371449
Cost after iteration 400: 0.355642
Cost after iteration 500: 0.343234
Cost after iteration 600: 0.332953
Cost after iteration 700: 0.324134
Cost after iteration 800: 0.316384
Cost after iteration 900: 0.309455
Cost after iteration 1000: 0.303175
Cost after iteration 1100: 0.297422
Cost after iteration 1200: 0.292108
Cost after iteration 1300: 0.287164
Cost after iteration 1400: 0.282536
Cost after iteration 1500: 0.278182
Cost after iteration 1600: 0.274067
Cost after iteration 1700: 0.270165
Cost after iteration 1800: 0.266452
Cost after iteration 1900: 0.262908
train accuracy: 89.90401706363313 %
test accuracy: 76.11940298507463 %
Current learning rate 0.005
Cost after iteration 0: 0.693147
Cost after iteration 100: 0.431998
Cost after iteration 200: 0.393599
Cost after iteration 300: 0.371449
Cost after iteration 400: 0

In [40]:
ADAM_trained = []
# SGD         w, b, X, Y, epochs, learning_rate=1e-1, print_cost=True
# model(X_train, Y_train, X_test, Y_test, num_iterations = 2000, learning_rate = 0.5, print_cost = False, optimization_algorithm=optimize)
for currLR in learning_rates:
    print(f"Current learning rate {currLR}")
    adam_with_LR = model(
        X_train, 
        y_train, 
        X_test, 
        y_test,
        optimization_algorithm=ADAM, 
        learning_rate=currLR, 
        print_cost=False,
    )
    # print(adam_with_LR)
    ADAM_trained.append(adam_with_LR)

Current learning rate 0.001
train accuracy: 89.93956629932457 %
test accuracy: 75.5863539445629 %
Current learning rate 0.005
train accuracy: 94.02772840383932 %
test accuracy: 73.56076759061834 %
Current learning rate 0.01
train accuracy: 94.52541770351938 %
test accuracy: 72.49466950959489 %
Current learning rate 0.05
train accuracy: 95.34305012442232 %
test accuracy: 71.00213219616205 %
Current learning rate 0.1
train accuracy: 95.76964095271951 %
test accuracy: 70.8955223880597 %



divide by zero encountered in log


invalid value encountered in multiply



In [33]:
type(GD_trained[0]["costs"][0])

numpy.float64

In [42]:
type(SGD_trained[0]["costs"][0])#["costs"]

numpy.ndarray

In [44]:
# Conver slice of sigle numpy.ndarray to slice
list_SGD_costs = []
for idx in range(len(learning_rates)):
    cst = []
    for elem in SGD_trained[idx]["costs"]:
        cst.append(elem[0])
    list_SGD_costs.append(cst)

Converetd slices size 5, 5


In [41]:
list_ADAM_costs = []
for idx in range(len(learning_rates)):
    cst = []
    for elem in ADAM_trained[idx]["costs"]:
        cst.append(elem[0])
    list_ADAM_costs.append(cst)

In [94]:
fig = go.Figure()
for idx, currLR in enumerate(learning_rates):
    fig.add_trace(
        go.Line(
            y=GD_trained[idx]["costs"],
            name=currLR,
        ), 
    )

fig.update_layout(title_text="Gradient Descent", legend_title="Learning rate")
fig.update_xaxes(title="Iteration")
fig.update_yaxes(title="Loss")
fig.show()

In [95]:
fig = go.Figure()
for idx, currLR in enumerate(learning_rates):
    fig.add_trace(
        go.Line(
            y=list_SGD_costs[idx],
            name=currLR,
        ), 
    )

fig.update_layout(title_text="Stochastic Gradient Descent", legend_title="Learning rate")
fig.update_xaxes(title="Epoch")
fig.update_yaxes(title="Loss")

fig.show()

In [42]:
fig = go.Figure()
for idx, currLR in enumerate(learning_rates):
    fig.add_trace(
        go.Line(
            y=list_ADAM_costs[idx],
            name=currLR,
        ), 
    )

fig.update_layout(title_text="ADAM", legend_title="Learning rate")
fig.update_xaxes(title="Epoch")
fig.update_yaxes(title="Loss")
fig.show()

### 4.	Compare the accuracy of the models fitted with various BP algorithms.

All of three optimization algorithms `Gradient Descent`, `Stochastic Gradient Descent` and  `Adam` showed their best performance depending on current learning rate.

As it shown in the tables with the inctrase of step of LR `Adam` drops accuracy on test, however `GD` is totally opposite.

Also, `Stochastic Gradient Descent` has a noisy Loss output, so why we cannot be certain what is the optimal value of learning rate for such an algorithm, but overall it has the best accuracy with smaller LR. But it's stable

**Table 1. Algorithms accuracy with the LR = 0.001**
|Algorithm | train | test |
|:------:|:------:|:------:|
|GD      | 72.52044081052257 %| 72.70788912579957 %|
|SGD     |81.86988979736935 %|**77.50533049040511 %**|
|ADAM    |**89.93956629932457 %**|75.5863539445629 %|

**Table 2. Algorithms accuracy with the LR = 0.005**
|Algorithm | train | test |
|:------:|:------:|:------:|
|GD      | 77.10629221471739  %| 74.73347547974413 %|
|SGD     |85.24706718805545 %|**75.90618336886993 %**|
|ADAM    |**94.02772840383932 %**|73.56076759061834 %|

**Table 3. Algorithms accuracy with the LR = 0.01**
|Algorithm | train | test |
|:------:|:------:|:------:|
|GD      | 78.35051546391753 %|**75.5863539445629 %**|
|SGD     | 86.84678279416993 %|**75.5863539445629 %**|
|ADAM    | **94.52541770351938 %**|72.49466950959489 %|

**Table 4. Algorithms accuracy with the LR = 0.05**
|Algorithm | train | test |
|:------:|:------:|:------:|
|GD      | 82.26093138997511 %| **76.97228144989339 %**|
|SGD     | 90.36615712762176 %| 75.47974413646055 %|
|ADAM    | **95.34305012442232 %**|71.00213219616205 %|

**Table 5. Algorithms accuracy with the LR = 0.1**
|Algorithm | train | test |
|:------:|:------:|:------:|
|GD      | 83.32740846071809 %| **76.43923240938166 %**|
|SGD     |91.7170280838962 %|74.84008528784648 %|
|ADAM    |**95.76964095271951 %**|70.8955223880597 %|