In [1]:
import tensorflow as tf

# 1] SGD-Stochastic Gradient Descent

**Use Case:**
- **Advantages:**
  - Simplifies the training process by updating model parameters using the gradients of the loss function with respect to the entire training dataset or a batch of data points.
  - Effective for convex and smooth optimization problems, where the objective is to minimize the loss function by adjusting model parameters iteratively.
  - Provides control over batch size and learning rate, allowing for fine-grained adjustments in optimization performance.

**When to Use:**
- Use SGD when dealing with:
  - **Large Datasets**: By processing batches of data sequentially, SGD efficiently handles large datasets by updating model parameters iteratively.
  - **Convex Optimization**: Optimizing convex loss functions where the objective is to find a global minimum using gradient descent methods.
  - **Fine-tuning**: Adjusting learning rate and batch size parameters allows for fine-tuning optimization performance based on computational resources and dataset characteristics.

In [8]:
# Independent var
x_data=tf.constant([
    [0.0,0.0],
    [0.0,0.1],
    [1.0,0.0],
    [1.0,1.0]
],dtype=tf.float32)

# Dependent var
y_data=tf.constant([
    [0.0],
    [1.0],
    [1.0],
    [0.0]
],dtype=tf.float32)


#params
input_size=2  # two inputs i.e 0.0 and i.0
hidden_size=3 #hidden layer has 3neurons
output_size=1  #since it is classification o/p is either 1.0 r 0.0


#random normalized initialization of weights and biases
W1=tf.Variable(tf.random.normal([input_size,hidden_size]))
b1=tf.Variable(tf.random.normal([hidden_size]))
W2=tf.Variable(tf.random.normal([hidden_size,output_size]))
b2=tf.Variable(tf.random.normal([output_size]))


#forward pass
@tf.function
def forward_pass(x):
    z1=tf.matmul(x,W1)+b1
    #using sigmoid
    a1=tf.nn.sigmoid(z1)
    z2=tf.matmul(a1,W2)+b2
    a2=tf.nn.sigmoid(z2)
    
    return a2
    

#loss function
@tf.function
def compute_loss(y_true,y_pred):
    #MSE
    return tf.reduce_mean(tf.square(y_true-y_pred))

#optimizer --> SGD
optimizer=tf.keras.optimizers.SGD(learning_rate=0.01)

#training phase
@tf.function
def train_step_sgd(x,y):
    with tf.GradientTape() as tape:
        #forward pass
        y_pred=forward_pass(x)
        #loss
        loss=compute_loss(y,y_pred)
    gradients=tape.gradient(loss,[W1,b1,W2,b2])
    
    optimizer.apply_gradients(zip(gradients,[W1,b1,W2,b2]))
    
    return loss

In [9]:
# Training parameters
epochs = 10000

# Training loop
for epoch in range(epochs):
    loss = train_step_sgd(x_data, y_data)
    if epoch % 1000 == 0:
        print(f"Epoch {epoch}, Loss: {loss.numpy()}")

Epoch 0, Loss: 0.4367831349372864
Epoch 1000, Loss: 0.26773083209991455
Epoch 2000, Loss: 0.24346798658370972
Epoch 3000, Loss: 0.2361685037612915
Epoch 4000, Loss: 0.22949662804603577
Epoch 5000, Loss: 0.2232733517885208
Epoch 6000, Loss: 0.2173878252506256
Epoch 7000, Loss: 0.21178199350833893
Epoch 8000, Loss: 0.20645004510879517
Epoch 9000, Loss: 0.2014194130897522


# 2] RMSProp (Root Mean Propagation)

**Use Case:**
- **Advantages:**
  - Effective for training recurrent neural networks (RNNs) and other sequences where gradients can vary significantly.
  - Adaptively scales the learning rate based on the magnitude of recent gradients for each parameter, which can accelerate convergence in non-convex optimization settings.
  - Handles sparse gradients well because of its adaptive learning rate mechanism.

**When to Use:**
- Use RMSprop when dealing with:
  - **Recurrent Neural Networks (RNNs)**: Helps stabilize and accelerate training due to varying gradients over time.
  - **Non-convex Optimization**: Adjusts learning rates individually for each parameter based on their recent gradient history, potentially leading to faster convergence.

In [10]:
# Independent var
x_data=tf.constant([
    [0.0,0.0],
    [0.0,0.1],
    [1.0,0.0],
    [1.0,1.0]
],dtype=tf.float32)

# Dependent var
y_data=tf.constant([
    [0.0],
    [1.0],
    [1.0],
    [0.0]
],dtype=tf.float32)


#params
input_size=2  # two inputs i.e 0.0 and i.0
hidden_size=3 #hidden layer has 3neurons
output_size=1  #since it is classification o/p is either 1.0 r 0.0


#random normalized initialization of weights and biases
W1=tf.Variable(tf.random.normal([input_size,hidden_size]))
b1=tf.Variable(tf.random.normal([hidden_size]))
W2=tf.Variable(tf.random.normal([hidden_size,output_size]))
b2=tf.Variable(tf.random.normal([output_size]))


#forward pass
@tf.function
def forward_pass(x):
    z1=tf.matmul(x,W1)+b1
    #using sigmoid
    a1=tf.nn.sigmoid(z1)
    z2=tf.matmul(a1,W2)+b2
    a2=tf.nn.sigmoid(z2)
    
    return a2
    

#loss function
@tf.function
def compute_loss(y_true,y_pred):
    #MSE
    return tf.reduce_mean(tf.square(y_true-y_pred))

#optimizer --> RMSProp
optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.01)

#training phase
@tf.function
def train_step_rms(x,y):
    with tf.GradientTape() as tape:
        #forward pass
        y_pred=forward_pass(x)
        #loss
        loss=compute_loss(y,y_pred)
    gradients=tape.gradient(loss,[W1,b1,W2,b2])
    
    optimizer.apply_gradients(zip(gradients,[W1,b1,W2,b2]))
    
    return loss

In [11]:
# Training parameters
epochs = 10000

# Training loop
for epoch in range(epochs):
    loss = train_step_rms(x_data, y_data)
    if epoch % 1000 == 0:
        print(f"Epoch {epoch}, Loss: {loss.numpy()}")

Epoch 0, Loss: 0.4640316963195801
Epoch 1000, Loss: 0.09084735810756683
Epoch 2000, Loss: 0.0029399986378848553
Epoch 3000, Loss: 0.00020975276129320264
Epoch 4000, Loss: 9.567279630573466e-05
Epoch 5000, Loss: 6.094207492424175e-05
Epoch 6000, Loss: 4.440503835212439e-05
Epoch 7000, Loss: 3.479731094557792e-05
Epoch 8000, Loss: 2.854119702533353e-05
Epoch 9000, Loss: 2.415350900264457e-05


# 3] Adagrad (Adaptive Gradient Algorithm)

**Use Case:**
- **Advantages:**
  - Well-suited for sparse data or problems with features that rarely occur, as it adapts the learning rate based on the cumulative historical squared gradients for each parameter.
  - Automatically reduces the learning rate for parameters that have large gradients, which can lead to improved convergence especially in settings with sparse data.

**When to Use:**
- Use Adagrad when dealing with:
  - **Sparse Data**: Features that are rare or have infrequent occurrences, where adaptive learning rates based on historical gradients can be beneficial.
  - **Feature Selection**: Automatically adjusts learning rates based on the data's characteristics, potentially reducing the need for manual tuning.


In [12]:
# Independent var
x_data=tf.constant([
    [0.0,0.0],
    [0.0,0.1],
    [1.0,0.0],
    [1.0,1.0]
],dtype=tf.float32)

# Dependent var
y_data=tf.constant([
    [0.0],
    [1.0],
    [1.0],
    [0.0]
],dtype=tf.float32)


#params
input_size=2  # two inputs i.e 0.0 and i.0
hidden_size=3 #hidden layer has 3neurons
output_size=1  #since it is classification o/p is either 1.0 r 0.0


#random normalized initialization of weights and biases
W1=tf.Variable(tf.random.normal([input_size,hidden_size]))
b1=tf.Variable(tf.random.normal([hidden_size]))
W2=tf.Variable(tf.random.normal([hidden_size,output_size]))
b2=tf.Variable(tf.random.normal([output_size]))


#forward pass
@tf.function
def forward_pass(x):
    z1=tf.matmul(x,W1)+b1
    #using sigmoid
    a1=tf.nn.sigmoid(z1)
    z2=tf.matmul(a1,W2)+b2
    a2=tf.nn.sigmoid(z2)
    
    return a2
    

#loss function
@tf.function
def compute_loss(y_true,y_pred):
    #MSE
    return tf.reduce_mean(tf.square(y_true-y_pred))

#optimizer --> Adagrad
optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.01)

#training phase
@tf.function
def train_step_adagrad(x,y):
    with tf.GradientTape() as tape:
        #forward pass
        y_pred=forward_pass(x)
        #loss
        loss=compute_loss(y,y_pred)
    gradients=tape.gradient(loss,[W1,b1,W2,b2])
    
    optimizer.apply_gradients(zip(gradients,[W1,b1,W2,b2]))
    
    return loss

In [13]:
# Training parameters
epochs = 10000

# Training loop
for epoch in range(epochs):
    loss = train_step_adagrad(x_data, y_data)
    if epoch % 1000 == 0:
        print(f"Epoch {epoch}, Loss: {loss.numpy()}")

Epoch 0, Loss: 0.30742931365966797
Epoch 1000, Loss: 0.25299394130706787
Epoch 2000, Loss: 0.2415032535791397
Epoch 3000, Loss: 0.23256665468215942
Epoch 4000, Loss: 0.22474023699760437
Epoch 5000, Loss: 0.21754717826843262
Epoch 6000, Loss: 0.21082176268100739
Epoch 7000, Loss: 0.20454193651676178
Epoch 8000, Loss: 0.1987500786781311
Epoch 9000, Loss: 0.19349640607833862


# 4] Adam

**Use Case:**
- **Advantages:**
  - Combines the benefits of two other popular optimizers, RMSprop and AdaGrad, by using adaptive learning rates for each parameter.
  - Well-suited for a wide range of optimization problems in deep learning due to its adaptive nature and efficient convergence properties.
  - Handles sparse gradients effectively, making it suitable for problems with noisy data or complex models.
  - Automatically adjusts learning rates during training, which can accelerate convergence and improve model performance.

**When to Use:**
- Use Adam optimizer when dealing with:
  - **Deep Learning Models**: Particularly effective for training deep neural networks across various architectures (CNNs, RNNs, etc.).
  - **Large-Scale Datasets**: Efficiently handles large datasets by dynamically adapting learning rates based on gradient statistics.
  - **Complex Optimization Landscapes**: Benefits from adaptive learning rates and momentum, making it suitable for non-convex optimization problems where finding an optimal solution is challenging.

In [14]:
# Independent var
x_data=tf.constant([
    [0.0,0.0],
    [0.0,0.1],
    [1.0,0.0],
    [1.0,1.0]
],dtype=tf.float32)

# Dependent var
y_data=tf.constant([
    [0.0],
    [1.0],
    [1.0],
    [0.0]
],dtype=tf.float32)


#params
input_size=2  # two inputs i.e 0.0 and i.0
hidden_size=3 #hidden layer has 3neurons
output_size=1  #since it is classification o/p is either 1.0 r 0.0


#random normalized initialization of weights and biases
W1=tf.Variable(tf.random.normal([input_size,hidden_size]))
b1=tf.Variable(tf.random.normal([hidden_size]))
W2=tf.Variable(tf.random.normal([hidden_size,output_size]))
b2=tf.Variable(tf.random.normal([output_size]))


#forward pass
@tf.function
def forward_pass(x):
    z1=tf.matmul(x,W1)+b1
    #using sigmoid
    a1=tf.nn.sigmoid(z1)
    z2=tf.matmul(a1,W2)+b2
    a2=tf.nn.sigmoid(z2)
    
    return a2
    

#loss function
@tf.function
def compute_loss(y_true,y_pred):
    #MSE
    return tf.reduce_mean(tf.square(y_true-y_pred))

#optimizer --> Adam
optimizer=tf.keras.optimizers.Adam(learning_rate=0.01)

#training phase
@tf.function
def train_step_adam(x,y):
    with tf.GradientTape() as tape:
        #forward pass
        y_pred=forward_pass(x)
        #loss
        loss=compute_loss(y,y_pred)
    gradients=tape.gradient(loss,[W1,b1,W2,b2])
    
    optimizer.apply_gradients(zip(gradients,[W1,b1,W2,b2]))
    
    return loss

In [15]:
# Training parameters
epochs = 10000

# Training loop
for epoch in range(epochs):
    loss = train_step_adam(x_data, y_data)
    if epoch % 1000 == 0:
        print(f"Epoch {epoch}, Loss: {loss.numpy()}")

Epoch 0, Loss: 0.2594316005706787
Epoch 1000, Loss: 0.11361584067344666
Epoch 2000, Loss: 0.004071903880685568
Epoch 3000, Loss: 0.0009629479027353227
Epoch 4000, Loss: 0.0003769979521166533
Epoch 5000, Loss: 0.00017743752687238157
Epoch 6000, Loss: 9.130692342296243e-05
Epoch 7000, Loss: 4.93674524477683e-05
Epoch 8000, Loss: 2.749638952082023e-05
Epoch 9000, Loss: 1.5607356544933282e-05


# 5] N-Adam

**Use Case:**
- **Advantages:**
  - Integrates Nesterov's accelerated gradient (NAG) into Adam optimizer, combining the benefits of both Nesterov momentum and adaptive learning rates.
  - Converges faster than traditional Adam and exhibits better performance on many deep learning tasks.
  - Robust to noisy gradients and sparse data, similar to Adam, due to its adaptive learning rate mechanism.

**When to Use:**
- Use Nadam optimizer when:
  - **Deep Learning Models**: Suitable for a wide range of deep learning architectures, including CNNs, RNNs, and transformers.
  - **Fast Convergence**: Particularly effective for models requiring fast convergence and stable optimization, especially in scenarios with large-scale datasets.
  - **Regularization**: Works well with regularization techniques like dropout and weight decay, enhancing generalization and preventing overfitting.


In [18]:
# Independent var
x_data=tf.constant([
    [0.0,0.0],
    [0.0,0.1],
    [1.0,0.0],
    [1.0,1.0]
],dtype=tf.float32)

# Dependent var
y_data=tf.constant([
    [0.0],
    [1.0],
    [1.0],
    [0.0]
],dtype=tf.float32)


#params
input_size=2  # two inputs i.e 0.0 and i.0
hidden_size=3 #hidden layer has 3neurons
output_size=1  #since it is classification o/p is either 1.0 r 0.0


#random normalized initialization of weights and biases
W1=tf.Variable(tf.random.normal([input_size,hidden_size]))
b1=tf.Variable(tf.random.normal([hidden_size]))
W2=tf.Variable(tf.random.normal([hidden_size,output_size]))
b2=tf.Variable(tf.random.normal([output_size]))


#forward pass
@tf.function
def forward_pass(x):
    z1=tf.matmul(x,W1)+b1
    #using sigmoid
    a1=tf.nn.sigmoid(z1)
    z2=tf.matmul(a1,W2)+b2
    a2=tf.nn.sigmoid(z2)
    
    return a2
    

#loss function
@tf.function
def compute_loss(y_true,y_pred):
    #MSE
    return tf.reduce_mean(tf.square(y_true-y_pred))

#optimizer --> Nadam
optimizer=tf.keras.optimizers.Nadam(learning_rate=0.01)

#training phase
@tf.function
def train_step_nadam(x,y):
    with tf.GradientTape() as tape:
        #forward pass
        y_pred=forward_pass(x)
        #loss
        loss=compute_loss(y,y_pred)
    gradients=tape.gradient(loss,[W1,b1,W2,b2])
    
    optimizer.apply_gradients(zip(gradients,[W1,b1,W2,b2]))
    
    return loss

In [19]:
# Training parameters
epochs = 10000

# Training loop
for epoch in range(epochs):
    loss = train_step_nadam(x_data, y_data)
    if epoch % 1000 == 0:
        print(f"Epoch {epoch}, Loss: {loss.numpy()}")

Epoch 0, Loss: 0.33062759041786194
Epoch 1000, Loss: 0.1296459436416626
Epoch 2000, Loss: 0.12607485055923462
Epoch 3000, Loss: 0.1254422962665558
Epoch 4000, Loss: 0.12521952390670776
Epoch 5000, Loss: 0.12511873245239258
Epoch 6000, Loss: 0.12506718933582306
Epoch 7000, Loss: 0.1250390261411667
Epoch 8000, Loss: 0.1250230073928833
Epoch 9000, Loss: 0.12501367926597595


# 6] Adamax

**Use Case:**
- **Advantages:**
  - Extension of Adam that introduces an alternative approach to compute the adaptive learning rates based on the infinity norm (maximum norm) of the gradients.
  - Particularly effective for models with parameters that exhibit large variance in gradients, as it provides a more stable update rule.
  - Maintains the benefits of Adam in terms of adaptive learning rates and momentum while simplifying the computation compared to Adam.

**When to Use:**
- Use Adamax optimizer when:
  - **Dealing with Large Models**: Especially useful for models with parameters that vary significantly in their gradient magnitudes.
  - **Natural Language Processing (NLP)**: Well-suited for training models like recurrent neural networks (RNNs) and transformers in NLP tasks due to their large parameter space and varied gradients.
  - **Sparse Data**: Efficiently handles sparse gradients, making it suitable for models trained on data with irregular or missing information.

In [20]:
# Independent var
x_data=tf.constant([
    [0.0,0.0],
    [0.0,0.1],
    [1.0,0.0],
    [1.0,1.0]
],dtype=tf.float32)

# Dependent var
y_data=tf.constant([
    [0.0],
    [1.0],
    [1.0],
    [0.0]
],dtype=tf.float32)


#params
input_size=2  # two inputs i.e 0.0 and i.0
hidden_size=3 #hidden layer has 3neurons
output_size=1  #since it is classification o/p is either 1.0 r 0.0


#random normalized initialization of weights and biases
W1=tf.Variable(tf.random.normal([input_size,hidden_size]))
b1=tf.Variable(tf.random.normal([hidden_size]))
W2=tf.Variable(tf.random.normal([hidden_size,output_size]))
b2=tf.Variable(tf.random.normal([output_size]))


#forward pass
@tf.function
def forward_pass(x):
    z1=tf.matmul(x,W1)+b1
    #using sigmoid
    a1=tf.nn.sigmoid(z1)
    z2=tf.matmul(a1,W2)+b2
    a2=tf.nn.sigmoid(z2)
    
    return a2
    

#loss function
@tf.function
def compute_loss(y_true,y_pred):
    #MSE
    return tf.reduce_mean(tf.square(y_true-y_pred))

#optimizer --> Adamax
optimizer=tf.keras.optimizers.Adamax(learning_rate=0.01)

#training phase
@tf.function
def train_step_adamax(x,y):
    with tf.GradientTape() as tape:
        #forward pass
        y_pred=forward_pass(x)
        #loss
        loss=compute_loss(y,y_pred)
    gradients=tape.gradient(loss,[W1,b1,W2,b2])
    
    optimizer.apply_gradients(zip(gradients,[W1,b1,W2,b2]))
    
    return loss

In [21]:
# Training parameters
epochs = 10000

# Training loop
for epoch in range(epochs):
    loss = train_step_adamax(x_data, y_data)
    if epoch % 1000 == 0:
        print(f"Epoch {epoch}, Loss: {loss.numpy()}")

Epoch 0, Loss: 0.3066166341304779
Epoch 1000, Loss: 0.1298677623271942
Epoch 2000, Loss: 0.05104735121130943
Epoch 3000, Loss: 0.003056046087294817
Epoch 4000, Loss: 0.0005572356749325991
Epoch 5000, Loss: 0.0001495684264227748
Epoch 6000, Loss: 4.6029505028855056e-05
Epoch 7000, Loss: 1.5061059457366355e-05
Epoch 8000, Loss: 5.09380743096699e-06
Epoch 9000, Loss: 1.7624225847612252e-06


# 7] Adadelta

**Use Case:**
- **Advantages:**
  - Adaptively adjusts learning rates based on a moving average of gradient updates without the need for a manually specified global learning rate.
  - Particularly effective in scenarios where setting a fixed learning rate is challenging or impractical.
  - Robust to noisy gradients and sparse data due to its adaptive learning rate mechanism.

**When to Use:**
- Use Adadelta optimizer when:
  - **Noisy or Sparse Data**: Handles data with irregularities or missing information effectively by dynamically adjusting learning rates.
  - **Complex Optimization Landscapes**: Well-suited for non-convex optimization problems where the gradient magnitudes vary widely across parameters.
  - **Long-Term Dependencies**: Effective for recurrent neural networks (RNNs) and other models with long-term dependencies, as it mitigates the vanishing/exploding gradient problem.

In [23]:
# Independent var
x_data=tf.constant([
    [0.0,0.0],
    [0.0,0.1],
    [1.0,0.0],
    [1.0,1.0]
],dtype=tf.float32)

# Dependent var
y_data=tf.constant([
    [0.0],
    [1.0],
    [1.0],
    [0.0]
],dtype=tf.float32)


#params
input_size=2  # two inputs i.e 0.0 and i.0
hidden_size=3 #hidden layer has 3neurons
output_size=1  #since it is classification o/p is either 1.0 r 0.0


#random normalized initialization of weights and biases
W1=tf.Variable(tf.random.normal([input_size,hidden_size]))
b1=tf.Variable(tf.random.normal([hidden_size]))
W2=tf.Variable(tf.random.normal([hidden_size,output_size]))
b2=tf.Variable(tf.random.normal([output_size]))


#forward pass
@tf.function
def forward_pass(x):
    z1=tf.matmul(x,W1)+b1
    #using sigmoid
    a1=tf.nn.sigmoid(z1)
    z2=tf.matmul(a1,W2)+b2
    a2=tf.nn.sigmoid(z2)
    
    return a2
    

#loss function
@tf.function
def compute_loss(y_true,y_pred):
    #MSE
    return tf.reduce_mean(tf.square(y_true-y_pred))

#optimizer --> Adadelta
optimizer=tf.keras.optimizers.Adadelta(learning_rate=0.01)

#training phase
@tf.function
def train_step_adadelta(x,y):
    with tf.GradientTape() as tape:
        #forward pass
        y_pred=forward_pass(x)
        #loss
        loss=compute_loss(y,y_pred)
    gradients=tape.gradient(loss,[W1,b1,W2,b2])
    
    optimizer.apply_gradients(zip(gradients,[W1,b1,W2,b2]))
    
    return loss

In [24]:
# Training parameters
epochs = 10000

# Training loop
for epoch in range(epochs):
    loss = train_step_adadelta(x_data, y_data)
    if epoch % 1000 == 0:
        print(f"Epoch {epoch}, Loss: {loss.numpy()}")

Epoch 0, Loss: 0.4338855445384979
Epoch 1000, Loss: 0.42960119247436523
Epoch 2000, Loss: 0.4228862524032593
Epoch 3000, Loss: 0.4137146472930908
Epoch 4000, Loss: 0.40178748965263367
Epoch 5000, Loss: 0.38677048683166504
Epoch 6000, Loss: 0.3685241937637329
Epoch 7000, Loss: 0.3474231958389282
Epoch 8000, Loss: 0.3246522545814514
Epoch 9000, Loss: 0.3022427260875702
