In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import OrderedDict

class BitAdder(nn.Module):
    def __init__(self):
        super(BitAdder, self).__init__()
        self.model = nn.Sequential(OrderedDict([
            ('fc1', nn.Linear(2, 4)),  # Input layer to hidden layer
            ('relu1', nn.ReLU()),      # Activation function
            ('fc2', nn.Linear(4, 2)),  # Hidden layer to output layer
            ('sigmoid', nn.Sigmoid())  # Sigmoid activation for binary output
        ]))
    
    def forward(self, x):
        return self.model(x)

class BitAdderV2(nn.Module):
    def __init__(self):
        super(BitAdderV2, self).__init__()
        self.model = nn.Sequential(OrderedDict([
            ('fc1', nn.Linear(2, 6)),  # Input layer to larger hidden layer
            ('Lrelu', nn.LeakyReLU(0.01)),  # Activation function
            ('fc2', nn.Linear(6, 3)),  # Additional hidden layer
            ('relu', nn.ReLU()),      # Activation function
            ('fc3', nn.Linear(3, 2)),  # Output layer
            ('sigmoid', nn.Sigmoid())  # Sigmoid activation for binary output
        ]))
    
    def forward(self, x):
        return self.model(x)

class BitAdderV3(nn.Module):
    def __init__(self):
        super(BitAdderV3, self).__init__()
        self.model = nn.Sequential(OrderedDict([
            ('fc1', nn.Linear(2, 4)),
            ('tanh', nn.Tanh()),
            ('fc2', nn.Linear(4, 6)),
            ('relu', nn.ReLU()),
            ('fc3', nn.Linear(6, 2)),
            ('sigmoid', nn.Sigmoid())
        ]))
    
    def forward(self, x):
        return self.model(x)

# Initialize models
model1 = BitAdder()
model2 = BitAdderV2()
model3 = BitAdderV3()

print("Model 1:")
print(model1)
print("\nModel 2:")
print(model2)
print("\nModel 3:")
print(model3)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer1 = optim.Adam(model1.parameters(), lr=0.01)
optimizer2 = optim.Adam(model2.parameters(), lr=0.01)
optimizer3 = optim.Adam(model3.parameters(), lr=0.01)

# Training data (all possible inputs and their sum as binary output)
data = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
labels = torch.tensor([[0, 0], [0, 1], [0, 1], [1, 0]], dtype=torch.float32)

# Train models
def train_model(model, optimizer, name):
    epochs = 1000
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        if epoch % 100 == 0:
            print(f'{name} - Epoch [{epoch}/{epochs}], Loss: {loss.item():.4f}')

train_model(model1, optimizer1, "Model 1")
print()
train_model(model2, optimizer2, "Model 2")
print()
train_model(model3, optimizer3, "Model 3")

# Display model weights
def print_weights(model, name):
    print(f"\n{name} Weights:")
    for name, param in model.named_parameters():
        print(f'{name}: {param.data}')

print_weights(model1, "Model 1")
print_weights(model2, "Model 2")
print_weights(model3, "Model 3")


Model 1:
BitAdder(
  (model): Sequential(
    (fc1): Linear(in_features=2, out_features=4, bias=True)
    (relu1): ReLU()
    (fc2): Linear(in_features=4, out_features=2, bias=True)
    (sigmoid): Sigmoid()
  )
)

Model 2:
BitAdderV2(
  (model): Sequential(
    (fc1): Linear(in_features=2, out_features=6, bias=True)
    (Lrelu): LeakyReLU(negative_slope=0.01)
    (fc2): Linear(in_features=6, out_features=3, bias=True)
    (relu): ReLU()
    (fc3): Linear(in_features=3, out_features=2, bias=True)
    (sigmoid): Sigmoid()
  )
)

Model 3:
BitAdderV3(
  (model): Sequential(
    (fc1): Linear(in_features=2, out_features=4, bias=True)
    (tanh): Tanh()
    (fc2): Linear(in_features=4, out_features=6, bias=True)
    (relu): ReLU()
    (fc3): Linear(in_features=6, out_features=2, bias=True)
    (sigmoid): Sigmoid()
  )
)
Model 1 - Epoch [0/1000], Loss: 0.2722
Model 1 - Epoch [100/1000], Loss: 0.1894
Model 1 - Epoch [200/1000], Loss: 0.1594
Model 1 - Epoch [300/1000], Loss: 0.1412
Model 1 - Ep

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import OrderedDict

class ActivationBitAdder(nn.Module):
    def __init__(self, activation='relu'):
        super(ActivationBitAdder, self).__init__()
        
        # Choose activation function
        if activation == 'relu':
            act_func = nn.ReLU()
        elif activation == 'tanh':
            act_func = nn.Tanh()
        elif activation == 'leaky_relu':
            act_func = nn.LeakyReLU(0.01)
        elif activation == 'elu':
            act_func = nn.ELU(alpha=1.0)
        elif activation == 'selu':
            act_func = nn.SELU()
        elif activation == 'hardswish':
            act_func = nn.Hardswish()
        elif activation == 'mish':
            act_func = nn.Mish()
        else:
            raise ValueError(f"Unsupported activation function: {activation}")
        
        # Model architecture with chosen activation
        self.model = nn.Sequential(OrderedDict([
            ('fc1', nn.Linear(2, 6)),  
            ('activation1', act_func),      
            ('fc2', nn.Linear(6, 2)),  
            ('sigmoid', nn.Sigmoid())  # Keep sigmoid for binary output
        ]))
    
    def forward(self, x):
        return self.model(x)

def train_and_evaluate(activation):
    print(f"\n--- Testing {activation.upper()} Activation ---")
    
    
    data = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
    labels = torch.tensor([[0, 0], [0, 1], [0, 1], [1, 0]], dtype=torch.float32)

    model = ActivationBitAdder(activation)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Training loop
    for epoch in range(500):
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        if epoch % 100 == 0:
            print(f'Epoch [{epoch}/500], Loss: {loss.item():.4f}')

    with torch.no_grad():
        predictions = model(data)
        print("\nPredictions:")
        for inp, pred, exp in zip(data, predictions, labels):
            print(f"{inp.numpy()} | Pred: {pred.numpy()} | Expected: {exp.numpy()}")

activations = ['relu', 'tanh', 'leaky_relu', 'elu', 'selu', 'hardswish', 'mish']
for act in activations:
    train_and_evaluate(act)


--- Testing RELU Activation ---
Epoch [0/500], Loss: 0.2463
Epoch [100/500], Loss: 0.1431
Epoch [200/500], Loss: 0.1285
Epoch [300/500], Loss: 0.1262
Epoch [400/500], Loss: 0.1257

Predictions:
[0. 0.] | Pred: [0.01802737 0.0241109 ] | Expected: [0. 0.]
[0. 1.] | Pred: [0.49664798 0.5003635 ] | Expected: [0. 1.]
[1. 0.] | Pred: [0.03156736 0.97221845] | Expected: [0. 1.]
[1. 1.] | Pred: [0.4966451 0.5003668] | Expected: [1. 0.]

--- Testing TANH Activation ---
Epoch [0/500], Loss: 0.2618
Epoch [100/500], Loss: 0.0804
Epoch [200/500], Loss: 0.0077
Epoch [300/500], Loss: 0.0027
Epoch [400/500], Loss: 0.0015

Predictions:
[0. 0.] | Pred: [0.02230498 0.02544831] | Expected: [0. 0.]
[0. 1.] | Pred: [0.028559 0.977457] | Expected: [0. 1.]
[1. 0.] | Pred: [0.03812669 0.9649823 ] | Expected: [0. 1.]
[1. 1.] | Pred: [0.9606348  0.02700343] | Expected: [1. 0.]

--- Testing LEAKY_RELU Activation ---
Epoch [0/500], Loss: 0.2798
Epoch [100/500], Loss: 0.0616
Epoch [200/500], Loss: 0.0074
Epoch [30

In [8]:
#Best Example(not to show)
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

class AdvancedBitAdder(nn.Module):
    def __init__(self):
        super(AdvancedBitAdder, self).__init__()
        
        # More sophisticated architecture with residual connections
        self.model = nn.Sequential(
            nn.Linear(2, 8),  # Expanded input layer
            nn.BatchNorm1d(8),  # Batch normalization for stability
            nn.ReLU(),
            
            # Residual block
            nn.Sequential(
                nn.Linear(8, 8),
                nn.ReLU(),
                nn.Linear(8, 8),
            ),
            
            nn.ReLU(),
            nn.Linear(8, 4),
            nn.ReLU(),
            nn.Linear(4, 2),
            nn.Sigmoid()
        )
        
        # Xavier initialization for better weight distribution
        self.apply(self._init_weights)
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
    
    def forward(self, x):
        return self.model(x)

# Create custom learning rate scheduler
class CosineAnnealingWarmRestarts(optim.lr_scheduler._LRScheduler):
    def __init__(self, optimizer, T_0, T_mult=1, eta_min=0, last_epoch=-1):
        self.T_0 = T_0
        self.T_mult = T_mult
        self.eta_min = eta_min
        self.current_period = T_0
        self.next_restart = T_0
        super().__init__(optimizer, last_epoch)
    
    def get_lr(self):
        return [self.eta_min + (base_lr - self.eta_min) * 
                (1 + np.cos(np.pi * self.last_epoch / self.current_period)) / 2
                for base_lr in self.base_lrs]
    
    def step(self, epoch=None):
        if epoch is None:
            epoch = self.last_epoch + 1
        
        if epoch >= self.next_restart:
            self.current_period *= self.T_mult
            self.next_restart += self.current_period
        
        super().step(epoch)

# Prepare training data
data = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
labels = torch.tensor([[0, 0], [0, 1], [0, 1], [1, 0]], dtype=torch.float32)

# Initialize model and optimizer
model = AdvancedBitAdder()
optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=1e-5)
criterion = nn.MSELoss()  # Mean Squared Error instead of Cross Entropy

# Custom learning rate scheduler
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=100, T_mult=2, eta_min=0.001)

# Training loop with more advanced techniques
epochs = 1500
best_loss = float('inf')
early_stop_counter = 0

for epoch in range(epochs):
    # Zero gradients
    optimizer.zero_grad()
    
    # Forward pass
    outputs = model(data)
    
    # Compute loss
    loss = criterion(outputs, labels)
    
    # Backward pass with gradient clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    loss.backward()
    
    # Optimize
    optimizer.step()
    scheduler.step(epoch)
    
    # Early stopping mechanism
    if loss.item() < best_loss:
        best_loss = loss.item()
        early_stop_counter = 0
        # Optional: Save best model
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        early_stop_counter += 1
    
    # Print progress
    if epoch % 100 == 0:
        print( f'Epoch [{epoch}/{epochs}], Loss: {loss.item():.4f}, '
              f'Learning Rate: {scheduler.get_lr()[0]:.6f}')
    
    # Early stopping if no improvement
    if early_stop_counter > 300:
        print("Early stopping triggered")
        break

# Test the model
print("\nModel Predictions:")
with torch.no_grad():
    predictions = model(data)
    print("Inputs | Predictions | Expected")
    for inp, pred, exp in zip(data, predictions, labels):
        print(f"{inp.numpy()} | {pred.numpy()} | {exp.numpy()}")

Epoch [0/1500], Loss: 0.2518, Learning Rate: 0.010000
Epoch [100/1500], Loss: 0.0004, Learning Rate: 0.005500
Epoch [200/1500], Loss: 0.0002, Learning Rate: 0.001000
Epoch [300/1500], Loss: 0.0001, Learning Rate: 0.002318
Epoch [400/1500], Loss: 0.0001, Learning Rate: 0.001000
Epoch [500/1500], Loss: 0.0001, Learning Rate: 0.002318
Epoch [600/1500], Loss: 0.0000, Learning Rate: 0.005500
Epoch [700/1500], Loss: 0.0000, Learning Rate: 0.001343
Epoch [800/1500], Loss: 0.0000, Learning Rate: 0.001000
Epoch [900/1500], Loss: 0.0000, Learning Rate: 0.001343
Epoch [1000/1500], Loss: 0.0000, Learning Rate: 0.002318
Epoch [1100/1500], Loss: 0.0000, Learning Rate: 0.003778
Epoch [1200/1500], Loss: 0.0000, Learning Rate: 0.005500
Epoch [1300/1500], Loss: 0.0000, Learning Rate: 0.007222
Epoch [1400/1500], Loss: 0.0000, Learning Rate: 0.008682

Model Predictions:
Inputs | Predictions | Expected
[0. 0.] | [0.00192856 0.00213916] | [0. 0.]
[0. 1.] | [2.5185346e-04 9.9849236e-01] | [0. 1.]
[1. 0.] | [