In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import math
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

In [11]:
class MLayerNN(nn.Module):
    def __init__(self, in_dim, out_dim, depth, N_params):
        super(MLayerNN, self).__init__()
        self.test_loss = []
        self.train_loss = []
        # Initialize layer sizes list
        layer_dims = [in_dim]  # First layer is input dim
        h = int(N_params / ((in_dim + out_dim) * (depth)))
        while (in_dim * h + (depth-3) * h**2 + h * out_dim) > N_params:
            h -= 1
        #h = 4096 ### Number params hidden layer
        layer_dims += [int(h)] * (depth - 2)  # Set all hidden layers to hidden_dim
        layer_dims.append(out_dim)  # Last layer is output dim
        print(layer_dims)
        self.layers = nn.ModuleList([
            nn.Linear(layer_dims[i-1], layer_dims[i]) for i in range(1,depth)
        ])
        self.params = sum(p.numel() for p in self.parameters() if p.ndimension() > 1)
        self.activation = nn.ReLU()
        
    def forward(self, x):
        for layer in self.layers[:-1]:
            x = self.activation(layer(x))  # Apply ReLU activation
        return self.layers[-1](x)  # Final layer without activation

In [12]:
#### FRIEDMANN 1
from sklearn.datasets import make_friedman1
def get_loader(in_dim, noise, n_samples = 20000):
    # Set the seed for reproducibility
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Generate the Friedmann dataset
    X_train, y = make_friedman1(n_samples=int(n_samples * 0.8), n_features= in_dim, random_state=seed, noise=noise)
    y_train = np.expand_dims(y, axis=1)
    # Split into train and test sets (80% train, 20% test)
    X_test, y = make_friedman1(n_samples=int(n_samples * 0.2), n_features= in_dim, random_state=seed, noise=0.0)
    y_test = np.expand_dims(y, axis=1)
    # Convert numpy arrays to PyTorch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

    # Create TensorDataset for train and test sets
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

    # Set batch size and create DataLoader for training and testing
    batch_size = 1024
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
    return train_loader, test_loader



In [13]:
@torch.no_grad()
def compute_test_loss(test_loader, model):
    criterion = torch.nn.MSELoss()
    running_loss = 0.
    for batch, target in test_loader:
        outputs = model(batch)
        loss = criterion(target, outputs)
        running_loss += loss.item()
    return running_loss / len(test_loader)

In [18]:
model = MLayerNN(5, 1, depth = 3 , N_params = 3*75000)
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(trainable_params)

[5, 12500, 1]
87501


In [42]:
in_dim = 5
model = model = MLayerNN(5, 1, depth = 3 , N_params = 3*75000)  # replace with KAN_NN_fast.Neural_Kan(...) or any model
model.train()

inputs = torch.randn(32, in_dim)  # adjust input size as needed
targets = torch.randn(32, 1)      # adjust target shape as needed
criterion = nn.MSELoss()
optimizer = torch.optim.RAdam(model.parameters(), lr=0.001)

with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU],  # or add CUDA if using GPU
    record_shapes=True,
    profile_memory=True,
    with_stack=True
) as prof:
    # Forward pass
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    
    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Print top 20 most expensive ops (including backward)
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))

[5, 12500, 1]
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                              Optimizer.step#RAdam.step        18.39%       1.225ms        44.35%       2.954ms       2.954ms     683.62 Kb    -683.60 Kb             1  
    autograd::engine::evaluate_function: AddmmBackward0         1.13%      75.000us        19.92%       1.327ms     663.500us     341.68 Kb      -1.53 Mb             2  
     autograd::engine::evaluate_function: ReluBackward0         0.26%      17.000us        19.59%       1.305ms       1.305ms      -1.53

In [45]:
import time
import dill
n_samples = 20000
in_dims = [5,100]
noises = [1.]
epochs = 1000
train_losses = []
test_losses = []
adjust = True
for j,in_dim in enumerate(in_dims):
    for noise in noises:
        param = 1
        train_loader, test_loader = get_loader(in_dim, noise,n_samples)
        model = MLayerNN(in_dim, 1, depth = 3 , N_params = 3*75000)
        model.train()
        optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0)
        criterion = torch.nn.MSELoss()
        for epoch in range(epochs):
            running_loss = 0.0 
            for batch, target in train_loader:
                start_time = time.time()
                optimizer.zero_grad()
                outputs = model(batch)
                loss = criterion(target, outputs)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
            avg_loss = running_loss / len(train_loader)
            model.train_loss.append(avg_loss)
            test_l = compute_test_loss(test_loader, model)
            model.test_loss.append(test_l)
            print(f"Epoch [{epoch + 1}/{epochs}], Loss: {avg_loss:.6f}, test: {test_l:.6f}, lr: {optimizer.param_groups[0]['lr']:6f}")
        plt.plot(model.train_loss[-50:])
        plt.title(f'train_loss')
        plt.legend()
        plt.show()
        print("Training Complete!")
        #with open(f"models/NN_2048{noise}_{in_dim}.dill", "wb") as f:
        #    dill.dump(model, f)

[5, 12500, 1]
Epoch [1/1000], Loss: 60.983673, test: 29.283024, lr: 0.001000
Epoch [2/1000], Loss: 16.265793, test: 12.011430, lr: 0.001000
Epoch [3/1000], Loss: 9.712362, test: 7.008840, lr: 0.001000
Epoch [4/1000], Loss: 7.390205, test: 5.596724, lr: 0.001000
Epoch [5/1000], Loss: 6.452611, test: 5.007449, lr: 0.001000
Epoch [6/1000], Loss: 5.876878, test: 4.392658, lr: 0.001000
Epoch [7/1000], Loss: 5.277183, test: 3.783074, lr: 0.001000
Epoch [8/1000], Loss: 4.663867, test: 3.162368, lr: 0.001000
Epoch [9/1000], Loss: 4.032223, test: 2.541576, lr: 0.001000
Epoch [10/1000], Loss: 3.383700, test: 1.955919, lr: 0.001000
Epoch [11/1000], Loss: 2.806610, test: 1.456875, lr: 0.001000
Epoch [12/1000], Loss: 2.349221, test: 1.112297, lr: 0.001000
Epoch [13/1000], Loss: 2.046224, test: 0.893815, lr: 0.001000
Epoch [14/1000], Loss: 1.851869, test: 0.739606, lr: 0.001000
Epoch [15/1000], Loss: 1.706030, test: 0.623170, lr: 0.001000
Epoch [16/1000], Loss: 1.576598, test: 0.512677, lr: 0.001000

KeyboardInterrupt: 