In [None]:
# The benefits of randomly exploring the parameter space of a model

## Perceptron Parameter Search Space

We will explore the search space of parameters of a simple perceptron and plot the loss against the perceptron's weight and bias.

We will create a simple dataset, define a perceptron model, compute the loss for different parameter values, and visualize the results.

In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import time

# Build a small synthetic regression dataset (seeded so reruns give the same data)
torch.manual_seed(0)
X = torch.rand(100, 1)  # 100 samples with one feature each
noise = 0.1 * torch.randn(100, 1)
y = 3 * X + 1 + noise  # noisy line with slope 3 and intercept 1
# Min-max rescale the targets so that they span exactly [0, 1]
y_low, y_high = y.min(), y.max()
y = (y - y_low) / (y_high - y_low)

# Define a simple perceptron model
class SimplePerceptron(nn.Module):
    """Single linear unit mapping one input feature to one output.

    The only learnable parameters are the weight and bias of ``self.linear``.
    """

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(in_features=1, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        prediction = self.linear(x)
        return prediction

# Mean squared error loss function (averaged over all elements)
criterion = nn.MSELoss(reduction='mean')

# Uniform exploration of parameter space
def uniform_exploration(weights, biases):
    """Evaluate the perceptron loss at every (weight, bias) pair of a grid.

    Reads the module-level ``X``, ``y``, ``criterion`` and ``SimplePerceptron``.

    Args:
        weights: Sized, re-iterable collection of candidate weight values.
        biases: Sized, re-iterable collection of candidate bias values.

    Returns:
        A tuple ``(loss_values, best_w, best_b, min_loss, execution_time)``.
        ``loss_values`` is an array with one ``[w, b, loss]`` row per grid
        point (weights vary slowest); ``best_w``, ``best_b`` and ``min_loss``
        come from the row with the smallest loss; ``execution_time`` is the
        elapsed wall-clock time in seconds.

    Raises:
        ValueError: If ``weights`` or ``biases`` is empty.
    """
    if len(weights) == 0 or len(biases) == 0:
        raise ValueError('weights and biases must each contain at least one value')

    start_time = time.time()
    loss_values = []

    # One model is enough: both of its parameters are overwritten for every
    # grid point, so rebuilding the module inside the loop is wasted work.
    # Note: this draws from torch's global RNG once instead of once per point.
    model = SimplePerceptron()

    # Pure evaluation: no gradients are needed, so skip building autograd graphs.
    with torch.no_grad():
        for w in weights:
            model.linear.weight.fill_(w)
            for b in biases:
                model.linear.bias.fill_(b)
                loss = criterion(model(X), y)
                loss_values.append([w, b, loss.item()])

    loss_values = np.array(loss_values)
    idx_min_loss = np.argmin(loss_values[:, 2])
    best_w, best_b, min_loss = loss_values[idx_min_loss]
    execution_time = time.time() - start_time

    return loss_values, best_w, best_b, min_loss, execution_time

# Gradient descent optimization
def gradient_descent(model, optimizer, n_epochs):
    """Fit ``model`` to the module-level ``X`` / ``y`` with full-batch updates.

    Args:
        model: Module to train; it is updated in place.
        optimizer: Optimizer already bound to the model's parameters.
        n_epochs: Number of update steps to perform.

    Returns:
        ``(model, losses, execution_time)`` where ``losses`` holds the loss
        measured just before each update and ``execution_time`` is the
        elapsed wall-clock time in seconds.
    """
    t0 = time.time()
    history = []

    for _ in range(n_epochs):
        optimizer.zero_grad()
        step_loss = criterion(model(X), y)
        step_loss.backward()
        history.append(step_loss.item())
        optimizer.step()

    elapsed = time.time() - t0
    return model, history, elapsed

# Parameter grid: 100 evenly spaced values in [0, 1] along each axis
weights, biases = (np.linspace(0, 1, 100) for _ in range(2))

# Evaluate the loss at every grid point
grid_search_result = uniform_exploration(weights, biases)
loss_values, best_w, best_b, min_loss, uniform_time = grid_search_result

# Plot the 3D search space: one point per (weight, bias) pair; height and colour encode the loss
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
grid_w, grid_b, grid_loss = loss_values[:, 0], loss_values[:, 1], loss_values[:, 2]
ax.scatter(grid_w, grid_b, grid_loss, c=grid_loss, cmap='viridis')
# The two horizontal axes are the model's parameters, so label them w and b
ax.set_xlabel('Weight (w)')
ax.set_ylabel('Bias (b)')
ax.set_zlabel('Loss')
ax.set_title('Loss Landscape of a Simple Perceptron')
plt.show()

print(f'Uniform Exploration - Best weight: {best_w}, Best bias: {best_b}, Minimum loss: {min_loss}, Time: {uniform_time:.4f} seconds')

# Rebuild the perceptron at the best grid point found by the exploration
best_model = SimplePerceptron()
best_model.linear.weight.data.fill_(best_w)
best_model.linear.bias.data.fill_(best_b)

# Overlay that model's predictions (red line) on the data points
uniform_fit = best_model(X).detach().numpy()
uniform_fig, uniform_ax = plt.subplots(figsize=(10, 8))
uniform_ax.scatter(X, y)
uniform_ax.plot(X, uniform_fit, color='red', linewidth=3)
uniform_ax.set_xlabel('x')
uniform_ax.set_ylabel('y')
uniform_ax.set_title('Best Fit Line (Uniform Exploration)')
plt.show()

# Train a freshly initialised perceptron with SGD for 1000 steps
model = SimplePerceptron()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, losses, grad_time = gradient_descent(model, optimizer, n_epochs=1000)

# Loss recorded at every step of the run
curve_fig, curve_ax = plt.subplots(figsize=(10, 8))
curve_ax.plot(losses)
curve_ax.set_xlabel('Epoch')
curve_ax.set_ylabel('MSE Loss')
curve_ax.set_title('Loss Curve (Gradient Descent)')
plt.show()

# losses[-1] is the loss measured just before the final update
print(f'Gradient Descent - Best weight: {model.linear.weight.item()}, Best bias: {model.linear.bias.item()}, Minimum loss: {losses[-1]}, Time: {grad_time:.4f} seconds')

# Overlay the trained model's predictions (red line) on the data points
gd_fit = model(X).detach().numpy()
gd_fig, gd_ax = plt.subplots(figsize=(10, 8))
gd_ax.scatter(X, y)
gd_ax.plot(X, gd_fit, color='red', linewidth=3)
gd_ax.set_xlabel('x')
gd_ax.set_ylabel('y')
gd_ax.set_title('Best Fit Line (Gradient Descent)')
plt.show()


## Neural Network Parameter Search Space on MNIST Dataset

In this section, we explore the parameter search space of a neural network on the MNIST dataset. We compare uniform exploration (evaluating networks whose parameters are sampled uniformly at random) with gradient descent.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import time
import tqdm

In [None]:
# Load the MNIST dataset
# Preprocessing: convert each image to a tensor, then normalise with mean 0.5 and std 0.5
preprocessing_steps = [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
transform = transforms.Compose(preprocessing_steps)
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
# NOTE(review): a batch size of 100000 presumably exceeds the number of training
# images, so each pass over the loader would yield one full batch - confirm.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=100000, shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Define a simple neural network model
class SimpleNN(nn.Module):
    """Fully connected network 784 -> 128 -> 64 -> 10 with ReLU after the first two layers."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        """Flatten ``x`` into rows of 784 values and return 10 raw scores per row."""
        flat = x.view(-1, 28 * 28)
        hidden = self.fc1(flat).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

# Cross-entropy loss function (averaged over the batch)
criterion = nn.CrossEntropyLoss(reduction='mean')

# Uniform exploration of parameter space
def uniform_exploration(n_samples):
    """Score ``n_samples`` randomly initialised networks on the training data.

    Every parameter of each ``SimpleNN`` is drawn uniformly from [-1, 1); no
    training takes place. Relies on the module-level ``train_loader``,
    ``criterion`` and ``device``.

    Args:
        n_samples: Number of random networks to evaluate (at least 1).

    Returns:
        ``(loss_values, min_loss, execution_time, best_model)``.
        ``loss_values`` has exactly one row per network,
        ``[mean of fc1.weight, mean of fc1.bias, loss]``, where the loss is
        the sample-weighted average over all batches of ``train_loader``.
        ``best_model`` is the network with the smallest loss; it lives on
        ``device``.

    Raises:
        ValueError: If ``n_samples`` is smaller than 1.
    """
    if n_samples < 1:
        raise ValueError('n_samples must be at least 1')

    start_time = time.time()
    loss_values = []
    models = []

    for _ in tqdm.tqdm(range(n_samples)):
        model = SimpleNN().to(device)

        # Evaluation only: disabling autograd avoids keeping a graph for the
        # whole dataset in memory.
        with torch.no_grad():
            # A single pass over parameters() already covers every layer.
            for param in model.parameters():
                param.uniform_(-1, 1)

            # Accumulate a sample-weighted loss so each model contributes
            # exactly one row, keeping loss_values aligned with models even
            # when the loader yields several batches.
            total_loss = 0.0
            n_seen = 0
            for images, labels in train_loader:
                images, labels = images.to(device), labels.to(device)
                batch_loss = criterion(model(images), labels)
                total_loss += batch_loss.item() * labels.size(0)
                n_seen += labels.size(0)

        models.append(model)
        loss_values.append([
            model.fc1.weight.mean().item(),
            model.fc1.bias.mean().item(),
            total_loss / n_seen,
        ])

    loss_values = np.array(loss_values)
    idx_min_loss = np.argmin(loss_values[:, 2])
    min_loss = loss_values[idx_min_loss, 2]
    execution_time = time.time() - start_time

    return loss_values, min_loss, execution_time, models[idx_min_loss]

# Gradient descent optimization
def gradient_descent(model, optimizer, n_epochs):
    """Train ``model`` on the batches of the module-level ``train_loader``.

    Args:
        model: Network to train; it is put in training mode, moved to
            ``device`` and updated in place.
        optimizer: Optimizer already bound to the model's parameters.
        n_epochs: Number of passes over ``train_loader``.

    Returns:
        ``(model, losses, execution_time)`` where ``losses`` holds one value
        per processed batch (measured before that batch's update) and
        ``execution_time`` is the elapsed wall-clock time in seconds.
    """
    t0 = time.time()
    history = []

    model.train()
    model.to(device)
    for _ in tqdm.tqdm(range(n_epochs)):
        for batch_images, batch_labels in train_loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_images), batch_labels)
            batch_loss.backward()
            history.append(batch_loss.item())
            optimizer.step()

    elapsed = time.time() - t0
    return model, history, elapsed

In [None]:
# Score 10 randomly initialised networks and keep the one with the lowest loss
exploration_result = uniform_exploration(n_samples=10)
loss_values, min_loss, uniform_time, best_model = exploration_result
print(f'Uniform Exploration - Minimum loss: {min_loss}, Time: {uniform_time:.4f} seconds')

In [None]:
# Train a freshly initialised network with SGD for 5 epochs
model = SimpleNN()
optimizer = optim.SGD(model.parameters(), lr=0.1)
model, losses, grad_time = gradient_descent(model, optimizer, n_epochs=5)

# Loss recorded for every processed batch
mnist_curve_fig, mnist_curve_ax = plt.subplots(figsize=(10, 8))
mnist_curve_ax.plot(losses)
mnist_curve_ax.set_xlabel('Epoch')
mnist_curve_ax.set_ylabel('Cross-Entropy Loss')
mnist_curve_ax.set_title('Loss Curve (Gradient Descent)')
plt.show()

# losses[-1] is the loss measured just before the final update
print(f'Gradient Descent - Minimum loss: {losses[-1]}, Time: {grad_time:.4f} seconds')

In [None]:
# Use PCA for dimensionality reduction and visualization of multi-dimensional inputs
# The rows of loss_values are projected onto two components; points are coloured by
# the third column (the loss).
pca = PCA(n_components=2)
pca_result = pca.fit_transform(loss_values)

plt.figure(figsize=(10, 8))
plt.scatter(pca_result[:, 0], pca_result[:, 1], c=loss_values[:, 2], cmap='viridis')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('PCA of Loss Landscape')
plt.colorbar(label='Loss')
plt.show()

# Use t-SNE for dimensionality reduction and visualization of multi-dimensional inputs
# t-SNE requires perplexity < number of rows. The library default of 30 is too large
# for the handful of rows produced by the exploration, so cap it at n_rows - 1.
tsne_perplexity = min(30.0, float(len(loss_values) - 1))
tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42)
tsne_result = tsne.fit_transform(loss_values)

plt.figure(figsize=(10, 8))
plt.scatter(tsne_result[:, 0], tsne_result[:, 1], c=loss_values[:, 2], cmap='viridis')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.title('t-SNE of Loss Landscape')
plt.colorbar(label='Loss')
plt.show()

In [None]:
# compare outputs between the best model from uniform exploration and gradient descent
def compare_outputs(model1, model2):
    """Predict classes for one batch of ``train_loader`` with two models.

    Both models are switched to evaluation mode and moved to the module-level
    ``device`` (in place) so that they sit on the same device as the batch.

    Args:
        model1: First model to evaluate.
        model2: Second model to evaluate.

    Returns:
        ``(images, predicted1, predicted2)``: the batch of images (on
        ``device``) and, for each model, the index of the highest output
        score per image.
    """
    # A model left on the CPU would fail against a batch placed on the GPU.
    model1.to(device)
    model2.to(device)
    model1.eval()
    model2.eval()

    images, _ = next(iter(train_loader))
    images = images.to(device)

    # Inference only: no autograd graph is needed to read off the predictions.
    with torch.no_grad():
        _, predicted1 = torch.max(model1(images), 1)
        _, predicted2 = torch.max(model2(images), 1)
    return images, predicted1, predicted2

images, predicted_uniform, predicted_grad = compare_outputs(best_model, model)