In [None]:
def parameter_study():
    # Get test functions
    functions = get_test_functions()

    # Test parameters
    sig_max_values = np.linspace(0.001, 0.1, 10)
    sig_min_values = np.linspace(0.0001, 0.01, 10)
    max_iter = 1000

    # Define optimizers to test
    optimizers = ['sgd', 'newton', 'adam', 'adamw', 'rmsprop']

    # Create results directory
    os.makedirs('optimization_results', exist_ok=True)

    # Main summary file
    with open('optimization_results/test_functions_info.txt', 'w') as f:
        f.write("Test Functions Information\n")
        f.write("======================\n\n")
        f.write("Optimizers tested: " + ", ".join(optimizers) + "\n\n")
        for name, func in functions.items():
            f.write(f"\n{func.name}:\n")
            f.write(f"Description: {func.description}\n")
            f.write(f"Global minimum point(s): {func.global_minimum_point}\n")
            f.write(f"Global minimum value: {func.global_minimum_value}\n")
            f.write(f"Domain: {func.domain}\n")
            f.write(f"Recommended starting point: {func.recommended_x0}\n")
            f.write("-" * 50 + "\n")

    # Test each function
    for func_name, f_obj in functions.items():
        print(f"\nTesting function: {func_name}")
        func_dir = f'optimization_results/{func_name}'
        os.makedirs(func_dir, exist_ok=True)

        # Dictionary to store results for all optimizers
        optimizer_results = {}

        # Test each optimizer
        for optimizer in optimizers:
            print(f"\nTesting optimizer: {optimizer}")
            opt_dir = f'{func_dir}/{optimizer}'
            os.makedirs(opt_dir, exist_ok=True)

            all_results = {}

            # Create a single figure for all parameter combinations
            if optimizer == 'sgd':
                plt.figure(figsize=(15, 10))

                # Create colormap for parameter combinations
                n_combinations = len(sig_max_values) * len(sig_min_values)
                colors = plt.cm.viridis(np.linspace(0, 1, n_combinations))
                color_idx = 0

                # Only SGD uses the stochastic parameters
                for sig_max, sig_min in product(sig_max_values, sig_min_values):
                    if sig_min >= sig_max:
                        continue

                    print(f"  sig_max={sig_max:.3f}, sig_min={sig_min:.3f}")
                    result = run_optimization_test(f_obj, sig_max, sig_min, max_iter, optimizer)

                    key = f"max_{sig_max:.3f}_min_{sig_min:.3f}"
                    all_results[key] = result

                    if result['success']:
                        iterations = range(len(result['distances']))
                        plt.plot(iterations, result['distances'],
                                label=f"σ_max={sig_max:.3f}, σ_min={sig_min:.3f}",
                                color=colors[color_idx], alpha=0.5)
                    color_idx += 1

                plt.xlabel('Iterations')
                plt.ylabel(f'Distance from Global Minimum')
                plt.yscale('log')
                plt.title(f'{optimizer.upper()} on {func_name} Function')
                plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
                plt.tight_layout()
                plt.savefig(f'{opt_dir}/convergence.png', dpi=300, bbox_inches='tight')
                plt.close()

            else:
                # Other optimizers don't use stochastic parameters
                result = run_optimization_test(f_obj, 0, 0, max_iter, optimizer)
                all_results['standard'] = result

                # Plot individual optimizer results
                plt.figure(figsize=(15, 10))
                if result['success']:
                    iterations = range(len(result['distances']))
                    plt.plot(iterations, result['distances'], label='standard', alpha=0.7)

                plt.xlabel('Iterations')
                plt.ylabel(f'Distance from Global Minimum')
                plt.yscale('log')
                plt.title(f'{optimizer.upper()} on {func_name} Function')
                plt.legend()
                plt.tight_layout()
                plt.savefig(f'{opt_dir}/convergence.png', dpi=300)
                plt.close()

            optimizer_results[optimizer] = all_results

            # Save detailed results for this optimizer
            with open(f'{opt_dir}/summary.txt', 'w') as f:
                f.write(f"Results for {optimizer.upper()} on {func_name} Function\n")
                f.write("="*50 + "\n\n")
                for key, result in all_results.items():
                    f.write(f"\nParameters: {key}\n")
                    if result['success']:
                        f.write(f"Final distance: {result['final_distance']:.6f}\n")
                        f.write(f"Final value: {result['final_value']:.6f}\n")
                        f.write(f"Iterations: {result['iterations']}\n")
                    else:
                        f.write("Optimization failed\n")
                    f.write("-"*50 + "\n")

        # Plot comparison of all optimizers
        plt.figure(figsize=(15, 10))
        for optimizer, results in optimizer_results.items():
            # For SGD, take the best performing configuration
            if optimizer == 'sgd':
                best_result = min(results.values(),
                                key=lambda x: x['final_distance'] if x['success'] else float('inf'))
            else:
                best_result = results['standard']

            if best_result['success']:
                iterations = range(len(best_result['distances']))
                plt.plot(iterations, best_result['distances'], label=optimizer.upper(), alpha=0.7)

        plt.xlabel('Iterations')
        plt.ylabel('Distance from Global Minimum')
        plt.yscale('log')
        plt.title(f'Optimizer Comparison on {func_name} Function')
        plt.legend()
        plt.tight_layout()
        plt.savefig(f'{func_dir}/optimizer_comparison.png', dpi=300)
        plt.close()

# Run the parameter study
parameter_study()


Testing function: sphere

Testing optimizer: sgd
  sig_max=0.001, sig_min=0.000
  sig_max=0.012, sig_min=0.000
  sig_max=0.012, sig_min=0.001
  sig_max=0.012, sig_min=0.002
  sig_max=0.012, sig_min=0.003
  sig_max=0.012, sig_min=0.005
  sig_max=0.012, sig_min=0.006
  sig_max=0.012, sig_min=0.007
  sig_max=0.012, sig_min=0.008
  sig_max=0.012, sig_min=0.009
  sig_max=0.012, sig_min=0.010
  sig_max=0.023, sig_min=0.000
  sig_max=0.023, sig_min=0.001
  sig_max=0.023, sig_min=0.002
  sig_max=0.023, sig_min=0.003
  sig_max=0.023, sig_min=0.005
  sig_max=0.023, sig_min=0.006
  sig_max=0.023, sig_min=0.007
  sig_max=0.023, sig_min=0.008
  sig_max=0.023, sig_min=0.009
  sig_max=0.023, sig_min=0.010
  sig_max=0.034, sig_min=0.000
  sig_max=0.034, sig_min=0.001
  sig_max=0.034, sig_min=0.002
  sig_max=0.034, sig_min=0.003
  sig_max=0.034, sig_min=0.005
  sig_max=0.034, sig_min=0.006
  sig_max=0.034, sig_min=0.007
  sig_max=0.034, sig_min=0.008
  sig_max=0.034, sig_min=0.009
  sig_max=0.034, sig

  plt.tight_layout()



Testing optimizer: newton

Testing optimizer: adam

Testing optimizer: adamw

Testing optimizer: rmsprop

Testing function: rosenbrock

Testing optimizer: sgd
  sig_max=0.001, sig_min=0.000
  sig_max=0.012, sig_min=0.000
  sig_max=0.012, sig_min=0.001
  sig_max=0.012, sig_min=0.002
  sig_max=0.012, sig_min=0.003
  sig_max=0.012, sig_min=0.005
  sig_max=0.012, sig_min=0.006
  sig_max=0.012, sig_min=0.007
  sig_max=0.012, sig_min=0.008
  sig_max=0.012, sig_min=0.009
  sig_max=0.012, sig_min=0.010
  sig_max=0.023, sig_min=0.000
  sig_max=0.023, sig_min=0.001
  sig_max=0.023, sig_min=0.002
  sig_max=0.023, sig_min=0.003
  sig_max=0.023, sig_min=0.005
  sig_max=0.023, sig_min=0.006
  sig_max=0.023, sig_min=0.007
  sig_max=0.023, sig_min=0.008
  sig_max=0.023, sig_min=0.009
  sig_max=0.023, sig_min=0.010
  sig_max=0.034, sig_min=0.000
  sig_max=0.034, sig_min=0.001
  sig_max=0.034, sig_min=0.002
  sig_max=0.034, sig_min=0.003
  sig_max=0.034, sig_min=0.005
  sig_max=0.034, sig_min=0.006
  s

# Full visualisation

In [None]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.optimize import minimize
from typing import Callable

def kw_defaults(kw):
    def helper(key, val):
        return val if key not in kw.keys() else kw[key]
    return helper

def get_arg(key, val, f=str):
    s = ' '.join(sys.argv)
    t = '%s='%key
    if( t in s ):
        return f(s.split(t)[-1].split(' ')[0])
    else:
        return val



import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.optimize import minimize
from typing import Callable

import numpy as np

def gd(**kw):
    kwd = kw_defaults(kw)

    #REQUIRED FIELDS
    f = kw['f']
    g = kw['grad']
    x0 = kw['x0']

    #OPTIONAL FIELDS
    tau = kwd('tau', 1e-15)
    c_armijo = kwd('c_armijo', 1e-4)
    c_curvature = kwd('c_curvature', 1e-2)
    alpha = kwd('alpha', 1.0)
    mode = kwd('mode', 'backtracking')
    max_iter = kwd('max_iter', 100)
    verbose = kwd('verbose', True)
    stochastic = kwd('stochastic', lambda x,it: np.zeros(x0.shape))

    #Setup iteration variables
    d = len(x0)
    x = x0
    beta = alpha

    #Setup return values
    x_history = [x]
    f_history = [f(x)]
    g_history = [g(x)]
    g_norm_history = [np.linalg.norm(g_history[-1])]
    r_history = []

    sep = '     '
    print('it: 0%sx: [%s]%sf: %.5e%sgrad_norm: %.5e%sgrad: [%s]%sstep_length: %.5e'%(
        sep,
        ','.join(['%.5e'%e for e in x]), sep,
        f_history[-1], sep,
        g_norm_history[-1], sep,
        ','.join(['%.5e'%e for e in g_history[-1]]), sep,
        beta))

    it = 0
    while(it < max_iter):
        phi = lambda gamma : f(x - gamma * g(x))
        # Derivative of phi with respect to gamma
        phi_prime = lambda gamma : -np.dot(g(x - gamma * g(x)), g(x))

        # Armijo condition: f(x_new) ≤ f(x) - c₁ α ∇f(x)ᵀp
        armijo = lambda gamma : phi(gamma) <= phi(0) - c_armijo * gamma * np.dot(g(x), g(x))

        # Curvature condition: -∇f(x_new)ᵀp ≥ c₂ ∇f(x)ᵀp
        curv = lambda gamma : -phi_prime(gamma) <= -c_curvature * phi_prime(0)

        beta = alpha

        if(mode == 'backtracking'):
            # Check if either Wolfe condition is violated
            while(not armijo(beta) or not curv(beta)):
                # Backtracking step: reduce step size
                beta *= 0.5
                if beta < 1e-10:  # Prevent step size from becoming too small
                    print("Warning: Step size became too small")
                    break

        elif(mode == 'exact'):
            #Exact line search done...note that we have
            #    call an external package to solve an auxiliary opt. problem.
            tmp = minimize(phi, 0.0)
            beta = tmp['x'][0]

        # Update step with stochastic term
        # x_new = x_current - step_size * gradient + stochastic_term
        x = x - beta * g(x) + stochastic(x, it)

        #Update loop variables
        x_history.append(x)
        f_history.append(f(x))
        g_history.append(g(x))
        g_norm_history.append(np.linalg.norm(g_history[-1]))
        r_history.append(abs(f_history[-2] - f_history[-1]))

        it += 1

        #Print debug info
        if(verbose):
            print('it: %d%sx: [%s]%sf: %.5e%sgrad_norm: %.5e%sgrad: [%s]%sstep_length: %.5e%sr: %.5e'%(
                it, sep,
                ','.join(['%.5e'%e for e in x]), sep,
                f_history[-1], sep,
                g_norm_history[-1], sep,
                ','.join(['%.5e'%e for e in g_history[-1]]), sep,
                beta,sep,
                r_history[-1]))

        if(g_norm_history[-1] < tau):
            print('Gradient norm = %.5e < %.5e...breaking'%(np.linalg.norm(g(x)), tau))
            break

    #return values
    return {'x': x_history[-1],
            'val': f_history[-1],
            'x_history': x_history,
            'f_history': f_history,
            'grad_norm_history': g_norm_history,
            'grad_history': g_history,
            'residual': r_history,
            'iterations': it}


def plot_progress(f, x_history, output_name, x_box, y_box, num_levels):
    if(len(x_history) == 0):
        raise(Exception('Empty history list...cannot plot'))

    if(len(x_history[0]) > 2):
        raise(Exception('Domain of dimension %d > 2...cannot plot'%(len(x_history[0]))))

    plt.figure(figsize=(12, 8), dpi=300)  # Higher DPI for better PNG quality

    if(len(x_history[0]) == 1):
        x_history = np.ndarray.flatten(np.array(x_history))
        x_min = min(x_history)
        x_max = max(x_history)
        x = np.array([np.array([e]) for e in np.linspace(x_min, x_max, 1000)])
        colors = cm.rainbow(np.linspace(0, 1, len(x_history)))
        for xx, c in zip(x_history, colors):
            plt.scatter(xx, f(np.array([xx])), color=c)
        plt.plot(np.ndarray.flatten(x), np.array(list(map(f,x))))
        plt.xlabel('X')
        plt.ylabel('f')
    else:
        # Create the contour plot
        X, Y = np.meshgrid(x_box, y_box)
        Z = np.zeros(X.shape)
        for i in range(X.shape[0]):
            for j in range(X.shape[1]):
                Z[i][j] = f(np.array([X[i][j], Y[i][j]]))

        # Plot contour with improved resolution
        bar = plt.contourf(X, Y, Z, num_levels, cmap='viridis')
        plt.colorbar(bar)

        # Plot optimization path with rainbow colors
        colors = cm.rainbow(np.linspace(0, 1, len(x_history)))
        for i, (xx, c) in enumerate(zip(x_history, colors)):
            plt.scatter(xx[0], xx[1], color=c, s=50)
            # Add arrows between consecutive points
            if i < len(x_history) - 1:
                dx = x_history[i+1][0] - xx[0]
                dy = x_history[i+1][1] - xx[1]
                plt.arrow(xx[0], xx[1], dx, dy,
                         head_width=0.2, head_length=0.3,
                         fc=c, ec=c, alpha=0.5)

        # Highlight start and end points with larger markers
        plt.scatter(x_history[0][0], x_history[0][1], color='purple',
                   s=200, label='Start', marker='*')
        plt.scatter(x_history[-1][0], x_history[-1][1], color='red',
                   s=200, label='End', marker='*')

        plt.legend(fontsize=10)
        plt.title(f'Optimization Path ({len(x_history)} steps)', pad=20)
        plt.xlabel('x₁', fontsize=12)
        plt.ylabel('x₂', fontsize=12)

    # Save as PNG with high DPI and tight layout
    plt.tight_layout()
    plt.savefig(f'{output_name}.png', dpi=300, bbox_inches='tight')
    plt.close()
#"""




import numpy as np

def rastrigin(x):
    """
    Implements the n-dimensional Rastrigin function:
    f(x) = An + sum(x_i^2 - A*cos(2π*x_i)) where A = 10

    Parameters:
    x (np.ndarray): Input vector of length n

    Returns:
    float: Function value at x
    """
    A = 10.0
    n = len(x)
    return A*n + np.sum(x**2 - A*np.cos(2*np.pi*x))

def rastrigin_gradient(x):
    """
    Implements the gradient of the n-dimensional Rastrigin function:
    ∇f(x)_i = 2x_i + 2πA*sin(2π*x_i) where A = 10

    Parameters:
    x (np.ndarray): Input vector of length n

    Returns:
    np.ndarray: Gradient vector at x
    """
    A = 10.0
    return 2*x + 2*np.pi*A*np.sin(2*np.pi*x)

# Update your script with:
f = rastrigin
g = rastrigin_gradient



A = 10.0
n = 2

#Setup initial condition
x0 = get_arg('x_init', np.array([25.0, 60.0]), lambda x : np.array(eval(x)))
d = len(x0)

#TODO: Tune parameters for better convergence! Try higher dimensions!
#tolerance level
tau = get_arg('tau', 1e-7, float)

#armijo constant
c_armijo = get_arg('c_armijo', 1e-4, float)

#curvature constant -- note that it must be greater than armijo constant!
c_curvature = get_arg('c_curvature', 1e-2, float)

#max iterations
max_iter = get_arg('max_iter', 1000, int)

#mode -- either "exact" or "backtracking"
mode = get_arg('mode', 'exact', str)

#stochastic terms
sig_max = get_arg('sig_max', 0.05, float)  # experiment: [0.1 - done, 0.2 - done, 0.5]
sig_min = get_arg('sig_min', 0.005, float) # experiment: [0.01 - done, 0.05 - done, 0.1]
output_file = get_arg('output_file', 'output', str)

#TODO: modify rate of decay of stochastic term with iteration number
stochastic = lambda x,it : np.random.randn(d) * ( (max_iter-it) * sig_max + it * sig_min )

#run program and get output
d = gd(f=f, grad=g, x0=x0,
    tau=tau,
    c_armijo=c_armijo,
    c_curvature=c_curvature,
    max_iter=max_iter,
    mode=mode,
    stochastic=stochastic)

#print final results
print('Approx min of %f at %s after %d iterations'%(d['val'], d['x'], d['iterations']))

#define plotting window
x_box = np.linspace(-10,10,1000)
y_box = np.linspace(-10,10,1000)
num_levels = 50

#plot, if possible
try:
    plot_progress(f, d['x_history'], output_file, x_box, y_box, num_levels)
except Exception as e:
    print(e)

it: 0     x: [2.50000e+01,6.00000e+01]     f: 4.22500e+03     grad_norm: 1.30000e+02     grad: [5.00000e+01,1.20000e+02]     step_length: 1.00000e+00
it: 1     x: [8.05185e+01,-1.29965e+01]     f: 6.67207e+03     grad_norm: 1.55702e+02     grad: [1.53746e+02,-2.46010e+01]     step_length: 5.17015e-01     r: 2.44707e+03
it: 2     x: [1.03134e+01,-2.45012e+01]     f: 7.40550e+02     grad_norm: 9.23305e+01     grad: [7.85455e+01,-4.85336e+01]     step_length: 5.23816e-01     r: 5.93152e+03
it: 3     x: [7.27716e-01,-1.28405e+01]     f: 1.81418e+02     grad_norm: 6.65944e+01     grad: [-6.07615e+01,2.72552e+01]     step_length: 2.34148e-01     r: 5.59132e+02
it: 4     x: [3.65513e+00,1.91858e+01]     f: 4.03140e+02     grad_norm: 1.06031e+02     grad: [-4.46863e+01,9.61545e+01]     step_length: 4.23508e-03     r: 2.21722e+02
it: 5     x: [6.80098e+01,6.69634e+01]     f: 9.10971e+03     grad_norm: 1.84038e+02     grad: [1.39876e+02,1.19601e+02]     step_length: 2.09479e-01     r: 8.70657e+0

In [None]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.optimize import minimize
from typing import Callable

def kw_defaults(kw):
    def helper(key, val):
        return val if key not in kw.keys() else kw[key]
    return helper

def get_arg(key, val, f=str):
    s = ' '.join(sys.argv)
    t = '%s='%key
    if( t in s ):
        return f(s.split(t)[-1].split(' ')[0])
    else:
        return val

class Optimizer:
    """Unified interface for different optimization algorithms"""

    @staticmethod
    def gd(f, grad, x0, tau=1e-15, c_armijo=1e-4, c_curvature=1e-2,
           alpha=1.0, mode='backtracking', max_iter=100, verbose=True):
        """Original gradient descent with Wolfe conditions"""
        x = x0
        beta = alpha
        x_history = [x]
        f_history = [f(x)]
        g_history = [grad(x)]
        g_norm_history = [np.linalg.norm(g_history[-1])]
        r_history = []

        sep = '     '
        if verbose:
            print(f'it: 0{sep}x: [{",".join(["%.5e"%e for e in x])}]{sep}f: {f_history[-1]:.5e}')

        it = 0
        while it < max_iter:
            phi = lambda gamma: f(x - gamma * grad(x))
            phi_prime = lambda gamma: -np.dot(grad(x - gamma * grad(x)), grad(x))
            armijo = lambda gamma: phi(gamma) <= phi(0) - c_armijo * gamma * np.dot(grad(x), grad(x))
            curv = lambda gamma: -phi_prime(gamma) <= -c_curvature * phi_prime(0)

            beta = alpha
            if mode == 'backtracking':
                while not armijo(beta) or not curv(beta):
                    beta *= 0.5
                    if beta < 1e-10:
                        if verbose: print("Warning: Step size became too small")
                        break
            elif mode == 'exact':
                tmp = minimize(phi, 0.0)
                beta = tmp['x'][0]

            x = x - beta * grad(x)
            x_history.append(x)
            f_history.append(f(x))
            g_history.append(grad(x))
            g_norm_history.append(np.linalg.norm(g_history[-1]))
            r_history.append(abs(f_history[-2] - f_history[-1]))

            it += 1
            if verbose:
                print(f'it: {it}{sep}x: [{",".join(["%.5e"%e for e in x])}]{sep}f: {f_history[-1]:.5e}')

            if g_norm_history[-1] < tau:
                if verbose:
                    print(f'Gradient norm = {np.linalg.norm(grad(x)):.5e} < {tau:.5e}...breaking')
                break

        return {'x': x_history[-1], 'val': f_history[-1], 'x_history': x_history,
                'f_history': f_history, 'grad_norm_history': g_norm_history,
                'grad_history': g_history, 'residual': r_history, 'iterations': it}

    @staticmethod
    def sgd(f, grad, x0, max_iter=1000, tau=1e-7, alpha=1.0,
            sig_max=0.1, sig_min=0.01, verbose=True):
        """Stochastic Gradient Descent"""
        x = x0
        x_history = [x0]
        f_history = [f(x0)]
        g_history = [grad(x0)]
        stochastic = lambda x,it: np.random.randn(len(x0)) * ((max_iter-it) * sig_max + it * sig_min)

        sep = '     '
        if verbose:
            print(f'it: 0{sep}x: [{",".join(["%.5e"%e for e in x])}]{sep}f: {f_history[-1]:.5e}')

        for it in range(max_iter):
            g = grad(x)
            x = x - alpha * g + stochastic(x, it)

            x_history.append(x)
            f_history.append(f(x))
            g_history.append(g)

            if verbose:
                print(f'it: {it+1}{sep}x: [{",".join(["%.5e"%e for e in x])}]{sep}f: {f_history[-1]:.5e}')

            if np.linalg.norm(g) < tau:
                break

        return {'x': x, 'val': f(x), 'x_history': x_history,
                'f_history': f_history, 'grad_history': g_history,
                'iterations': it + 1}

    @staticmethod
    def adam(f, grad, x0, max_iter=1000, tau=1e-7, alpha=0.001,
             beta1=0.9, beta2=0.999, epsilon=1e-8, verbose=True):
        """Adam Optimizer"""
        x = x0
        x_history = [x0]
        f_history = [f(x0)]
        g_history = [grad(x0)]
        m = np.zeros_like(x0)
        v = np.zeros_like(x0)

        sep = '     '
        if verbose:
            print(f'it: 0{sep}x: [{",".join(["%.5e"%e for e in x])}]{sep}f: {f_history[-1]:.5e}')

        for it in range(max_iter):
            g = grad(x)
            m = beta1 * m + (1 - beta1) * g
            v = beta2 * v + (1 - beta2) * g**2
            m_hat = m / (1 - beta1**(it + 1))
            v_hat = v / (1 - beta2**(it + 1))
            x = x - alpha * m_hat / (np.sqrt(v_hat) + epsilon)

            x_history.append(x)
            f_history.append(f(x))
            g_history.append(g)

            if verbose:
                print(f'it: {it+1}{sep}x: [{",".join(["%.5e"%e for e in x])}]{sep}f: {f_history[-1]:.5e}')

            if np.linalg.norm(g) < tau:
                break

        return {'x': x, 'val': f(x), 'x_history': x_history,
                'f_history': f_history, 'grad_history': g_history,
                'iterations': it + 1}

    @staticmethod
    def rmsprop(f, grad, x0, max_iter=1000, tau=1e-7, alpha=0.001,
                rho=0.9, epsilon=1e-8, verbose=True):
        """RMSprop Optimizer"""
        x = x0
        x_history = [x0]
        f_history = [f(x0)]
        g_history = [grad(x0)]
        v = np.zeros_like(x0)

        sep = '     '
        if verbose:
            print(f'it: 0{sep}x: [{",".join(["%.5e"%e for e in x])}]{sep}f: {f_history[-1]:.5e}')

        for it in range(max_iter):
            g = grad(x)
            v = rho * v + (1 - rho) * g**2
            x = x - alpha * g / (np.sqrt(v) + epsilon)

            x_history.append(x)
            f_history.append(f(x))
            g_history.append(g)

            if verbose:
                print(f'it: {it+1}{sep}x: [{",".join(["%.5e"%e for e in x])}]{sep}f: {f_history[-1]:.5e}')

            if np.linalg.norm(g) < tau:
                break

        return {'x': x, 'val': f(x), 'x_history': x_history,
                'f_history': f_history, 'grad_history': g_history,
                'iterations': it + 1}





# Example usage:
if __name__ == "__main__":
    # Get optimizer type from command line or use default
    opt_type = get_arg('optimizer', 'gd', str)

    # Setup problem
    A = 10.0
    n = 2
    x0 = get_arg('x_init', np.array([25.0, 60.0]), lambda x: np.array(eval(x)))
    tau = get_arg('tau', 1e-7, float)
    max_iter = get_arg('max_iter', 1000, int)
    output_file = get_arg('output_file', 'output', str)

    # Define Rastrigin function and its gradient
    def rastrigin(x):
        A = 10.0
        n = len(x)
        return A*n + np.sum(x**2 - A*np.cos(2*np.pi*x))

    def rastrigin_gradient(x):
        A = 10.0
        return 2*x + 2*np.pi*A*np.sin(2*np.pi*x)

    # Configure optimizer based on type
    if opt_type == 'gd':
        d = Optimizer.gd(f=rastrigin, grad=rastrigin_gradient, x0=x0,
                        tau=tau, max_iter=max_iter)
    elif opt_type == 'sgd':
        d = Optimizer.sgd(f=rastrigin, grad=rastrigin_gradient, x0=x0,
                         tau=tau, max_iter=max_iter)
    elif opt_type == 'adam':
        d = Optimizer.adam(f=rastrigin, grad=rastrigin_gradient, x0=x0,
                          tau=tau, max_iter=max_iter)
    elif opt_type == 'rmsprop':
        d = Optimizer.rmsprop(f=rastrigin, grad=rastrigin_gradient, x0=x0,
                            tau=tau, max_iter=max_iter)
    else:
        raise ValueError(f"Unknown optimizer type: {opt_type}")

    # Print results
    print(f'Approx min of {d["val"]} at {d["x"]} after {d["iterations"]} iterations')

    # Plot results
    x_box = np.linspace(-10, 10, 1000)
    y_box = np.linspace(-10, 10, 1000)
    num_levels = 50

    try:
        plot_progress(rastrigin, d['x_history'], f'{output_file}_{opt_type}', x_box, y_box, num_levels)
    except Exception as e:
        print(e)

it: 0     x: [2.50000e+01,6.00000e+01]     f: 4.22500e+03
it: 1     x: [-3.19744e-14,1.35714e-12]     f: 0.00000e+00
Gradient norm = 5.38640e-10 < 1.00000e-07...breaking
Approx min of 0.0 at [-3.19744231e-14  1.35713663e-12] after 1 iterations


In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt

# First create directories for results
def create_dirs():
    base_dir = 'optimization_results'
    for func_name in ['rosenbrock', 'sphere', 'beale', 'himmelblau', 'three_hump_camel']:
        os.makedirs(f'{base_dir}/{func_name}', exist_ok=True)
    return base_dir

# Test functions and their gradients
class TestFunctions:
    @staticmethod
    def rosenbrock(x):
        """Rosenbrock function: global minimum f(1,1)=0"""
        return (1 - x[0])**2 + 100 * (x[1] - x[0]**2)**2

    @staticmethod
    def rosenbrock_gradient(x):
        return np.array([
            -2*(1 - x[0]) - 400*x[0]*(x[1] - x[0]**2),
            200*(x[1] - x[0]**2)
        ])

    @staticmethod
    def sphere(x):
        """Sphere function: global minimum f(0,0)=0"""
        return np.sum(x**2)

    @staticmethod
    def sphere_gradient(x):
        return 2*x

    @staticmethod
    def beale(x):
        """Beale function: global minimum f(3,0.5)=0"""
        return (1.5 - x[0] + x[0]*x[1])**2 + \
               (2.25 - x[0] + x[0]*x[1]**2)**2 + \
               (2.625 - x[0] + x[0]*x[1]**3)**2

    @staticmethod
    def beale_gradient(x):
        t1 = 1.5 - x[0] + x[0]*x[1]
        t2 = 2.25 - x[0] + x[0]*x[1]**2
        t3 = 2.625 - x[0] + x[0]*x[1]**3
        return np.array([
            2*t1*(-1 + x[1]) + 2*t2*(-1 + x[1]**2) + 2*t3*(-1 + x[1]**3),
            2*t1*x[0] + 2*t2*2*x[0]*x[1] + 2*t3*3*x[0]*x[1]**2
        ])

    @staticmethod
    def himmelblau(x):
        """Himmelblau function: 4 local minima"""
        return (x[0]**2 + x[1] - 11)**2 + (x[0] + x[1]**2 - 7)**2

    @staticmethod
    def himmelblau_gradient(x):
        return np.array([
            4*x[0]*(x[0]**2 + x[1] - 11) + 2*(x[0] + x[1]**2 - 7),
            2*(x[0]**2 + x[1] - 11) + 4*x[1]*(x[0] + x[1]**2 - 7)
        ])

    @staticmethod
    def three_hump_camel(x):
        """Three-hump camel function: global minimum f(0,0)=0"""
        return 2*x[0]**2 - 1.05*x[0]**4 + (x[0]**6)/6 + x[0]*x[1] + x[1]**2

    @staticmethod
    def three_hump_camel_gradient(x):
        return np.array([
            4*x[0] - 4.2*x[0]**3 + x[0]**5 + x[1],
            x[0] + 2*x[1]
        ])

# Run benchmarks
def run_benchmarks(base_dir):
    # Define starting points for each function
    starting_points = {
        'rosenbrock': np.array([-1.0, 1.0]),
        'sphere': np.array([2.0, 2.0]),
        'beale': np.array([1.0, 1.0]),
        'himmelblau': np.array([4.0, 4.0]),
        'three_hump_camel': np.array([2.0, 2.0])
    }

    # Define plotting ranges for each function
    plot_ranges = {
        'rosenbrock': ((-2, 2), (-1, 3), 100),
        'sphere': ((-2, 2), (-2, 2), 50),
        'beale': ((-4, 4), (-4, 4), 100),
        'himmelblau': ((-5, 5), (-5, 5), 50),
        'three_hump_camel': ((-2, 2), (-2, 2), 50)
    }

    # Configure optimizers with different parameters for each function
    functions = {
        'rosenbrock': (TestFunctions.rosenbrock, TestFunctions.rosenbrock_gradient),
        'sphere': (TestFunctions.sphere, TestFunctions.sphere_gradient),
        'beale': (TestFunctions.beale, TestFunctions.beale_gradient),
        'himmelblau': (TestFunctions.himmelblau, TestFunctions.himmelblau_gradient),
        'three_hump_camel': (TestFunctions.three_hump_camel, TestFunctions.three_hump_camel_gradient)
    }

    optimizers = {
        'GD': lambda f, g, x0: Optimizer.gd(f, g, x0, tau=1e-7, max_iter=1000, alpha=0.01),
        'SGD': lambda f, g, x0: Optimizer.sgd(f, g, x0, tau=1e-7, max_iter=1000, alpha=0.01),
        'Adam': lambda f, g, x0: Optimizer.adam(f, g, x0, tau=1e-7, max_iter=1000),
        'RMSprop': lambda f, g, x0: Optimizer.rmsprop(f, g, x0, tau=1e-7, max_iter=1000)
    }

    results = {}

    for func_name, (f, g) in functions.items():
        print(f"\nTesting {func_name} function...")
        results[func_name] = {}
        x0 = starting_points[func_name]
        (x_min, x_max), (y_min, y_max), levels = plot_ranges[func_name]

        x_box = np.linspace(x_min, x_max, 200)
        y_box = np.linspace(y_min, y_max, 200)

        for opt_name, opt_func in optimizers.items():
            print(f"  Running {opt_name}...")
            try:
                d = opt_func(f, g, x0)
                results[func_name][opt_name] = d

                # Save plot
                plot_progress(f, d['x_history'],
                            f'{base_dir}/{func_name}/{opt_name.lower()}',
                            x_box, y_box, levels)

                print(f"    Final value: {d['val']:.6f} at {d['x']} after {d['iterations']} iterations")
            except Exception as e:
                print(f"    Error with {opt_name} on {func_name}: {e}")

    return results

# Run everything
base_dir = create_dirs()
results = run_benchmarks(base_dir)

# Print summary
print("\nOptimization Summary:")
print("--------------------")
for func_name, func_results in results.items():
    print(f"\n{func_name.upper()} FUNCTION:")
    for opt_name, d in func_results.items():
        print(f"{opt_name:8s}: f(x)={d['val']:10.6f} | iterations={d['iterations']:4d}")


Testing rosenbrock function...
  Running GD...
    Error with GD on rosenbrock: name 'Optimizer' is not defined
  Running SGD...
    Error with SGD on rosenbrock: name 'Optimizer' is not defined
  Running Adam...
    Error with Adam on rosenbrock: name 'Optimizer' is not defined
  Running RMSprop...
    Error with RMSprop on rosenbrock: name 'Optimizer' is not defined

Testing sphere function...
  Running GD...
    Error with GD on sphere: name 'Optimizer' is not defined
  Running SGD...
    Error with SGD on sphere: name 'Optimizer' is not defined
  Running Adam...
    Error with Adam on sphere: name 'Optimizer' is not defined
  Running RMSprop...
    Error with RMSprop on sphere: name 'Optimizer' is not defined

Testing beale function...
  Running GD...
    Error with GD on beale: name 'Optimizer' is not defined
  Running SGD...
    Error with SGD on beale: name 'Optimizer' is not defined
  Running Adam...
    Error with Adam on beale: name 'Optimizer' is not defined
  Running RMSpr

# BFGS and Trust

- need to omit B-BFGS-L and Trust-exact (can still compare them empirically in the end)

In [3]:
import numpy as np
from scipy.optimize import minimize
import matplotlib.pyplot as plt
import os

class TestFunctions:
    @staticmethod
    def rosenbrock(x):
        """Rosenbrock function: global minimum f(1,1)=0"""
        return (1 - x[0])**2 + 100 * (x[1] - x[0]**2)**2

    @staticmethod
    def rosenbrock_gradient(x):
        return np.array([
            -2*(1 - x[0]) - 400*x[0]*(x[1] - x[0]**2),
            200*(x[1] - x[0]**2)
        ])

    @staticmethod
    def rosenbrock_hessian(x):
        return np.array([
            [2 - 400*x[1] + 1200*x[0]**2, -400*x[0]],
            [-400*x[0], 200]
        ])

    @staticmethod
    def sphere(x):
        """Sphere function: global minimum f(0,0)=0"""
        x = np.asarray(x)  # Convert input to numpy array
        return np.sum(x**2)

    @staticmethod
    def sphere_gradient(x):
        x = np.asarray(x)  # Convert input to numpy array
        return 2*x

    @staticmethod
    def sphere_hessian(x):
        x = np.asarray(x)  # Convert input to numpy array
        return 2*np.eye(len(x))

    @staticmethod
    def beale(x):
        """Beale function: global minimum f(3,0.5)=0"""
        x = np.asarray(x)  # Convert input to numpy array
        return (1.5 - x[0] + x[0]*x[1])**2 + \
               (2.25 - x[0] + x[0]*x[1]**2)**2 + \
               (2.625 - x[0] + x[0]*x[1]**3)**2

    @staticmethod
    def beale_gradient(x):
        x = np.asarray(x)  # Convert input to numpy array
        t1 = 1.5 - x[0] + x[0]*x[1]
        t2 = 2.25 - x[0] + x[0]*x[1]**2
        t3 = 2.625 - x[0] + x[0]*x[1]**3
        return np.array([
            2*t1*(-1 + x[1]) + 2*t2*(-1 + x[1]**2) + 2*t3*(-1 + x[1]**3),
            2*t1*x[0] + 2*t2*2*x[0]*x[1] + 2*t3*3*x[0]*x[1]**2
        ])

    @staticmethod
    def beale_hessian(x):
        # Numerical approximation of Hessian
        eps = 1e-8
        grad = TestFunctions.beale_gradient
        n = len(x)
        H = np.zeros((n, n))
        for i in range(n):
            for j in range(n):
                x_ij = x.copy()
                x_ij[i] += eps
                x_ij[j] += eps
                g_ij = grad(x_ij)

                x_i = x.copy()
                x_i[i] += eps
                g_i = grad(x_i)

                x_j = x.copy()
                x_j[j] += eps
                g_j = grad(x_j)

                g_0 = grad(x)

                H[i,j] = (g_ij[i] - g_i[i] - g_j[i] + g_0[i]) / (eps * eps)
        return H

def plot_optimization_path(f, path, title, save_path, plot_range=None):
    """Plot the optimization path over the contour of the function"""
    plt.figure(figsize=(10, 8))

    # Define the grid
    if plot_range is None:
        x_min, x_max = min(p[0] for p in path) - 0.5, max(p[0] for p in path) + 0.5
        y_min, y_max = min(p[1] for p in path) - 0.5, max(p[1] for p in path) + 0.5
    else:
        x_min, x_max, y_min, y_max = plot_range

    x = np.linspace(x_min, x_max, 100)
    y = np.linspace(y_min, y_max, 100)
    X, Y = np.meshgrid(x, y)

    # Calculate Z values
    Z = np.zeros_like(X)
    for i in range(len(x)):
        for j in range(len(y)):
            Z[j, i] = f(np.array([X[j, i], Y[j, i]]))

    # Create contour plot
    plt.contour(X, Y, Z, levels=50)
    plt.colorbar(label='Function Value')

    # Plot optimization path
    path = np.array(path)
    plt.plot(path[:, 0], path[:, 1], 'r.-', label='Optimization Path', linewidth=1, markersize=3)
    plt.plot(path[0, 0], path[0, 1], 'go', label='Start', markersize=10)
    plt.plot(path[-1, 0], path[-1, 1], 'ro', label='End', markersize=10)

    plt.title(title)
    plt.xlabel('x₁')
    plt.ylabel('x₂')
    plt.legend()
    plt.grid(True)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()

class OptimizationCallback:
    def __init__(self):
        self.path = []

    def __call__(self, xk):
        self.path.append(np.copy(xk))

def run_bfgs_comparison(save_dir='bfgs_results'):
    os.makedirs(save_dir, exist_ok=True)

    functions = {
        'rosenbrock': (TestFunctions.rosenbrock,
                      TestFunctions.rosenbrock_gradient,
                      TestFunctions.rosenbrock_hessian,
                      (-2, 2, -1, 3)),
        'sphere': (TestFunctions.sphere,
                  TestFunctions.sphere_gradient,
                  TestFunctions.sphere_hessian,
                  (-2, 2, -2, 2)),
        'beale': (TestFunctions.beale,
                 TestFunctions.beale_gradient,
                 TestFunctions.beale_hessian,
                 (-4, 4, -4, 4))
    }

    starting_points = {
        'rosenbrock': np.array([-21.0, 53.0]),
        'sphere': np.array([72.0, 15.0]),
        'beale': np.array([31.0, -61.0])
    }

    methods = ['BFGS', 'L-BFGS-B', 'trust-exact', 'trust-krylov']
    results = {}

    for func_name, (f, g, h, plot_range) in functions.items():
        print(f"\nOptimizing {func_name} function:")
        results[func_name] = {}
        x0 = starting_points[func_name]

        for method in methods:
            try:
                callback = OptimizationCallback()

                if method == 'L-BFGS-B':
                    bounds = [(-10, 10) for _ in range(len(x0))]
                    result = minimize(f, x0, method=method, jac=g, bounds=bounds, callback=callback)
                elif method in ['trust-exact', 'trust-krylov']:
                    result = minimize(f, x0, method=method, jac=g, hess=h, callback=callback)
                else:
                    result = minimize(f, x0, method=method, jac=g, callback=callback)

                results[func_name][method] = {
                    'success': result.success,
                    'x': result.x,
                    'fun': result.fun,
                    'nit': result.nit,
                    'nfev': result.nfev,
                    'path': callback.path
                }

                # Plot optimization path
                save_path = f"{save_dir}/{func_name}_{method.lower()}.png"
                plot_optimization_path(
                    f,
                    callback.path,
                    f"{func_name.capitalize()} - {method}\nIterations: {result.nit}, Final value: {result.fun:.6f}",
                    save_path,
                    plot_range
                )

                print(f"\n{method}:")
                print(f"  Solution: {result.x}")
                print(f"  Minimum value: {result.fun}")
                print(f"  Iterations: {result.nit}")
                print(f"  Function evaluations: {result.nfev}")
                print(f"  Success: {result.success}")

            except Exception as e:
                print(f"Error with {method} on {func_name}: {e}")
                results[func_name][method] = None

    return results

# Run optimization with visualization
results = run_bfgs_comparison()


Optimizing rosenbrock function:

BFGS:
  Solution: [1.00000004 1.00000009]
  Minimum value: 2.0271950994200227e-15
  Iterations: 149
  Function evaluations: 192
  Success: True

L-BFGS-B:
  Solution: [1.         0.99999999]
  Minimum value: 2.1958257086606548e-17
  Iterations: 26
  Function evaluations: 34
  Success: True

trust-exact:
  Solution: [0.99999993 0.99999984]
  Minimum value: 9.224714923111498e-15
  Iterations: 69
  Function evaluations: 70
  Success: True

trust-krylov:
  Solution: [0.99992974 0.99985921]
  Minimum value: 4.943762463721391e-09
  Iterations: 101
  Function evaluations: 102
  Success: True

Optimizing sphere function:

BFGS:
  Solution: [ 2.55351296e-15 -1.18793864e-14]
  Minimum value: 1.4764024879277e-28
  Iterations: 3
  Function evaluations: 6
  Success: True

L-BFGS-B:
  Solution: [0. 0.]
  Minimum value: 0.0
  Iterations: 2
  Function evaluations: 4
  Success: True

trust-exact:
  Solution: [1.77635684e-15 4.44089210e-16]
  Minimum value: 3.3526588471

KeyboardInterrupt: 

# Newton-Rhapson

In [5]:
import numpy as np
from scipy.optimize import minimize
import matplotlib.pyplot as plt
import os
from numpy.linalg import solve, LinAlgError

class NewtonOptimizer:
    def __init__(self, f, grad, hess, max_iter=100, tol=1e-7, verbose=True):
        self.f = f
        self.grad = grad
        self.hess = hess
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose
        self.path = []

    def optimize(self, x0):
        """
        Newton-Raphson optimization method.
        Uses the update rule: x_{k+1} = x_k - H^{-1}(x_k)∇f(x_k)
        where H is the Hessian matrix and ∇f is the gradient.
        """
        x = np.asarray(x0)
        self.path = [x.copy()]

        if self.verbose:
            print(f"Starting Newton-Raphson optimization from {x}")
            print(f"{'Iteration':^10} {'Function Value':^15} {'Gradient Norm':^15}")

        for it in range(self.max_iter):
            f_val = self.f(x)
            g = self.grad(x)
            H = self.hess(x)
            grad_norm = np.linalg.norm(g)

            if self.verbose:
                print(f"{it:^10d} {f_val:^15.6e} {grad_norm:^15.6e}")

            if grad_norm < self.tol:
                if self.verbose:
                    print(f"Converged: gradient norm {grad_norm} < tolerance {self.tol}")
                break

            try:
                # Solve H*dx = -g for dx
                dx = solve(H, -g)

                # Update x
                x = x + dx
                self.path.append(x.copy())

            except LinAlgError:
                if self.verbose:
                    print("Warning: Singular Hessian encountered. Using gradient descent step.")
                # Fallback to gradient descent with small step size
                x = x - 0.01 * g
                self.path.append(x.copy())

        return {
            'x': x,
            'success': grad_norm < self.tol,
            'iterations': it + 1,
            'path': self.path,
            'fun': self.f(x),
            'grad_norm': grad_norm
        }

def plot_optimization_path(f, path, title, save_path, plot_range=None):
    """Plot the optimization path over the contour of the function"""
    plt.figure(figsize=(10, 8))

    # Define the grid
    if plot_range is None:
        x_min, x_max = min(p[0] for p in path) - 0.5, max(p[0] for p in path) + 0.5
        y_min, y_max = min(p[1] for p in path) - 0.5, max(p[1] for p in path) + 0.5
    else:
        x_min, x_max, y_min, y_max = plot_range

    x = np.linspace(x_min, x_max, 100)
    y = np.linspace(y_min, y_max, 100)
    X, Y = np.meshgrid(x, y)

    # Calculate Z values
    Z = np.zeros_like(X)
    for i in range(len(x)):
        for j in range(len(y)):
            Z[j, i] = f(np.array([X[j, i], Y[j, i]]))

    # Create contour plot
    plt.contour(X, Y, Z, levels=50)
    plt.colorbar(label='Function Value')

    # Plot optimization path
    path = np.array(path)
    plt.plot(path[:, 0], path[:, 1], 'r.-', label='Optimization Path', linewidth=1, markersize=3)
    plt.plot(path[0, 0], path[0, 1], 'go', label='Start', markersize=10)
    plt.plot(path[-1, 0], path[-1, 1], 'ro', label='End', markersize=10)

    plt.title(title)
    plt.xlabel('x₁')
    plt.ylabel('x₂')
    plt.legend()
    plt.grid(True)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()

# Example usage with test functions
class TestFunctions:
    @staticmethod
    def rosenbrock(x):
        return (1 - x[0])**2 + 100 * (x[1] - x[0]**2)**2

    @staticmethod
    def rosenbrock_gradient(x):
        return np.array([
            -2*(1 - x[0]) - 400*x[0]*(x[1] - x[0]**2),
            200*(x[1] - x[0]**2)
        ])

    @staticmethod
    def rosenbrock_hessian(x):
        return np.array([
            [2 - 400*x[1] + 1200*x[0]**2, -400*x[0]],
            [-400*x[0], 200]
        ])

    @staticmethod
    def sphere(x):
        x = np.asarray(x)
        return np.sum(x**2)

    @staticmethod
    def sphere_gradient(x):
        return 2*x

    @staticmethod
    def sphere_hessian(x):
        return 2*np.eye(len(x))

# Run comparison
def run_newton_comparison(save_dir='newton_results'):
    os.makedirs(save_dir, exist_ok=True)

    functions = {
        'rosenbrock': (TestFunctions.rosenbrock,
                      TestFunctions.rosenbrock_gradient,
                      TestFunctions.rosenbrock_hessian,
                      (-2, 2, -1, 3)),
        'sphere': (TestFunctions.sphere,
                  TestFunctions.sphere_gradient,
                  TestFunctions.sphere_hessian,
                  (-2, 2, -2, 2))
    }

    starting_points = {
        'rosenbrock': np.array([-73.0, 28.0]),
        'sphere': np.array([332.0, 129.0])
    }

    results = {}

    for func_name, (f, g, h, plot_range) in functions.items():
        print(f"\nOptimizing {func_name} function with Newton-Raphson:")
        x0 = starting_points[func_name]

        # Run Newton-Raphson
        optimizer = NewtonOptimizer(f, g, h)
        result = optimizer.optimize(x0)

        results[func_name] = result

        # Plot optimization path
        save_path = f"{save_dir}/{func_name}_newton.png"
        plot_optimization_path(
            f,
            result['path'],
            f"{func_name.capitalize()} - Newton-Raphson\n" +
            f"Iterations: {result['iterations']}, Final value: {result['fun']:.6f}",
            save_path,
            plot_range
        )

        print(f"\nResults:")
        print(f"  Solution: {result['x']}")
        print(f"  Minimum value: {result['fun']}")
        print(f"  Iterations: {result['iterations']}")
        print(f"  Success: {result['success']}")
        print(f"  Final gradient norm: {result['grad_norm']}")

    return results

# Run the comparison
results = run_newton_comparison()


Optimizing rosenbrock function with Newton-Raphson:
Starting Newton-Raphson optimization from [-73.  28.]
Iteration  Function Value   Gradient Norm 
    0       2.810066e+09    1.547930e+08  
    1       5.475990e+03    1.480000e+02  
    2       2.998635e+09    2.448791e+06  
    3       5.202136e-09    1.442517e-04  
    4       2.706222e-15    2.326466e-06  
    5       0.000000e+00    0.000000e+00  
Converged: gradient norm 0.0 < tolerance 1e-07

Results:
  Solution: [1. 1.]
  Minimum value: 0.0
  Iterations: 6
  Success: True
  Final gradient norm: 0.0

Optimizing sphere function with Newton-Raphson:
Starting Newton-Raphson optimization from [332. 129.]
Iteration  Function Value   Gradient Norm 
    0       1.268650e+05    7.123623e+02  
    1       0.000000e+00    0.000000e+00  
Converged: gradient norm 0.0 < tolerance 1e-07

Results:
  Solution: [0. 0.]
  Minimum value: 0.0
  Iterations: 2
  Success: True
  Final gradient norm: 0.0
