Define function f and sigmoid function

In [2]:
import numpy as np
import sympy as sp
import scipy as sc

def sigmoid(x):
    '''
    Logistic sigmoid 1 / (1 + exp(-x)), built with sympy's exp.

    x may be anything sp.exp accepts (a number or a sympy expression).
    '''
    denominator = 1 + sp.exp(-x)
    return 1 / denominator

def f(w, b):
    '''
    Objective f(w, b): the sum of three negative log-sigmoid terms,
    evaluated at w + b, 1.5*w + b and -2*w - b.

    Built from sympy primitives, so w and b may be numbers or symbols.
    '''
    log_term_1 = sp.log(sigmoid(w + b))
    log_term_2 = sp.log(sigmoid(1.5 * w + b))
    log_term_3 = sp.log(sigmoid(-2 * w - b))
    return -log_term_1 - log_term_2 - log_term_3

Compute the gradient of f(w,b)

In [3]:
# Symbolic variables standing in for the two arguments of f
w, b = sp.symbols('w b')

# f as a symbolic expression in w and b
f_sym = f(w, b)

# Symbolic partial derivatives of f, one per variable (w first, then b)
df_dw_sym, df_db_sym = (sp.diff(f_sym, variable) for variable in (w, b))

def gradient_f(point):
    '''
    Evaluate the gradient of f at a concrete point.

    point: a pair (w0, b0) of values to substitute for the symbols w and b.

    Returns a dict with the keys "df_dw" and "df_db", each holding the
    corresponding symbolic partial derivative after substitution and
    numerical evaluation with sp.N.
    '''
    w_value, b_value = point
    at_point = {w: w_value, b: b_value}

    # Same substitution for both partials; insertion order fixes the key order.
    partials = {"df_dw": df_dw_sym, "df_db": df_db_sym}
    return {key: sp.N(expr.subs(at_point)) for key, expr in partials.items()}

# Evaluate the gradient at the point (w, b) = (1, 1)
w0 = b0 = 1

gradient_f((w0, b0))

{'df_dw': 1.67215806159088, 'df_db': 0.757513024779072}

Gradient descent with constant step size

In [4]:
def gradient_descent(f, grad_f, eta, initial_point, max_iter=100):
    '''
    Minimise f(w, b) by gradient descent with a step-size schedule.

    Parameters
    ----------
    f : callable
        Objective, called as f(w, b).
    grad_f : callable
        Called as grad_f((w, b)); must return a mapping with the keys
        'df_dw' and 'df_db'.
    eta : callable
        Step-size schedule, called with the 1-based iteration index
        (1, 2, ..., max_iter).
    initial_point : pair
        Starting values (w, b).
    max_iter : int
        Number of parameter updates to perform.

    Returns
    -------
    tuple
        (f at the last iterate, smallest f over all iterates). The starting
        point counts as an iterate, so the second entry is never worse than
        f(initial_point) and is finite even when max_iter is 0.
    '''
    w, b = initial_point

    # Seed both trackers with the objective at the starting point so that a
    # diverging run (or max_iter == 0) still reports a meaningful best value.
    current_value = f(w, b)
    best_value = current_value

    for t in range(max_iter):
        gradient = grad_f((w, b))

        # The schedule sees iteration numbers starting at 1, not 0.
        step_size = eta(t + 1)

        # Update parameters
        w = w - step_size * gradient['df_dw']
        b = b - step_size * gradient['df_db']

        # Update best value
        current_value = f(w, b)
        best_value = min(best_value, current_value)

    # current_value already holds f at the final (w, b); no need to recompute.
    return current_value, best_value

In [5]:
def eta_const(t, c=0.2):
    '''
    Constant step-size schedule.

    t is not used; the returned step size is always c.
    '''
    return c

# Starting point (w, b) = (1, 1)
w0 = b0 = 1

# Run gradient descent with the constant step-size schedule
gradient_descent(f, gradient_f, eta_const, initial_point=(w0, b0))

(1.09322770091888, 1.09322770091888)

Gradient descent with eta_sqrt

In [7]:
def eta_sqrt(t, c=0.2):
    '''
    Decaying step-size schedule: c / sqrt(t + 1).

    Note: gradient_descent calls its schedule with a 1-based iteration
    index, so under that caller the first step is c / sqrt(2).
    '''
    root = np.sqrt(t + 1)
    return c / root

# Starting point (w, b) = (1, 1)
w0 = b0 = 1

# Run gradient descent with the square-root decay schedule
gradient_descent(f, gradient_f, eta_sqrt, initial_point=(w0, b0))

(1.70722858578009, 1.70722858578009)

Gradient descent with eta_multistep

In [8]:
def eta_multistep(t, milestones=(20, 50, 80), c=0.2, eta_init=0.2):
    '''
    Multi-step decay schedule: start at eta_init and multiply the step size
    by c once for every milestone that t has reached.

    Parameters
    ----------
    t : number
        Iteration index to compare against the milestones.
    milestones : iterable of numbers
        Iteration indices at which the step size decays. Any number of
        milestones is accepted (including none); a milestone m counts as
        reached when t >= m.
    c : float
        Multiplicative decay factor applied per reached milestone.
    eta_init : float
        Step size used before the first milestone.

    Returns
    -------
    eta_init * c ** k, where k is the number of milestones with t >= m.
    '''
    # Counting reached milestones handles 0, 1, 3 or any other number of
    # milestones uniformly instead of indexing exactly three entries.
    decays = sum(1 for milestone in milestones if t >= milestone)
    return eta_init * c ** decays
    

# Starting point (w, b) = (1, 1)
w0 = b0 = 1

# Run gradient descent with the multi-step decay schedule
gradient_descent(f, gradient_f, eta_multistep, initial_point=(w0, b0))

(1.60881900402798, 1.60881900402798)