In [1]:
import numpy as np
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
import time
import plotly.express as px
import numpy as np
from tqdm import tqdm
from plotly.subplots import make_subplots
import torch

  from .autonotebook import tqdm as notebook_tqdm


We minimize the mean squared error $\frac{1}{n}\|Ax - b\|^2$ over $x$ using several zero-order (derivative-free) optimization methods, and compare each of them with gradient descent.

In [None]:
# Load the dataset, discard the first 5 (non predictive) attributes and every
# row that still contains a missing value (encoded as '?' in the CSV).
data = (
    pd.read_csv('data/communities.csv', header=None)
    .drop(labels=range(5), axis=1)
    .replace('?', np.nan)
    .dropna()
)

# Design matrix: all remaining columns as floats. Its last column (the target)
# is overwritten with ones so that it acts as the fixed input of the bias term.
A = data.iloc[:, :].to_numpy(dtype='float64')
A[:, -1] = 1
# Values to predict: the last column of the cleaned table.
b = data.iloc[:, -1].to_numpy(dtype='float64')

In [None]:
# Upper bound on the number of optimizer iterations for a single run.
max_iters = 210000
# Number of precision targets per run; it sizes the last axis of the itr_*/tim_*
# result arrays and must match the number of precisions tracked by
# PrecisionHolder (4 with its default list).
n_prec = 4

In [None]:
# This class stores the iteration indexes and execution time
# at which an optimization algorithm manages to have an error 
# lower than the 'precisions' values given. 
class PrecisionHolder:
    """Record when an optimizer first gets within given distances of the optimum.

    For each precision target the holder stores the (1-based) iteration count
    and the elapsed time at which ``abs(value - real_value)`` first dropped
    strictly below that target. Targets that were never reached keep ``-1``.

    Parameters
    ----------
    real_value : float
        Reference (optimal) objective value the reported values are compared to.
    precisions : sequence of float, optional
        Error thresholds, ordered from loosest to tightest: ``notifyValue``
        stops scanning at the first threshold that is not met. Defaults to
        ``DEFAULT_PRECISIONS``. The sequence is copied, so neither the caller's
        object nor the default is shared between instances.
    """

    DEFAULT_PRECISIONS = (0.1, 0.05, 0.01, 0.001)

    def __init__(self, real_value, precisions=None):
        self.real_value = real_value
        # A literal list default would be one object shared by every instance;
        # build a private copy instead.
        if precisions is None:
            precisions = self.DEFAULT_PRECISIONS
        self.precisions = list(precisions)
        self.n = len(self.precisions)
        self.precisions_itr = -np.ones(self.n, dtype='int64')
        self.precisions_tim = -np.ones(self.n, dtype='float64')
        # Number of targets reached so far; -1 once all of them are reached.
        # With no target at all there is nothing left to reach.
        self.ptr = 0 if self.n > 0 else -1


    def notifyValue(self, value, itr, time):
        """Report the objective value obtained at iteration ``itr``.

        ``itr`` is the 0-based iteration index (stored as ``itr + 1``) and
        ``time`` the elapsed time to store with it. The parameter name
        shadows the ``time`` module inside this method only.
        """
        error = abs(value - self.real_value)
        for i in range(self.n):
            if error < self.precisions[i]:
                if self.precisions_itr[i] < 0:
                    # First time this target is met: remember when.
                    self.precisions_itr[i] = itr + 1
                    self.precisions_tim[i] = time
                    if self.ptr == self.n-1: # this means we reached all values needed
                        self.ptr = -1
                    else:
                        self.ptr += 1
            else:
                # Targets are ordered loosest -> tightest, so the tighter ones
                # that follow cannot be met either.
                break


    def __str__(self):
        """Return one line per precision target saying whether it was reached."""
        txt = ''
        for precision,itr in zip(self.precisions,self.precisions_itr):
            if itr >= 0:
                txt += f'Precision of {precision} reached after {itr} iterations.\n'
            else:
                txt += f'Precision of {precision} not reached.\n'
        return txt


    def allValuesAcquired(self):
        """Return True once every precision target has been reached."""
        return self.ptr < 0

## I/ Gradient Descent

In [None]:
#Computes smoothness constant L for f
def calculate_L(A):
    """Return the smoothness constant of f(x) = ||Ax - b||^2 / n.

    The value is ``2 * lambda_max(A^T A) / n`` where ``n = A.shape[0]``.

    ``A^T A`` is symmetric, so the symmetric eigen-solver is used: it always
    returns real eigenvalues (the general solver may return a complex array
    when rounding errors break the symmetry numerically), sorted in ascending
    order.
    """
    gram = A.T.dot(A)
    largest_eigenvalue = np.linalg.eigvalsh(gram)[-1]
    L = largest_eigenvalue/(A.shape[0])
    return 2*L


def train_bgd_reg_lin(A, p_holder, max_iters=210000, verbose=False):
    """Fit the linear model A @ x by full-batch gradient descent on the MSE.

    The targets are read from the module-level array ``b``. The step size is
    ``1 / calculate_L(A)``. Before every update the current loss is reported
    to ``p_holder``; training stops as soon as the holder says that all its
    precision targets are reached, or after ``max_iters`` iterations.

    A : 2-D numpy array of inputs.
    p_holder : object exposing ``notifyValue`` and ``allValuesAcquired``.
    max_iters : maximum number of iterations.
    verbose : print the loss every 10000 iterations and the total run time.
    """
    weights = torch.zeros(A.shape[1], dtype=torch.float64, requires_grad=True)

    mse = torch.nn.MSELoss()
    step_size = 1 / calculate_L(A)
    sgd = torch.optim.SGD(params=[weights], lr=step_size)

    inputs = torch.tensor(A, dtype=torch.float64)
    targets = torch.tensor(b, dtype=torch.float64)

    t_start = time.time()
    for n_iter in range(max_iters):
        # Linear model: predictions are simply inputs @ weights.
        current_loss = mse(targets, inputs @ weights)
        p_holder.notifyValue(current_loss, n_iter, time.time() - t_start)
        if p_holder.allValuesAcquired():
            break

        current_loss.backward()
        sgd.step()
        sgd.zero_grad()

        if verbose and n_iter % 10000 == 9999 and n_iter != 0:
            print(f'Loss for iteration {n_iter}/{max_iters-1} : {current_loss.item()}')

    if verbose:
        print(f'Execution time : ', time.time() - t_start)

In [None]:
# Run gradient descent on the first i+1 columns of A, for every i, and store what
# the PrecisionHolder recorded for each precision target.
itr_gd = np.zeros([A.shape[1],n_prec],dtype='int64')
# itr_gd[i,j] : nb iterations to reach the j-th precision of PrecisionHolder with i+1 dimensions (-1 if not reached)
# tim_gd[i,j] : corresponding elapsed time (-1 if not reached)
tim_gd = np.zeros([A.shape[1],n_prec],dtype='float64')
for i in tqdm(range(A.shape[1]), desc="Nb of dimensions"):
    A_temp = A[:,:i+1]
    # Closed-form least-squares solution (normal equations); its loss is the reference optimum.
    lowest_x = np.linalg.inv(A_temp.T@A_temp)@A_temp.T@b
    lowest_loss = np.sum(np.square(A_temp@lowest_x-b))/A_temp.shape[0]
    p_holder_gd = PrecisionHolder(real_value=lowest_loss)
    train_bgd_reg_lin(A_temp, p_holder_gd)
    itr_gd[i,:] = p_holder_gd.precisions_itr
    tim_gd[i,:] = p_holder_gd.precisions_tim

# Gradient-descent results restricted to the first 70 dimensions.
# NOTE(review): 70 presumably equals A.shape[1]-53, the number of dimensions
# covered by the ZO SGD / ZO NAG loops - confirm and derive it instead of hard-coding.
itr_gd_ZO = itr_gd[:70,:]
tim_gd_ZO = tim_gd[:70,:]

## II/ Random Optimization

In [None]:
# Random Optimization
# f : function to minimize
# d : number of dimension of input
# sigma : hyperparameter of the search (how far can the new point be)
# max_iters : number of iterations to do
def random_optimization(f, d, p_holder, mu=0, sigma=1, max_iters=210000, verbose=False):
    """Minimize ``f`` by random search with Gaussian perturbations.

    Starting from the origin, each iteration draws a candidate
    ``x + N(mu, sigma)`` and keeps it only if it strictly improves the
    objective. The best value so far is reported to ``p_holder`` after every
    iteration, and the search stops early once the holder says all its
    precision targets are reached.

    Parameters
    ----------
    f : callable mapping a vector of size ``d`` to a scalar to minimize.
    d : number of dimensions of the input.
    p_holder : object exposing ``notifyValue`` and ``allValuesAcquired``.
    mu : mean of the perturbation (previously accepted but ignored; the
        default 0 reproduces the former behaviour exactly).
    sigma : standard deviation of the perturbation (how far a new point can be).
    max_iters : maximum number of iterations.
    verbose : print the loss every 10000 iterations.

    Returns
    -------
    (x, f_x) : best point found and its objective value.
    """
    # Reproducibility. Note that this reseeds numpy's *global* generator.
    np.random.seed(0)

    f_x = np.inf
    x = np.zeros(d)

    start_time = time.time()
    for n_iter in range(max_iters):
        x_cand = x + np.random.normal(mu, sigma, d)
        f_x_cand = f(x_cand)
        if f_x_cand < f_x:
            f_x = f_x_cand
            x = x_cand
        p_holder.notifyValue(f_x, n_iter, time.time()-start_time)
        if verbose and n_iter%10000==9999 and n_iter!=0:
            print(f'Loss for iteration {n_iter}/{max_iters-1} : {f_x}')
        if p_holder.allValuesAcquired():
            break
    return x, f_x
# Random optimization for the MSE
def random_optimization_mse(A, b, p_holder, mu=0, sigma=1, max_iters=100000, verbose=False):
    """Run random optimization on the mean squared error of the model A @ x against b.

    All remaining arguments are forwarded unchanged to ``random_optimization``,
    whose result is returned.
    """
    def mean_squared_error(x):
        residual = A@x-b
        return np.sum(np.square(residual))/A.shape[0]

    n_dims = A.shape[1]
    return random_optimization(mean_squared_error, n_dims, p_holder, mu, sigma, max_iters, verbose)

In [None]:
# Define various standard deviations
sigmas = [0.001, 0.005, 0.01,0.1]
# Initialization of variables to plot: number of iterations until convergence "itr" and corresponding time to convergence "tim"
itr_ro = np.zeros([len(sigmas),A.shape[1],n_prec], dtype='int64')
tim_ro = np.zeros([len(sigmas),A.shape[1],n_prec], dtype='float64')
# For each dimension : minimize error until convergence criterion reached
# Store number of iterations in array "itr" and respective computational time in array "tim"
for i in tqdm(range(A.shape[1]), desc='Nb of dimensions'):
    A_temp = A[:,:i+1]
    # Closed-form least-squares solution of the (i+1)-dimensional sub-problem;
    # its loss is the reference optimum for this dimension.
    lowest_x = np.linalg.inv(A_temp.T@A_temp)@A_temp.T@b
    lowest_loss = np.sum(np.square(A_temp@lowest_x-b))/A_temp.shape[0]
    for j,sigma in enumerate(sigmas):
        p_holder_ro = PrecisionHolder(real_value=lowest_loss)
        # Optimize the SAME sub-problem the reference loss was computed for
        # (A_temp, not the full matrix A), otherwise the precision targets
        # are measured against the optimum of a different problem.
        random_optimization_mse(A_temp, b, p_holder_ro, sigma=sigma, max_iters=max_iters)
        itr_ro[j,i,:] = p_holder_ro.precisions_itr
        tim_ro[j,i,:] = p_holder_ro.precisions_tim

## III/ Adaptive Step Size Random Search

In [None]:
class ASSRS:
    """Adaptive Step Size Random Search.

    Holds the current point ``x``, its objective value ``f_x`` and the current
    step size. Every call to ``iterate`` tries random points on spheres around
    ``x`` and adapts the step size depending on which radius did best.

    func : objective to minimize, called with a vector of size ``nb_dim``.
    step : initial (nominal) step size.
    a : relative enlargement of the step tried at every iteration.
    i2_limit : number of consecutive failures after which the step is reduced.
    step_decrease : relative reduction applied to the step in that case.
    i1_freq : an additional, much larger step is tried whenever the iteration
        counter ``i1`` is a non-zero multiple of this value. ``i1`` is never
        advanced by the class itself; the caller is expected to set it.
    step_increase : amount added to the step for that additional trial.
    """

    def __init__(self, func, nb_dim, step, a, i2_limit, step_decrease, i1_freq, step_increase):
        np.random.seed(0) # reproducibility
        self.func, self.nb_dim = func, nb_dim
        self.step, self.a = step, a
        self.i2_limit, self.step_decrease = i2_limit, step_decrease
        self.i1_freq, self.step_increase = i1_freq, step_increase

        self.reset()


    def reset(self):
        """Reset the counters, the current point (origin) and its value (+inf)."""
        self.i1 = self.i2 = 0
        self.x = np.zeros(self.nb_dim)
        self.f_x = np.inf

    def random_point_hypersphere(self, step):
        """Return a random vector of norm ``step`` (normalized Gaussian draw)."""
        direction = np.random.normal(0, 1, size=(self.nb_dim,))
        return (direction / np.linalg.norm(direction)) * step

    def compare_step_sizes(self, step_1, step_2):
        """Try one random point per step size; return (point, value, step) of the better one.

        Ties go to the second step size.
        """
        # Both points are drawn before the objective is evaluated.
        point_1 = self.x + self.random_point_hypersphere(step_1)
        point_2 = self.x + self.random_point_hypersphere(step_2)
        value_1 = self.func(point_1)
        value_2 = self.func(point_2)
        if value_1 < value_2:
            return (point_1, value_1, step_1)
        return (point_2, value_2, step_2)

    def iterate(self, verbose=False):
        """Perform one search iteration, updating x, f_x, the step size and i2."""
        # Periodic exploration: nominal step against a much larger one.
        if self.i1%self.i1_freq==0 and self.i1!=0:
            explored = self.compare_step_sizes(self.step, self.step+self.step_increase)
            if explored[1] < self.f_x:
                self.x, self.f_x, self.step = explored
                if verbose:
                    print(f'Size step increased to : {self.step}, loss : {self.f_x}')

        # Regular trial: nominal step against a slightly larger one.
        trial = self.compare_step_sizes(self.step, self.step*(1+self.a))

        if trial[1] < self.f_x:
            # One of the two steps improved the objective: adopt it.
            self.x, self.f_x, self.step = trial
            self.i2 = 0
            if verbose:
                print(f'Size step which produced improvement : {self.step}, loss : {self.f_x}')
            return

        # Neither step improved the objective.
        self.i2 += 1
        if self.i2 != self.i2_limit:
            return

        # No improvement for a long time: shrink the step and restart the count.
        self.step = self.step * (1-self.step_decrease)
        self.i2 = 0
        if verbose:
            print(f'No improvement for a long time, reduce step size to : ', self.step, ', Loss : ', self.f_x)

In [None]:
# ASSRS hyperparameters: index j across the six lists below defines the j-th configuration.
step = [0.1, 0.1, 0.5, 0.5, 1, 1]
a = [0.01, 0.001, 0.01, 0.001, 0.01, 0.001]
i2_limit = [10, 10, 10, 10, 10, 10]
i1_freq = [10, 10, 10, 10, 10, 10]
step_decrease = [0.01, 0.001, 0.01, 0.001, 0.01, 0.001]
step_increase = [2, 2, 2, 2, 2, 2]

nb_assrs = len(step)

# itr_assrs[j,i,k] / tim_assrs[j,i,k] : iterations / elapsed time needed by configuration j
# to reach the k-th precision with i+1 dimensions (-1 if not reached)
itr_assrs = np.zeros([nb_assrs, A.shape[1], n_prec], dtype='int64')
tim_assrs = np.zeros([nb_assrs, A.shape[1], n_prec], dtype='float64')

for i in tqdm(range(A.shape[1]), desc='Nb of dimensions'):
    A_temp = A[:,:i+1]
    # Closed-form least-squares solution (normal equations); its loss is the reference optimum.
    lowest_x = np.linalg.inv(A_temp.T@A_temp)@A_temp.T@b
    lowest_loss = np.sum(np.square(A_temp@lowest_x-b))/A_temp.shape[0]
    for j in range(nb_assrs):
        p_holder_assrs = PrecisionHolder(real_value=lowest_loss)
        # The lambda reads A_temp when it is called; it is only used inside this
        # iteration of the outer loop, so it always sees the current sub-matrix.
        assrs = ASSRS(func=lambda x: np.sum(np.square(A_temp@x-b))/A_temp.shape[0],
                  nb_dim=A_temp.shape[1],
                  step=step[j],
                  a=a[j],
                  i2_limit=i2_limit[j],
                  step_decrease=step_decrease[j],
                  i1_freq=i1_freq[j],
                  step_increase=step_increase[j])
        start_time = time.time()
        for n_iter in range(max_iters):
            # ASSRS does not advance its own iteration counter: set it from here.
            assrs.i1 = n_iter
            assrs.iterate()
            p_holder_assrs.notifyValue(assrs.f_x, n_iter, time.time()-start_time)
            if p_holder_assrs.allValuesAcquired():
                break
        itr_assrs[j,i,:] = p_holder_assrs.precisions_itr
        tim_assrs[j,i,:] = p_holder_assrs.precisions_tim

## IV/ ZO SGD

In [None]:
# Zero Order SGD
# f : function to minimize
# f_augm : function to minimize for x of dimension (row,col,m)
# d : number of dimension of input
# m : number of random vectors u_j following a normal distribution
# mean : mean of the normal distribution of the random vectors u_j
# std : standard deviation of the random vectors u_j
# delta : step size (corresponding to mu in Latex report)
# eta : learning rate

def ZO_SGD(f, f_augm, d, p_holder, m=1000, mean =0, std = 0.5, delta=0.1, eta=0.1, max_iters=210000):
    """Zero-order SGD: gradient descent on a random finite-difference gradient estimate.

    At every iteration ``m`` random directions ``u_j ~ N(mean, std)`` are drawn
    and the gradient is estimated as the average over j of
    ``(f(x + delta*u_j) - f(x)) / delta * u_j``.

    Parameters
    ----------
    f : callable evaluating the objective at a single point of size ``d``.
    f_augm : callable evaluating the objective at ``m`` points at once; it
        receives an ``(m, d)`` array and must return ``m`` values.
    d : number of dimensions of the input.
    p_holder : object exposing ``notifyValue`` and ``allValuesAcquired``;
        it receives the objective value after every update and stops the
        loop once all its precision targets are reached.
    m, mean, std : number and distribution of the random directions.
    delta : finite-difference step size.
    eta : learning rate.
    max_iters : maximum number of iterations. This used to be read from a
        global variable of the same name; the default equals the value
        used by the other optimizers of this notebook.

    Returns
    -------
    (x, f_x) : final point and its objective value.
    """
    x = np.zeros(d)

    start_time = time.time()

    f_x = f(x)
    for n_iter in range(max_iters):
        u_j = np.random.normal(mean, std, (m, np.size(x)))
        # One perturbed copy of x per random direction (broadcast to (m, d)).
        x_temp = x + delta*u_j
        # Column vector of the m perturbed objective values.
        f_x_du = np.expand_dims(f_augm(x_temp), axis=0)
        grad_k = np.mean(np.multiply((np.subtract(f_x_du.T, f_x))/delta,u_j), axis=0)
        x -= eta*grad_k
        f_x = f(x)
        p_holder.notifyValue(f_x, n_iter, time.time()-start_time)
        if p_holder.allValuesAcquired():
            break
    return x, f_x

def ZO_SGD_mse(A, b, p_holder):
    """Run zero-order SGD on the mean squared error of the model A @ x against b."""
    n_rows = A.shape[0]

    def mse_single(x):
        # Objective at one point x of shape (d,).
        return np.sum(np.square(A@x-b))/n_rows

    def mse_batch(x):
        # Objective at several points at once: x holds one point per row.
        residuals = np.matmul(A,x.T).T-b
        return np.sum(np.square(residuals), axis=1)/n_rows

    return ZO_SGD(mse_single, mse_batch, A.shape[1], p_holder)


In [None]:
# Initialization of variables to plot: number of iterations until convergence "itr" and corresponding time to convergence "tim"
# NOTE(review): only the first A.shape[1]-53 dimensions are run; the reason for 53 is not visible here
# (presumably run time) - confirm, and keep it in sync with the hard-coded 70 used to slice the GD results.
itr_ZO_SGD = np.zeros([A.shape[1]-53,n_prec],dtype='int64')
tim_ZO_SGD = np.zeros([A.shape[1]-53,n_prec],dtype='float64')
# For each dimension : minimize error until convergence criterion reached
# Store number of iterations in array "itr" and respective computational time in array "tim"
for i in tqdm(range(A.shape[1]-53), desc="Nb of dimensions"):
    A_temp = A[:,:i+1]
    # Closed-form least-squares solution (normal equations); its loss is the reference optimum.
    lowest_x = np.linalg.inv(A_temp.T@A_temp)@A_temp.T@b
    lowest_loss = np.sum(np.square(A_temp@lowest_x-b))/A_temp.shape[0]
    p_holder_ZO_SGD = PrecisionHolder(real_value=lowest_loss)
    ZO_SGD_mse(A_temp,b, p_holder_ZO_SGD)
    itr_ZO_SGD[i,:] = p_holder_ZO_SGD.precisions_itr
    tim_ZO_SGD[i,:] = p_holder_ZO_SGD.precisions_tim

## V/ ZO NAG

In [None]:
# Zero Order NAG
# f : function to minimize
# f_augm : function to minimize for x of dimension (row,col,m)
# d : number of dimension of input
# m : number of random vectors u_j following a normal distribution
# mean : mean of the normal distribution of the random vectors u_j
# std : standard deviation of the random vectors u_j
# delta : step size (corresponding to mu in Latex report)
# beta : momentum factor in NAG formulation
# eta : learning rate
def ZO_NAG(f, f_augm, d, p_holder, m=1000, mean =0, std = 0.5, delta=0.1, beta=0.5, eta=0.1, max_iters=210000):
    """Zero-order Nesterov accelerated gradient.

    At every iteration ``m`` random directions ``u_j ~ N(mean, std)`` are drawn
    around the look-ahead point ``x - beta*m_t`` and the gradient is estimated
    as the average over j of
    ``(f(x - beta*m_t + delta*u_j) - f(x)) / delta * u_j``.
    The momentum is then updated as ``m_t = beta*m_t + eta*grad`` and
    subtracted from ``x``.

    Parameters
    ----------
    f : callable evaluating the objective at a single point of size ``d``.
    f_augm : callable evaluating the objective at ``m`` points at once; it
        receives an ``(m, d)`` array and must return ``m`` values.
    d : number of dimensions of the input.
    p_holder : object exposing ``notifyValue`` and ``allValuesAcquired``;
        it receives the objective value after every update and stops the
        loop once all its precision targets are reached.
    m, mean, std : number and distribution of the random directions.
    delta : finite-difference step size.
    beta : momentum factor.
    eta : learning rate.
    max_iters : maximum number of iterations. This used to be read from a
        global variable of the same name; the default equals the value
        used by the other optimizers of this notebook.

    Returns
    -------
    (x, f_x) : final point and its objective value.
    """
    x = np.zeros(d)
    m_t = np.zeros(d)
    start_time = time.time()
    f_x = f(x)
    for n_iter in range(max_iters):
        u_j = np.random.normal(mean, std, (m, np.size(x)))
        # One perturbed copy of the look-ahead point per random direction.
        x_temp = x + (delta*u_j - beta*m_t)
        f_x_du = np.expand_dims(f_augm(x_temp), axis=0)
        # NOTE(review): the baseline is f(x), not f at the look-ahead point
        # x - beta*m_t. With mean=0 the extra term averages out but it adds
        # variance to the estimate - confirm this matches the report.
        grad_k = np.mean(np.multiply((np.subtract(f_x_du.T, f_x))/delta,u_j), axis=0)
        m_t = beta*m_t + eta*grad_k
        x -= m_t
        f_x = f(x)
        p_holder.notifyValue(f_x, n_iter, time.time()-start_time)
        if p_holder.allValuesAcquired():
            break
    return x, f_x

def ZO_NAG_mse(A, b, p_holder):
    """Run zero-order NAG on the mean squared error of the model A @ x against b."""
    n_rows = A.shape[0]

    def mse_single(x):
        # Objective at one point x of shape (d,).
        return np.sum(np.square(A@x-b))/n_rows

    def mse_batch(x):
        # Objective at several points at once: x holds one point per row.
        residuals = np.matmul(A,x.T).T-b
        return np.sum(np.square(residuals), axis=1)/n_rows

    return ZO_NAG(mse_single, mse_batch, A.shape[1], p_holder)


In [None]:
# Initialization of variables to plot: number of iterations until convergence "itr" and corresponding time to convergence "tim"
# NOTE(review): only the first A.shape[1]-53 dimensions are run; the reason for 53 is not visible here
# (presumably run time) - confirm, and keep it in sync with the hard-coded 70 used to slice the GD results.
itr_ZO_NAG = np.zeros([A.shape[1]-53,n_prec],dtype='int64')
tim_ZO_NAG = np.zeros([A.shape[1]-53,n_prec],dtype='float64')
# For each dimension : minimize error until convergence criterion reached
# Store number of iterations in array "itr" and respective computational time in array "tim"
for i in tqdm(range(A.shape[1]-53), desc="Nb of dimensions"):
    A_temp = A[:,:i+1]
    # Closed-form least-squares solution (normal equations); its loss is the reference optimum.
    lowest_x = np.linalg.inv(A_temp.T@A_temp)@A_temp.T@b
    lowest_loss = np.sum(np.square(A_temp@lowest_x-b))/A_temp.shape[0]
    p_holder_ZO_NAG = PrecisionHolder(real_value=lowest_loss)
    ZO_NAG_mse(A_temp,b, p_holder_ZO_NAG)
    itr_ZO_NAG[i,:] = p_holder_ZO_NAG.precisions_itr
    tim_ZO_NAG[i,:] = p_holder_ZO_NAG.precisions_tim

In [None]:
# Run to save numpy arrays
# Every array is written to './saved_arrays/<variable name>.npy'. These are the
# paths the loading cell reads; the ZO SGD / ZO NAG results used to be written
# as '*_zosgd.npy' / '*_zonag.npy', which that cell could not find.
arrays_to_save = {
    'itr_gd': itr_gd,
    'tim_gd': tim_gd,
    'itr_gd_ZO': itr_gd_ZO,
    'tim_gd_ZO': tim_gd_ZO,
    'itr_ro': itr_ro,
    'tim_ro': tim_ro,
    'itr_assrs': itr_assrs,
    'tim_assrs': tim_assrs,
    'itr_ZO_SGD': itr_ZO_SGD,
    'tim_ZO_SGD': tim_ZO_SGD,
    'itr_ZO_NAG': itr_ZO_NAG,
    'tim_ZO_NAG': tim_ZO_NAG,
}
for array_name, array_to_store in arrays_to_save.items():
    with open(f'./saved_arrays/{array_name}.npy', 'wb') as file_to_store:
        np.save(file_to_store, array_to_store)

In [2]:
# Run to load numpy arrays
# Each result array is read back from './saved_arrays/<variable name>.npy'.

# Gradient descent (all dimensions, then the subset compared with ZO SGD / ZO NAG)
itr_gd = np.load('./saved_arrays/itr_gd.npy')
tim_gd = np.load('./saved_arrays/tim_gd.npy')
itr_gd_ZO = np.load('./saved_arrays/itr_gd_ZO.npy')
tim_gd_ZO = np.load('./saved_arrays/tim_gd_ZO.npy')

# Random optimization
itr_ro = np.load('./saved_arrays/itr_ro.npy')
tim_ro = np.load('./saved_arrays/tim_ro.npy')

# Adaptive step size random search
itr_assrs = np.load('./saved_arrays/itr_assrs.npy')
tim_assrs = np.load('./saved_arrays/tim_assrs.npy')

# Zero-order SGD
itr_ZO_SGD = np.load('./saved_arrays/itr_ZO_SGD.npy')
tim_ZO_SGD = np.load('./saved_arrays/tim_ZO_SGD.npy')

# Zero-order NAG
itr_ZO_NAG = np.load('./saved_arrays/itr_ZO_NAG.npy')
tim_ZO_NAG = np.load('./saved_arrays/tim_ZO_NAG.npy')

In [None]:
# Replace exact zeros by machine epsilon so that tim_gd can be used as a divisor.
# A boolean mask touches only the zero entries; indexing with np.argwhere(...)
# would instead treat every row AND column index of a zero as a row number and
# overwrite those whole rows.
tim_gd[tim_gd == 0] = np.finfo(np.float64).eps

# Results

In [3]:
# Plot results
def plot_result(itr_zo, itr_gd):
    """Plot, for each of the 4 precisions, the ratio of iteration counts itr_zo / itr_gd.

    Both arguments are 2-D arrays indexed by [dimension, precision]. Each
    subplot shows the identity line as a reference and the ratio as a curve
    drawn segment by segment, so that its color can change along the curve:
    first palette color where the ratio is positive, second one otherwise.
    """
    n_dims = np.shape(itr_gd)[0]
    dims = np.arange(1, n_dims+1)
    palette = px.colors.qualitative.Plotly

    fig = make_subplots(rows=2, cols=2, subplot_titles=('delta : 10^-1','delta : 5x10^-2','delta : 10^-2','delta : 10^-3'))
    for k in range(4):
        # Subplots are filled row by row; plotly rows/cols are 1-based.
        row, col = k//2 + 1, k%2 + 1

        # Reference line y = x.
        fig.add_trace(go.Scatter(x=dims, y=np.arange(1, n_dims+1), marker_color='rgba(0,0,0,255)'), row=row, col=col)

        ratio = itr_zo[:,k]/itr_gd[:,k]
        segment_colors = [0 if value > 0 else 1 for value in ratio]
        # One trace per segment [tn, tn+1]: a single trace cannot change color.
        for tn in range(n_dims):
            segment = go.Scatter(
                x=dims[tn:tn+2],
                y=ratio[tn:tn+2],
                line_color=palette[segment_colors[tn]],
                mode='lines'
            )
            fig.add_trace(segment, row=row, col=col)

        fig.update_xaxes(title_text='Number of dimensions', row=row, col=col)
        fig.update_yaxes(title_text='Itr zo / Itr gd', row=row, col=col)

    fig.update_layout(showlegend=False, height=1000, width=1000)
    fig.show()
    
def plot_time(tim_zo, tim_gd):
    """Plot, for each of the 4 precisions, the ratio of run times tim_zo / tim_gd.

    Both arguments are 2-D arrays indexed by [dimension, precision]. Zero
    entries of ``tim_gd`` are replaced by machine epsilon before dividing.
    The replacement is done on a copy, so the caller's array is left
    untouched. Only the zero entries are replaced: indexing with
    ``np.argwhere`` overwrote whole rows instead.
    """
    eps = np.finfo(np.float64).eps
    safe_tim_gd = np.where(tim_gd == 0, eps, tim_gd)
    dims = np.arange(1, np.shape(safe_tim_gd)[0]+1)

    fig = make_subplots(rows=2, cols=2)
    for i in range(2):
        for j in range(2):
            k = i*2+j  # index of the precision shown in subplot (i, j)
            fig.add_trace(go.Scatter(x=dims, y=tim_zo[:,k]/safe_tim_gd[:,k]), row=i+1, col=j+1)
    fig.show()

## Results for RO

## Sigma = 0.001

In [4]:
# Random optimization with sigmas[0] = 0.001, compared with gradient descent.
plot_result(itr_ro[0,:,:], itr_gd)

## Sigma = 0.005

In [5]:
# Random optimization with sigmas[1] = 0.005, compared with gradient descent.
plot_result(itr_ro[1,:,:], itr_gd)

## Sigma = 0.01

In [6]:
# Random optimization with sigmas[2] = 0.01, compared with gradient descent.
plot_result(itr_ro[2,:,:], itr_gd)

## Sigma = 0.1

In [7]:
# Random optimization with sigmas[3] = 0.1, compared with gradient descent.
plot_result(itr_ro[3,:,:], itr_gd)

# ASSRS

In [8]:
# ASSRS configuration 0: step=0.1, a=0.01, step_decrease=0.01 (i2_limit=10, i1_freq=10, step_increase=2).
plot_result(itr_assrs[0,:,:], itr_gd)

In [9]:
# ASSRS configuration 1: step=0.1, a=0.001, step_decrease=0.001 (i2_limit=10, i1_freq=10, step_increase=2).
plot_result(itr_assrs[1,:,:], itr_gd)

In [10]:
# ASSRS configuration 2: step=0.5, a=0.01, step_decrease=0.01 (i2_limit=10, i1_freq=10, step_increase=2).
plot_result(itr_assrs[2,:,:], itr_gd)

In [11]:
# ASSRS configuration 3: step=0.5, a=0.001, step_decrease=0.001 (i2_limit=10, i1_freq=10, step_increase=2).
plot_result(itr_assrs[3,:,:], itr_gd)

In [12]:
# ASSRS configuration 4: step=1, a=0.01, step_decrease=0.01 (i2_limit=10, i1_freq=10, step_increase=2).
plot_result(itr_assrs[4,:,:], itr_gd)

In [13]:
# ASSRS configuration 5: step=1, a=0.001, step_decrease=0.001 (i2_limit=10, i1_freq=10, step_increase=2).
plot_result(itr_assrs[5,:,:], itr_gd)

# ZO SGD

In [14]:
# Zero-order SGD compared with the gradient-descent results restricted to the same leading dimensions (itr_gd_ZO).
plot_result(itr_ZO_SGD[:,:], itr_gd_ZO)

# ZO NAG

In [15]:
# Zero-order NAG compared with the gradient-descent results restricted to the same leading dimensions (itr_gd_ZO).
plot_result(itr_ZO_NAG[:,:], itr_gd_ZO)

### Acknowledgments
Thanks to Rob Raymond for the technique used to change the color along a single trace:
https://stackoverflow.com/questions/69705455/plotly-one-line-different-colors