## Newton methods

#### Problem:
$$
f(\vec{x}) \rightarrow min,\\
f: \Omega \rightarrow \mathbb{R}, \\
\Omega \subset \mathbb{R^n}, f(\vec{x}) \mbox{ is convex}, \\
f(\vec{x}) \mbox{ - is twice diffirentiable on } \Omega\\
\vec{x_*} \in \Omega, f_{min} = f(\vec{x_*})
$$

We can greater efficency of finding $x_*$ if we use information not only about function gradient, but also Hessian $H(\vec{x})$

In simple variant on every k iteration function is approximated in neighborhood of point $\vec{x}_{k-1}$ by quadratic function $\phi_{k}(x)$ then $\vec{x}_k$ is found and the procedure continiues

By using Tailor series we can represent our function in neighborhood of point $x_{k}$ as
$$
f(\vec{x}) = f(\vec{x}_{k} + (\nabla f(\vec{x}_k), \vec{x} - \vec{x}_k) + \frac{1}{2}(H(\vec{x}_k)(\vec{x} - \vec{x}_k), \vec{x} - \vec{x}_k) + o(|\vec{x} - \vec{x}_k|)
$$

So our quadratic approximation $\phi_k(\vec{x})$ would be
$$
\phi_{k+1}(\vec{x}) = f(\vec{x}_{k} + (\nabla f(\vec{x}_k), \vec{x} - \vec{x}_k) + \frac{1}{2}(H(\vec{x}_k)(\vec{x} - \vec{x}_k), \vec{x} - \vec{x}_k)
$$

If our $H(\vec{x}_{k})$ is positive determined (function is convex), then $\vec{x}_{k+1}$ is single minimum of quadratic approximation and can be found using: 
$$ 
\nabla \phi_{k+1}(\vec{x}) = \nabla f(\vec{x}_k) + H(\vec{x}_k)(\vec{x} - \vec{x}_k) = \vec{0}
$$

Then we get
$$
\vec{x}_{k+1} = \vec{x}_{k}  - H^{-1}(\vec{x}_{k}) \nabla f(\vec{x}_{k})
$$

If our dimension number $n$ of space $\mathbb{R}$ is big enough, then finding $H^{-1}$ is very big problem. In this case it expedient to find minimum of $\phi_k(\vec{x})$ by using **gradient methods** or **conjugate directions method**
$\widetilde{\vec{x}}_k = argmin\{\phi_{k}(\vec{x})\}$ is just an approximation, using this we can build *relaxational sequence* 

$$
\vec{x}_{k} = \vec{x}_{k-1} + \lambda_k(\widetilde{\vec{x}}_{k} - \vec{x}_{k-1}) = \vec{x}_{k-1} + \lambda_k\vec{p}_{k} \\ 
\vec{p}_k = -H^{-1}(\vec{x}_{k-1}) \nabla f(\vec{x}_{k-1}) \mbox{ - direction of descent}
$$

We can find $\lambda_k$ different ways, for example find $argmin\{f(\vec{x}_{k-1} + \lambda_k\vec{p}_k\}$ or by method of step splitting


In [30]:
import matplotlib as mplib
import math as m
import numpy as np
from numpy.linalg import norm
from scipy import linalg
from scipy import sparse

from onedim_optimize import quadratic_approx, newton_method, fibbonaci_method, middle_point_method, newton_modified
from scipy.optimize import approx_fprime

def toOneParamFunc(f, x, w):
    return lambda p: f(x + p*w) 

def argmin(f, a, b, eps):
    fmin, xmin, k = middle_point_method(f, a, b, eps)
    return xmin, k

def approx_gradient(f, eps):
    return lambda x: approx_fprime(x, f, eps)

def partial_der(f_dx, f_x, dx, j):
    p = np.divide(f_dx - f_x, dx)
    return p

def hessian_in_point(x, f, grad, eps):
     gr = grad(x)
     n = len(gr) 
     hes = []
     for i in range (0, n):
        x_delta = np.array(x[:])
        x_delta[i] = x_delta[i] + eps
        gr_delta = grad(x_delta)
        partials = np.array([partial_der(gr_delta[j], part, eps, j) for j,part in enumerate(gr)])
        hes.append(partials)
     return np.array(hes)

def hessian(f, grad, eps):
    return lambda x: hessian_in_point(x, f, grad, eps) 

In [203]:
f = lambda x: np.float64(x[0] + 2*x[1] + 4*m.sqrt(1 + x[0]**2 + x[1]**2))
eps = np.float64(1e-6)
x = np.array([1.0, 1.0])

hes = hessian(f, approx_gradient(f, eps), eps)
print(hes(x))

[[ 1.54010138 -0.76916251]
 [-0.76916251  1.53832502]]


### Newton method

The common Newton method is to find our $H^{-1}$ matrix and than build relaxetion sequence by this rule:
$$
\vec{x}_{k+1} = \vec{x}_{k}  - H^{-1}(\vec{x}_{k}) \nabla f(\vec{x}_{k})
$$

But there is a problem with matrix $H$, it needs to be always positive determinated or $H^{-1}$ won't exist.

To solve this problem, let's check if $H$ is positive determinated, if not, then let's pick $\eta$_k, such that:
$$
\widetilde{H}_k = \eta_kI_n + H(\vec{x}_{k-1})
$$
$\widetilde{H}$ is positive determinated matrix, that we pick instead of $H$


In [14]:
def common_newton(f, gr, hess, x, epsilon):
    w = -gr(x)
    k = 0
    while(norm(w) > epsilon):
        H = hess(x)
        print(x, w)
        print(H)
        h = linalg.solve(H, w)
        x = x + h
        w = -gr(x)
        k += 1
    return f(x), x, k

In [15]:
f2 = lambda x: (x[0]**2 - x[1])**2 + (x[0] - 1)**2
danilov = lambda x: x[0] + 2*x[1] + 4*m.sqrt(1 + x[0]**2 + x[1]**2)

test2 = [
    f2,
    approx_gradient(f2, 1e-8),
    hessian(f2, approx_gradient(f2, 1e-6), 1e-6),
    np.array([-1, -2]),
    0.001,
]

test_danilov = [
    danilov,
    approx_gradient(danilov, np.float64(1e-8)),
    hessian(danilov, approx_gradient(danilov, np.float64(1e-6)), np.float64(1e-6)),
    np.array([-2, -1]),
    0.01,
]

fmin, xmin, K = common_newton(*test2)
print(f"""
x minimum: {xmin},
f minimum: {fmin},
number of iterations: {K}
""")

[-1 -2] [15.99999982  6.        ]
[[13999994.0009634   2000000.00133227]
 [ 3999998.00026629  2000000.00088818]]
[-0.999999 -1.999999] [15.99997424  5.99999414]
[[21.9984031   4.0003556 ]
 [ 4.0003556   2.00373051]]
[-0.71300944  0.4214513 ] [3.67394975 0.1738623 ]
[[6.41531273 2.85194091]
 [2.85194091 2.0001778 ]]
[ 0.74557234 -1.57133491] [-5.83510961  4.25442608]
[[14.95337187 -2.98427949]
 [-2.98427949  2.0001778 ]]
[0.79438043 0.62851106] [0.40320246 0.00505842]
[[ 7.05846492 -3.17752769]
 [-3.17752769  1.99999045]]
[0.99896687 0.95608136] [-0.16517459  0.08370687]
[[10.15092177 -3.99586961]
 [-3.99586961  1.99999999]]

x minimum: [0.9999202  0.99983949],
f minimum: 6.368974403837277e-09,
number of iterations: 6



In [9]:
def newton_upgraded(f, gr, hess, x, epsilon):
    w = -gr(x)
    phi = toOneParamFunc(f, x, w)
    k = 0
    n = 0
    while(norm(w) > epsilon):
        H = hess(x)
        print(x, w)
        print(H)
        h = linalg.solve(H, w)
        phi = toOneParamFunc(f, x, h)
        l, i = argmin(phi, 0, 1, epsilon) 
        n += i
        x = x + l * h
        w = -gr(x)
        k += 1
    return f(x), x, k, n

In [12]:
f2 = lambda x: (x[0]**2 - x[1])**2 + (x[0] - 1)**2
danilov = lambda x: x[0] + 2*x[1] + 4*m.sqrt(1 + x[0]**2 + x[1]**2)

test2 = [
    f2,
    approx_gradient(f2, np.float64(1e-8)),
    hessian(f2, approx_gradient(f2, np.float64(1e-6)), np.float64(1e-6)),
    np.array([-1, -2]),
    np.float64(1e-3),
]

test_danilov = [
    danilov,
    approx_gradient(danilov, np.float64(1e-8)),
    hessian(danilov, approx_gradient(danilov, np.float64(1e-8)), np.float64(1e-8)),
    np.array([-1, -2]),
    0.01,
]

fmin, xmin, K, N = newton_upgraded(*test2)
print(f"""
x minimum: {xmin},
f minimum: {fmin},
number of iterations: {K},
number of one-dimension minimization iterations: {N}
""")

[-1 -2] [15.99999982  6.        ]
[[13999994.0009634   2000000.00133227]
 [ 3999998.00026629  2000000.00088818]]

x minimum: [nan nan],
f minimum: nan,
number of iterations: 1,
number of one-dimension minimization iterations: 10



  f2 = lambda x: (x[0]**2 - x[1])**2 + (x[0] - 1)**2


In [28]:
def count_next_matrix(A, dw,dx):
    y = dx
    z = A.dot(dw)
    first_part = np.dot(np.transpose([y]), [dx]) * np.divide(1, dw.dot(y))
    sec_part =   np.dot(np.transpose([z]), [A.dot(dw)]) * np.divide(1, dw.dot(z))
    return A - first_part - sec_part

def DFP(f, grad, x, epsilon):
    w2 = -grad(x)
    phi = toOneParamFunc(f, x, w2)
    x1 = x
    k = 1
    n = 0
    A = np.identity(len(x))
    l, i = argmin(phi, 0, 1, epsilon)
    n += i
    x2 = x1 + l * w2
    print(A)
    while(norm(w2) > epsilon):
        w1 = w2
        w2 = -grad(x2)
#         if k % len(x) != 0 :
        print(A, x2)
        A = count_next_matrix(A, w2 - w1, x2 - x1)
#         else:
#             A = np.identity(len(x))
        p = A.dot(w2)
        x1 = x2
        phi = toOneParamFunc(f, x2, p)
        l, i = argmin(phi, 0, 1, epsilon)
        x2 = x1 + l * p
        k += 1
        n += i
    return f(x), x, k, n
        

In [31]:
f2 = lambda x: (x[0]**2 - x[1])**2 + (x[0] - 1)**2
danilov = lambda x: x[0] + 2*x[1] + 4*m.sqrt(1 + x[0]**2 + x[1]**2)

test2 = [
    f2,
    approx_gradient(f2, np.float64(1e-8)),
    np.array([-1, -2]),
    np.float64(0.001),
]

test_danilov = [
    danilov,
    approx_gradient(danilov, np.float64(1e-8)),
    np.array([-1, -2]),
    0.01,
]

fmin, xmin, K, N = DFP(*test2)
print(f"""
x minimum: {xmin},
f minimum: {fmin},
number of iterations: {K},
number of one-dimension minimization iterations: {N}
""")

TypeError: unsupported operand type(s) for -: 'float' and 'function'