In [15]:
import numpy as np

from scipy.optimize import minimize

# Gradient of Sum of L2 Norm Distance

Let the list of points be $\{x_1, x_2, \dots, x_n\}$ and the variable $y$ be the point that minimizes the sum of L2 norms. The objective function can be written as:

$$
f(y) = \sum_{i=1}^{n} \|y - x_i\|_2
$$

To find the gradient, we consider the derivative of \( f(y) \) with respect to \( y \). The gradient is given by:

$$
\nabla f(y) = \sum_{i=1}^{n} \frac{y - x_i}{\|y - x_i\|_2}
$$

This gradient will guide the optimization process to minimize the sum of L2 norms.


## Gradient of the Sum of L2 Norm Distance

Let the list of points be denoted as $\{x_1, x_2, \ldots, x_n\}$, where each $x_i \in \mathbb{R}^d$. We aim to find a point $y \in \mathbb{R}^d$ that minimizes the sum of the L2 norms of the distances between $y$ and each $x_i$. The objective function can be written as:

$$
f(y) = \sum_{i=1}^n \|y - x_i\|_2
$$

### Step 1: Expanding the L2 Norm
The L2 norm $\|y - x_i\|_2$ is defined as:

$$
\|y - x_i\|_2 = \sqrt{(y - x_i)^\top (y - x_i)}
$$

Substituting into $f(y)$:

$$
f(y) = \sum_{i=1}^n \sqrt{(y - x_i)^\top (y - x_i)}
$$

### Step 2: Gradient of the Objective Function
The gradient of $f(y)$ with respect to $y$ is given by:

$$
\nabla f(y) = \sum_{i=1}^n \nabla \|y - x_i\|_2
$$

Using the chain rule, the gradient of $\|y - x_i\|_2$ is:

$$
\nabla \|y - x_i\|_2 = \frac{y - x_i}{\|y - x_i\|_2}
$$

Thus, the gradient of $f(y)$ becomes:

$$
\nabla f(y) = \sum_{i=1}^n \frac{y - x_i}{\|y - x_i\|_2}
$$

### Step 3: Characterizing the Minimizer
Since $f$ is convex, a point $y$ that does not coincide with any $x_i$ minimizes $f(y)$ exactly when $\nabla f(y) = 0$:

$$
\sum_{i=1}^n \frac{y - x_i}{\|y - x_i\|_2} = 0
$$

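This condition characterizes the **geometric median** of the points $\{x_i\}$ (assuming $y$ does not coincide with any $x_i$). Rearranging it gives $y = \frac{\sum_{i} w_i x_i}{\sum_{i} w_i}$ with $w_i = 1 / \|y - x_i\|_2$, so the optimal $y$ is a weighted average of the points, where the weights depend inversely on the distances $\|y - x_i\|_2$. Because the weights themselves depend on $y$, there is no closed-form solution in general and $y$ has to be found iteratively.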

### Conclusion
The gradient $\nabla f(y)$ provides the direction to adjust $y$ iteratively to minimize the sum of L2 norms. The minimization leads to a point $y$ that balances the distances to all points $x_i$.


In [10]:
def objective_function(y, points):
    """Return the sum of squared Euclidean (L2) distances from ``y`` to each point.

    Args:
        y (numpy.ndarray): Candidate location, shape (d,).
        points (numpy.ndarray): Sample points, one per row, shape (n, d).

    Returns:
        float: ``sum_i ||points[i] - y||_2 ** 2``.
    """
    # ||p - y||^2 is the sum of the squared coordinate differences, so the
    # total is simply the sum of every squared entry. Taking the norm of the
    # already-squared differences would instead give sqrt(sum(d**4)) per point.
    residuals = points - y
    return np.sum(residuals ** 2)

# Three sample points in the plane, one per row.
points = np.array([[1, 2], [3, 4], [5, 6]])

# Start the search from the first sample and report it.
initial_point = points[0]
print(f"initial_point: {initial_point}")

# Objective value at the starting point.
loss = objective_function(initial_point, points)

# No `method`, bounds or constraints are given, so SciPy uses its default
# solver for unconstrained problems; `points` is forwarded via `args`.
results = minimize(objective_function, initial_point, args=(points,))
print(f"final point: {results.x}")

initial_point: [1 2]
final point: [3.00000004 4.00000004]


In [13]:
def objective_function(y, points):
    """Total Euclidean (L2) distance from ``y`` to every row of ``points``."""
    # One distance per row of the difference array, then add them up.
    per_point_distance = np.linalg.norm(points - y, axis=1)
    return per_point_distance.sum()

# Three sample points in the plane, one per row.
points = np.array([[1, 2], [3, 4], [5, 6]])

# Start the search from the first sample and report it.
initial_point = points[0]
print(f"initial_point: {initial_point}")

# Objective value at the starting point.
loss = objective_function(initial_point, points)

# No `jac` is supplied, so the solver estimates gradients numerically;
# `points` is forwarded to the objective via `args`.
results = minimize(objective_function, initial_point, args=(points,))
print(f"final point: {results.x}")

initial_point: [1 2]
final point: [2.99999999 3.99999999]


In [None]:
class Loss:
    """Sum of Euclidean (L2) distances from a point ``y`` to the rows of ``X``.

    Forward:   l(y) = sum_i ||y - X[i]||_2
    Backward:  dl/dy = sum_i (y - X[i]) / ||y - X[i]||_2

    Call the instance to evaluate the loss, then call ``backward()`` to get
    the gradient with respect to ``y`` at that same point.
    """

    # Row-wise differences (y - X) cached by the most recent forward pass.
    _diff = None
    # Row-wise distances ||y - X[i]||, kept as a column for broadcasting.
    _dist = None

    def __call__(self, y, X):
        """Evaluate the loss and cache what ``backward`` needs.

        Args:
            y: Array-like of shape (d,), the candidate point.
            X: Array-like of shape (n, d), one sample point per row.

        Returns:
            float: The sum over rows of ``||y - X[i]||_2``.
        """
        diff = np.asarray(y, dtype=float) - np.asarray(X, dtype=float)
        dist = np.linalg.norm(diff, axis=1)

        self._diff = diff
        self._dist = dist[:, np.newaxis]

        val = np.sum(dist)
        return val

    def backward(self):
        """Return the gradient of the last evaluated loss with respect to ``y``.

        A row of ``X`` that coincides with ``y`` has zero distance, and the
        norm is not differentiable there; such rows contribute the zero
        subgradient instead of producing a 0/0 NaN.

        Returns:
            numpy.ndarray: Gradient of shape (d,).

        Raises:
            RuntimeError: If the loss has not been evaluated yet.
        """
        if self._diff is None:
            raise RuntimeError("backward() called before the loss was evaluated")

        # Divide only where the distance is positive; other entries stay 0.
        unit = np.divide(
            self._diff,
            self._dist,
            out=np.zeros_like(self._diff),
            where=self._dist > 0,
        )
        return unit.sum(axis=0)

# Step-by-Step Derivation for the Gradient of Sum of L2 Norms

Let the list of points be $\{x_1, x_2, \dots, x_n\}$ and the variable $y$ be the point that minimizes the sum of L2 norms. The objective function is:

$$
f(y) = \sum_{i=1}^n \|y - x_i\|_2
$$

### Step 1: Expand the L2 Norm
The L2 norm of the difference between $y$ and a point $x_i$ is defined as:

$$
\|y - x_i\|_2 = \sqrt{\sum_{j=1}^d (y_j - x_{i,j})^2}
$$

where $y = (y_1, y_2, \dots, y_d)$ and $x_i = (x_{i,1}, x_{i,2}, \dots, x_{i,d})$.

Thus, the function $f(y)$ becomes:

$$
f(y) = \sum_{i=1}^n \sqrt{\sum_{j=1}^d (y_j - x_{i,j})^2}
$$

### Step 2: Differentiate the L2 Norm
To find the gradient of $f(y)$, we differentiate each term $\|y - x_i\|_2$ with respect to $y$. Using the chain rule:

$$
\frac{\partial}{\partial y_j} \|y - x_i\|_2 = \frac{\partial}{\partial y_j} \sqrt{\sum_{k=1}^d (y_k - x_{i,k})^2} 
= \frac{1}{2 \sqrt{\sum_{k=1}^d (y_k - x_{i,k})^2}} \cdot 2(y_j - x_{i,j})
$$

This simplifies to:

$$
\frac{\partial}{\partial y_j} \|y - x_i\|_2 = \frac{y_j - x_{i,j}}{\|y - x_i\|_2}
$$

### Step 3: Gradient for All Dimensions
The gradient of $\|y - x_i\|_2$ with respect to $y$ (a vector) is then:

$$
\nabla \|y - x_i\|_2 = \frac{y - x_i}{\|y - x_i\|_2}
$$

### Step 4: Sum Over All Points
Finally, the gradient of $f(y)$ is the sum of the gradients of each term:

$$
\nabla f(y) = \sum_{i=1}^n \nabla \|y - x_i\|_2 = \sum_{i=1}^n \frac{y - x_i}{\|y - x_i\|_2}
$$

Thus, the gradient of the sum of L2 norms is:

$$
\nabla f(y) = \sum_{i=1}^n \frac{y - x_i}{\|y - x_i\|_2}
$$


$$
\begin{aligned}
&\text{The intermediate steps of the derivative in Step 2, written out in full:} \\[4pt]
&\text{1. Expand the derivative explicitly:} \\
&\frac{\partial}{\partial y_j} \sqrt{\sum_{k=1}^d (y_k - x_{i,k})^2}
= \frac{\partial}{\partial y_j} \Big( \big( \sum_{k=1}^d (y_k - x_{i,k})^2 \big)^{\frac{1}{2}} \Big) \\[4pt]
&\text{2. Apply the chain rule:} \\
&\frac{\partial}{\partial y_j} \Big( \big( \sum_{k=1}^d (y_k - x_{i,k})^2 \big)^{\frac{1}{2}} \Big)
= \frac{1}{2} \Big( \sum_{k=1}^d (y_k - x_{i,k})^2 \Big)^{-\frac{1}{2}} \cdot \frac{\partial}{\partial y_j} \Big( \sum_{k=1}^d (y_k - x_{i,k})^2 \Big) \\[4pt]
&\text{3. Simplify the derivative of the inner sum:} \\
&\frac{\partial}{\partial y_j} \Big( \sum_{k=1}^d (y_k - x_{i,k})^2 \Big)
= 2(y_j - x_{i,j}) \\[4pt]
&\text{4. Substitute the simplified derivative back:} \\
&\frac{\partial}{\partial y_j} \sqrt{\sum_{k=1}^d (y_k - x_{i,k})^2}
= \frac{1}{2} \Big( \sum_{k=1}^d (y_k - x_{i,k})^2 \Big)^{-\frac{1}{2}} \cdot 2(y_j - x_{i,j}) \\[4pt]
&\text{5. Final simplification:} \\
&\frac{\partial}{\partial y_j} \sqrt{\sum_{k=1}^d (y_k - x_{i,k})^2}
= \frac{y_j - x_{i,j}}{\sqrt{\sum_{k=1}^d (y_k - x_{i,k})^2}}
\end{aligned}
$$

In [14]:
import numpy as np

def gradient_descent_l2(points, y_init, learning_rate=0.01, max_iter=1000, tol=1e-6):
    """
    Perform fixed-step gradient descent to minimize the sum of L2 norms.

    The objective is f(y) = sum_i ||y - points[i]||_2 and its gradient is
    sum_i (y - points[i]) / ||y - points[i]||_2. A point that coincides with
    the current iterate has zero distance (the norm is not differentiable
    there); it contributes the zero subgradient rather than a 0/0 NaN.

    Args:
        points (array-like): Shape (n, d), where n is the number of points and d is the dimension.
        y_init (array-like): Initial guess for y, of shape (d,). It is not modified.
        learning_rate (float): Step size for gradient descent.
        max_iter (int): Maximum number of iterations.
        tol (float): Stop once the length of an update step falls below this value.

    Returns:
        numpy.ndarray: The last iterate of y. If ``max_iter`` is exhausted
        without meeting ``tol``, the current iterate is returned silently.
        With no points, a float copy of ``y_init`` is returned unchanged.
    """
    points = np.asarray(points, dtype=float)
    # Float copy: the caller's array is left untouched.
    y = np.array(y_init, dtype=float)

    # Nothing to pull towards, so every y is already optimal.
    if points.size == 0:
        return y

    for i in range(max_iter):
        # Row-wise differences and their lengths (kept as a column so the
        # division below broadcasts across coordinates).
        diff = y - points
        dist = np.linalg.norm(diff, axis=1, keepdims=True)

        # Unit vectors from each point towards y; rows at zero distance stay 0.
        unit = np.divide(diff, dist, out=np.zeros_like(diff), where=dist > 0)
        gradient = unit.sum(axis=0)

        # Update y
        y_new = y - learning_rate * gradient
        step = np.linalg.norm(y_new - y)
        y = y_new

        # Check for convergence (after accepting the step, so the newest
        # iterate is the one returned).
        if step < tol:
            print(f"Converged in {i+1} iterations.")
            break

    return y

# Example usage: three 2-D sample points, starting the descent at the origin.
points = np.array([[1, 2], [3, 4], [5, 6]])  # n=3 points, d=2 coordinates each
y_init = np.zeros(2)  # float starting guess for y
optimal_y = gradient_descent_l2(points, y_init)
print("Optimal y:", optimal_y)


Optimal y: [2.99660991 4.00347191]
