# Assignment-1 : Mohamed Alaa Ali Mahmoud , SEC2 , BN 19

## Importing the necessary modules and setting a seed

In [None]:
from random import Random
import numpy as np
from matplotlib.pyplot import figure
from math import sqrt
# Fixed seed handed to random.Random in gen_rand_pts so every run generates the same points.
SEED = 5

## Generating a set of random points in the 2-dimensional space

In [None]:
def gen_rand_pts(N=1000, seed=None):
    """
    Generate N pseudo-random 2-D points with coordinates drawn uniformly between 0 and 1.

    A single generator is used: all N x-coordinates are drawn first, then all N
    y-coordinates, so the result is fully determined by (N, seed).

    :param N: (int, optional) default = 1000,
        number of random generated points, must be positive
    :param seed: (optional) default = None,
        seed handed to random.Random; when None the module-level SEED is used
    :return: tuple of 2 lists (x-coordinates, y-coordinates), each of length N
    :raises ValueError: if N is zero or negative
    """
    if N <= 0:
        raise ValueError("Wrong value for N")
    # Fall back to the notebook-wide seed so existing calls keep producing the same points.
    rand_gen = Random(SEED if seed is None else seed)
    xs = [rand_gen.uniform(0, 1) for _ in range(N)]
    ys = [rand_gen.uniform(0, 1) for _ in range(N)]
    return xs, ys

## Evaluating the loss function

In [None]:
def loss(data_x, data_y, x_p, y_p):
    """
    Evaluate the loss of a candidate point against the data set.

    The loss is the mean Euclidean distance between (x_p, y_p) and the data points:
    (1/n) * sum(sqrt((x_i - x_p)^2 + (y_i - y_p)^2)).

    :param data_x: (list of floats) x-coordinates of the data points
    :param data_y: (list of floats) y-coordinates of the data points, same length as data_x
    :param x_p: (float) x-coordinate of the candidate point
    :param y_p: (float) y-coordinate of the candidate point
    :return: loss (float): the mean distance between the point (x_p, y_p) and the data points
    :raises ValueError: if the data set is empty or the coordinate lists differ in length
    """
    n = len(data_x)
    if n == 0:
        raise ValueError("data_x and data_y must not be empty")
    if len(data_y) != n:
        # zip() would silently drop the unmatched tail while the mean still divides by len(data_x).
        raise ValueError("data_x and data_y must have the same length")
    n_inv = 1 / n
    return n_inv * sum(
        ((x_i - x_p) ** 2 + (y_i - y_p) ** 2) ** 0.5
        for x_i, y_i in zip(data_x, data_y)
    )

## Numeric Conceptualization of Derivatives
It comes from the basic definition of the derivative
<br><br>
$\frac{\partial \mathbb{L}}{\partial x} = \lim_{h \to 0} \frac{\mathbb{L}(x+h) - \mathbb{L}(x)}{h}$
<br><br>
By replacing the infinitesimal $h$ with a small finite value, we get an approximation of the derivative that can be computed numerically

In [None]:
def find_min(data_x: list, data_y: list, x_p=5, y_p=5, h=0.001, delta=0.01, epochs=3000):
    """
    Minimise the loss over (x_p, y_p) with gradient descent, estimating the gradient
    by forward finite differences of the loss.

    Parameters:
    data_x: list
        A list of x values from the dataset to be used as reference points.
    data_y: list
        The corresponding list of y values from the dataset to be used as reference points.
    x_p: float, default=5
        The starting value for the x parameter in the optimization.
    y_p: float, default=5
        The starting value for the y parameter in the optimization.
    h: float, default=0.001
        The step size for approximating the gradient.
    delta: float, default=0.01
        The learning rate for the gradient descent algorithm.
    epochs: int, default=3000
        The number of times to iterate through the optimization algorithm.
    Returns:
    --------
    epoch_losses: list of floats
        The loss at the start of each epoch, i.e. before that epoch's update.
    lst_xp: list of floats
        The value of the x parameter at the start of each epoch.
    lst_yp: list of floats
        The value of the y parameter at the start of each epoch.

    All three lists have length `epochs`; the parameters produced by the last
    update are not appended.
    """
    epoch_losses = []
    lst_xp = []
    lst_yp = []

    for _ in range(epochs):
        # Evaluate the loss at the current point once; it is both the recorded
        # value and the baseline of the two finite differences below.
        current_loss = loss(data_x=data_x, data_y=data_y, x_p=x_p, y_p=y_p)
        epoch_losses.append(current_loss)
        lst_xp.append(x_p)
        lst_yp.append(y_p)

        # Forward differences: (L(p + h) - L(p)) / h along each axis.
        dloss_dx = (loss(data_x, data_y, x_p + h, y_p) - current_loss) / h
        dloss_dy = (loss(data_x, data_y, x_p, y_p + h) - current_loss) / h

        # Both derivatives are computed before either parameter is moved.
        x_p -= delta * dloss_dx
        y_p -= delta * dloss_dy

    return epoch_losses, lst_xp, lst_yp

## Hyper Params Tuning
Here we tune the following hyperparameters to find the best fit for our data <br>
<ul>
    <li>first case: xp, yp = negative</li>
    <li>second case: xp, yp ~= 0.5, 0.5</li>
    <li>third case: h is a large number</li>
    <li>fourth case: delta is a large number</li>
    <li>fifth case: delta is a small number</li>
    <li>Epochs are always 3000</li>
</ul>

In [None]:
def find_case(data_x:list, data_y:list, x_p=5, y_p=5, h=0.001, delta=0.01, epochs=3000):
    """
    Run a single hyper-parameter case.

    Every argument is forwarded unchanged to find_min, and find_min's result
    is returned as-is.
    """
    return find_min(
        data_x=data_x,
        data_y=data_y,
        x_p=x_p,
        y_p=y_p,
        h=h,
        delta=delta,
        epochs=epochs,
    )

In [None]:
# Data set shared by every case: gen_rand_pts() called with its defaults.
data_x , data_y = gen_rand_pts()
# Starting points: index 0 is the negative start (case 1),
# index 1 is the start close to (0.5, 0.5) (case 2).
xp = [-5, 0.48]
yp = [-4, 0.48]
# Large finite-difference step intended for case 3.
h = 100
# Learning rates: index 0 is the large one (case 4), index 1 the small one (case 5).
delta = [100, 0.00001]

In [None]:
# One optimisation run per case; any argument not given keeps find_case's default.
# Cases 1 and 2 vary the starting point.
losses1, xp_1, yp_1 = find_case(data_x=data_x, data_y=data_y, x_p=xp[0], y_p=yp[0])
losses2, xp_2, yp_2 = find_case(data_x=data_x, data_y=data_y, x_p=xp[1], y_p=yp[1])
# Case 3 uses the large finite-difference step defined above as `h`
# instead of repeating the literal value.
losses3, xp_3, yp_3 = find_case(data_x=data_x, data_y=data_y, h=h)
# Cases 4 and 5 vary the learning rate (large, then small).
losses4, xp_4, yp_4 = find_case(data_x=data_x, data_y=data_y, delta=delta[0])
losses5, xp_5, yp_5 = find_case(data_x=data_x, data_y=data_y, delta=delta[1])

In [None]:
# Scatter plot of the generated data set.
fig = figure(figsize=(10, 10))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlim(left=-1, right=2)
# bottom/top were swapped before, which drew the y-axis inverted; both axes now span -1..2 ascending.
ax.set_ylim(bottom=-1, top=2)
ax.scatter(data_x, data_y)
ax.set_title("Visualizing the data points")
ax.set_xlabel("X")
ax.set_ylabel("Y")

## Visualizing the first case x_p, y_p are negatives

In [None]:
# Case 1 (negative starting point): loss curve on the left, parameter traces on the right.
fig2 = figure(figsize=(20, 10))

# Left panel: loss per epoch, annotated with the observation about convergence.
ax2 = fig2.add_subplot(1, 2, 1)
ax2.plot(losses1)
ax2.set(title="Loss versus Epochs Case 1", xlabel="Epochs", ylabel="Loss")
ax2.text(
    800,
    4,
    'We Notice that the curve of \n'
    'losses is decreasing with the epochs number\n'
    'asymptotically,\nwhere the value of the loss gets\n nearly constant around the 560 Epoch',
    style='italic',
    fontsize=15,
    color="green",
)
# Dashed red reference line 0.01 below the smallest recorded loss.
ax2.hlines(
    np.array(losses1).min() - 0.01,
    xmin=0,
    xmax=3000,
    colors="r",
    linestyles="--",
)

# Right panel: x_p and y_p per epoch.
ax3 = fig2.add_subplot(1, 2, 2)
ax3.plot(xp_1, label="x_p")
ax3.plot(yp_1, label="y_p")
ax3.legend()
ax3.set(
    title="X_P, Y_P where they are initialized negative values at the beginning versus Epochs Case 1",
    xlabel="Epochs",
    ylabel="X_P, Y_P",
)
ax3.text(
    700,
    -4,
    'We Notice that the curve of \n'
    'X_P and Y_P is increasing with the epochs number\n'
    'asymptotically\n,where the value of the X_P,Y_P gets\n nearly constant around the 560 Epoch\n'
    'and they overlap each other\n due to the symmetry of our problem',
    style='italic',
    fontsize=15,
    color="green",
)
# Dashed red reference line 0.01 above the largest recorded x_p.
ax3.hlines(
    np.array(xp_1).max() + 0.01,
    xmin=0,
    xmax=3000,
    colors="r",
    linestyles="--",
)

## Visualizing the second case x_p, y_p are close to the ground truth

In [None]:
# Case 2 (start close to the optimum): loss curve on the left, parameter traces on the right.
fig_case2 = figure(figsize=(20, 10))

# Left panel: loss per epoch plus a dashed red line just below the smallest loss.
ax_case2 = fig_case2.add_subplot(1, 2, 1)
ax_case2.plot(losses2)
ax_case2.set(title="Loss versus Epochs Case 2", xlabel="Epochs", ylabel="Loss")
ax_case2.hlines(
    np.array(losses2).min() - 0.000001,
    xmin=0,
    xmax=3000,
    colors="r",
    linestyles="--",
)

# Right panel: x_p and y_p per epoch.
ax_case2_2 = fig_case2.add_subplot(1, 2, 2)
ax_case2_2.plot(xp_2, label="x_p")
ax_case2_2.plot(yp_2, label="y_p")
ax_case2_2.legend()
ax_case2_2.set(
    title="X_P, Y_P where they are initialized close to the ground truth values versus Epochs Case 2",
    xlabel="Epochs",
)
ax_case2_2.set_ylabel("X_P, Y_P")

Here, for X_P and Y_P, the curves show that there is always a difference of about 0.1 between them, but we are still close to our best solution. <br>
We also notice that the loss dropped very fast, which is expected since we started from a guess that was very close to the ground truth, <br>
so we converged within the first 300 epochs, which is faster than in the first case.

## Visualizing Case 3 : h is set to a large number

In [None]:
# Case 3 (large h): loss curve on the left, parameter traces on the right.
fig_case3 = figure(figsize=(20, 10))

# Left panel: loss per epoch plus a dashed red line just below the smallest loss.
ax_case3 = fig_case3.add_subplot(1, 2, 1)
ax_case3.plot(losses3)
ax_case3.set(title="Loss versus Epochs Case 3", xlabel="Epochs", ylabel="Loss")
ax_case3.hlines(
    np.array(losses3).min() - 0.000001,
    xmin=0,
    xmax=3000,
    colors="r",
    linestyles="--",
)

# Right panel: x_p and y_p per epoch.
ax_case3_2 = fig_case3.add_subplot(1, 2, 2)
ax_case3_2.plot(xp_3, label="x_p")
ax_case3_2.plot(yp_3, label="y_p")
ax_case3_2.legend()
ax_case3_2.set(
    title="X_P, Y_P where h is initialized to a large number versus Epochs Case 3",
    xlabel="Epochs",
)
ax_case3_2.set_ylabel("X_P, Y_P")

This is a total mess due to the following reasons:<br>
    - In gradient descent, the step size of the update to the parameters is determined by the derivative of the loss function with respect to the parameters.
    - Since we set h to a very large number this means we are making the resulting values for the derivatives very small or even zero
    - so we update with a very small step, and therefore we converge very slowly or fail to converge at all

## visualizing the fourth case DELTA is very large

In [None]:
# Case 4 (large delta): loss curve on the left, parameter traces on the right.
fig_case4 = figure(figsize=(20, 10))

# Left panel: loss per epoch plus a dashed red line just below the smallest loss.
ax_case4 = fig_case4.add_subplot(1, 2, 1)
ax_case4.plot(losses4)
ax_case4.set(title="Loss versus Epochs Case 4", xlabel="Epochs", ylabel="Loss")
ax_case4.hlines(
    np.array(losses4).min() - 0.000001,
    xmin=0,
    xmax=3000,
    colors="r",
    linestyles="--",
)

# Right panel: x_p and y_p per epoch.
ax_case4_2 = fig_case4.add_subplot(1, 2, 2)
ax_case4_2.plot(xp_4, label="x_p")
ax_case4_2.plot(yp_4, label="y_p")
ax_case4_2.legend()
ax_case4_2.set(
    title="X_P, Y_P where delta is initialized to a large number versus Epochs Case 4",
    xlabel="Epochs",
)
ax_case4_2.set_ylabel("X_P, Y_P")

Again, very messy curves, and this was also predictable<br>
    - Divergence in the loss curve: when the learning rate is too high, the weights can change too much with each update, causing the loss function to increase instead of decreasing. This results in the model's predictions becoming worse and worse over time, rather than improving.
    - This is called the problem of exploding gradients

## Visualizing the fifth case delta small number

In [None]:
# Case 5 (small delta): loss curve on the left, parameter traces on the right.
fig_case5 = figure(figsize=(20, 10))

# Left panel: loss per epoch plus a dashed red line just below the smallest loss.
ax_case5 = fig_case5.add_subplot(1, 2, 1)
ax_case5.plot(losses5)
ax_case5.set(title="Loss versus Epochs Case 5", xlabel="Epochs", ylabel="Loss")
ax_case5.hlines(
    np.array(losses5).min() - 0.000001,
    xmin=0,
    xmax=3000,
    colors="r",
    linestyles="--",
)

# Right panel: x_p and y_p per epoch.
ax_case5_2 = fig_case5.add_subplot(1, 2, 2)
ax_case5_2.plot(xp_5, label="x_p")
ax_case5_2.plot(yp_5, label="y_p")
ax_case5_2.legend()
ax_case5_2.set(
    title="X_P, Y_P where delta is initialized to a small number versus Epochs Case 5",
    xlabel="Epochs",
)
ax_case5_2.set_ylabel("X_P, Y_P")

Slow convergence: When the learning rate is too small, the weights are updated very slowly, which can lead to slow convergence.

# Conclusion

From the previous analysis we found that : <br>
    -Setting the learning rate to a large number can cause **exploding gradients** problem <br>
    -Setting the learning rate to a very small number can cause the algorithm to take very long time to converge <br>
    -Setting the value of H to a very large number will lead to the **vanishing gradients** problem<br>
    -initialization can play a good role in making the learning algorithm converge faster <br>

So, it is always best to tune the hyperparameters using reasonable values, since searching in the wrong direction can
cost a lot of time and effort.<br>
Many search techniques can be used to find the best combination of parameters for a machine learning model, such as
**GridSearching** and **RandomizedSearching**.


