In [1]:
# Plotting stack. Axes3D, cm and the ticker helpers are imported here but are
# not referenced in the cells visible in this part of the notebook.
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
import numpy as np
from sklearn.metrics import mean_squared_error
from random import random, seed
import sys
# Put the parent directory on the module search path; presumably that is where
# the project-local modules imported below (functions, load_data, classes) live.
sys.path.append("../")
import functions as f
# Apply the seaborn-style whitegrid theme that ships with matplotlib.
plt.style.use('seaborn-v0_8-whitegrid')
#plt.style.available
import seaborn as sns
import load_data as ld
import classes as cl

In [2]:
# Load the simple data set and draw a random starting point for the optimizers.
# Seed NumPy's global RNG first so that "Restart & Run All" reproduces the same
# beta_init on every run (and the same data as well, if load_simple_data draws
# from NumPy's global RNG -- TODO confirm against load_data).
np.random.seed(2023)
# NOTE(review): 100 and 0.1 are presumably the sample count and the noise level
# -- verify against load_data.load_simple_data.
X, y, x, a_true = ld.load_simple_data(100, 0.1)
# Initial beta: one standard-normal draw per column of X
beta_init = np.random.randn(X.shape[1])
print(X.shape[1])
print(a_true)

3
[0.88114956 0.82005649 0.73665888]


In [3]:
# Reference coefficient vectors from the project helpers, printed next to the
# random starting point so the iterative results can be compared against them.
# NOTE(review): the third argument of beta_Ridge is presumably the penalty
# strength; it is 0.5 here while the gradient-descent cells pass
# lambda_param=0.1, so the two Ridge results are not directly comparable.
betaOLS = f.beta_OLS(X, y)
betaRidge = f.beta_Ridge(X, y, 0.5)
for coefficients in (beta_init, betaOLS, betaRidge):
    print(coefficients)

[-1.22541231  1.12483019 -1.18774191]
[0.88378481 0.82957288 0.72425269]
[0.87919261 0.82554568 0.72457299]


In [4]:
# Task 1) plain gradient descent, without and with the ridge cost function.
# Keyword arguments shared by both runs.
plain_gd_settings = dict(
    learning_rate=0.01,
    epochs=100,
    optimizer='gd',
    gradient_method='analytical',
    lambda_param=0.1,
)

# OLS) cost_function is left at the class default
gd = cl.GradientDescent(X, y, beta_init, **plain_gd_settings)
optimized_gd_beta = gd.optimize()
print(optimized_gd_beta)
# Evaluate the fitted quadratic c0 + c1*x + c2*x**2 and report its MSE against y
c0, c1, c2 = optimized_gd_beta
ygd = c0 + c1*x + c2*x**2
print(mean_squared_error(y, ygd))

# Ridge) same settings, with the ridge cost function selected explicitly
rgd = cl.GradientDescent(X, y, beta_init, cost_function='ridge', **plain_gd_settings)
optimized_rgd_beta = rgd.optimize()
print(optimized_rgd_beta)
c0, c1, c2 = optimized_rgd_beta
yrgd = c0 + c1*x + c2*x**2
print(mean_squared_error(y, yrgd))

[0.26089698 0.95388038 0.89253865]
0.2731105568226006
[0.27542128 0.89161104 0.86313741]
0.2641180693645645


In [5]:
# Task 2) gradient descent with momentum, without and with the ridge cost
# function, for comparison against the plain gradient-descent results.
# Keyword arguments shared by both runs.
momentum_gd_settings = dict(
    learning_rate=0.01,
    epochs=100,
    optimizer='gd',
    gradient_method='analytical',
    lambda_param=0.1,
    momentum=0.9,
)

# OLS) cost_function is left at the class default
mgd = cl.GradientDescent(X, y, beta_init, **momentum_gd_settings)
optimized_mgd_beta = mgd.optimize()
print(optimized_mgd_beta)
# Evaluate the fitted quadratic m0 + m1*x + m2*x**2 and report its MSE against y
m0, m1, m2 = optimized_mgd_beta
ymgd = m0 + m1*x + m2*x**2
print(mean_squared_error(y, ymgd))

# Ridge) same settings, with the ridge cost function selected explicitly
rmgd = cl.GradientDescent(X, y, beta_init, cost_function='ridge', **momentum_gd_settings)
optimized_rmgd_beta = rmgd.optimize()
print(optimized_rmgd_beta)
m0, m1, m2 = optimized_rmgd_beta
yrmgd = m0 + m1*x + m2*x**2
print(mean_squared_error(y, yrmgd))

# Compared with the loss values of the plain gradient-descent runs (same
# learning rate and epoch count), momentum clearly converges faster.

[0.8888034  0.82885954 0.72834302]
0.01127364104208621
[0.80079571 0.75458433 0.72936325]
0.023232293923448836


In [None]:
#task 3) repeat for stochastic gradient descent and discuss the results with respect to batch size, number of epochs, etc.

In [6]:


# Longer runs (1000 epochs) from the same starting point, one per optimizer setup.
# Settings common to every run, and the extra penalty argument that only the
# GradientDescent runs receive.
long_run_settings = dict(learning_rate=0.01, epochs=1000, gradient_method='analytical')
full_batch_settings = dict(long_run_settings, lambda_param=0.1)

# Plain update rule with momentum
mgd = cl.GradientDescent(X, y, beta_init, optimizer='gd', momentum=0.9, **full_batch_settings)
optimized_mgd_beta = mgd.optimize()

# Adagrad without momentum
agd = cl.GradientDescent(X, y, beta_init, optimizer='adagrad', **full_batch_settings)
optimized_agd_beta = agd.optimize()

# Adagrad with momentum
magd = cl.GradientDescent(X, y, beta_init, optimizer='adagrad', momentum=0.9, **full_batch_settings)
optimized_magd_beta = magd.optimize()

# Stochastic gradient descent driven by the 'adam' optimizer (no lambda_param passed)
sgd = cl.StochasticGradientDescent(X, y, beta_init, optimizer='adam', **long_run_settings)
optimized_sgd_beta = sgd.optimize()

for fitted_beta in (optimized_mgd_beta, optimized_agd_beta, optimized_magd_beta, optimized_sgd_beta):
    print(fitted_beta)

[0.88390167 0.82956332 0.72419535]
[0.88378481 0.82957288 0.72425269]
[0.88378481 0.82957288 0.72425269]
[0.88378481 0.82957288 0.72425269]
[0.89224839 0.82003571 0.752102  ]


In [7]:
# Turn each coefficient vector into predictions of the quadratic
# b0 + b1*x + b2*x**2, then print the mean squared error against y for each.
b0, b1, b2 = optimized_mgd_beta
ymgd = b0 + b1*x + b2*x**2

b0, b1, b2 = optimized_agd_beta
yagd = b0 + b1*x + b2*x**2

b0, b1, b2 = optimized_magd_beta
ymagd = b0 + b1*x + b2*x**2

b0, b1, b2 = optimized_sgd_beta
ysgd = b0 + b1*x + b2*x**2

# Printed in the order: momentum GD, Adagrad, Adagrad with momentum, SGD.
# Author's observation: Adagrad converges more slowly; with more epochs it
# converges similarly to the others.
for predictions in (ymgd, yagd, ymagd, ysgd):
    print(mean_squared_error(y, predictions))

0.011165065284857561
0.01116505656129741
0.011165056561297406
0.01116505656129741
0.0137657053119865


In [None]:
# Plot the data together with the gradient-descent and SGD fits.
# Sort by x so that each fitted curve is drawn left to right as one clean line.
sort_inds = np.argsort(x)
fig, ax = plt.subplots()
# Draw the observations as markers: joining (presumably noisy) data points with
# a line produces a zigzag that hides the fitted curves.
ax.scatter(x[sort_inds], y[sort_inds], s=12, color='0.4', label='Datapoints')
# NOTE(review): ygd comes from the 100-epoch plain gradient-descent run in
# Task 1, while ysgd comes from the 1000-epoch SGD run, so the two curves were
# not trained for the same number of epochs.
ax.plot(x[sort_inds], ygd[sort_inds], label='Gradient Descent')
ax.plot(x[sort_inds], ysgd[sort_inds], label='Stochastic Gradient Descent')
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.set_title('Gradient descent fits vs. data')
ax.legend()
plt.show()