# Imports, config and setting global variables

In [None]:
import os
import numpy as np
import pandas as pd
import plotly.express as px

# from sklearn.linear_model import LinearRegression, Lasso, Ridge

In [53]:
import logging

if not os.path.exists('logs'):
    os.makedirs('logs')

LOGGING_FILE = os.path.join('logs', 'experiments.log')

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', 
                    datefmt='%Y-%m-%d %H:%M:%S', 
                    handlers=[logging.StreamHandler(), 
                              logging.FileHandler(LOGGING_FILE)])


In [102]:
N_SAMPLES = 100_000

MIN_LAMBDA_LOG_10 = -4
MAX_LAMBDA_LOG_10 = 4
NUM_LAMBDAS = 500

logging.info(f'Generating {N_SAMPLES} samples')
logging.info(f'''Using {NUM_LAMBDAS} lambdas equally spaced in log space from 10^(-{MIN_LAMBDA_LOG_10}) to 10^{MAX_LAMBDA_LOG_10}''')

LAMBDAS = np.logspace(MIN_LAMBDA_LOG_10, MAX_LAMBDA_LOG_10, NUM_LAMBDAS)


2024-12-02 23:42:22 - INFO - Generating 100000 samples
2024-12-02 23:42:22 - INFO - Using 500 lambdas equally spaced in log space from 10^(--4) to 10^4


# Defining Data Generating Processes 

## (Slide 327/337) y = 3x_1 - 2x_2 + epsilon. epsilon ~ N(0, 1)

In [79]:
X = np.random.randn(N_SAMPLES, 2)
y = 3*X[:,0] + 2*X[:,1] + np.random.randn(N_SAMPLES)

## Others - ToDo

# Finding thetas

In [80]:
def get_thetas(X, y, lmbda):
    """
    Computes the optimal thetas for the linear regression problem with L2 regularization.
    
    Args:
        X (np.ndarray): Data (intercept column will be added later)
        y (np.ndarray): Target
        lmbda (float): Regularization parameter
    
    ToDO:
    - Here we also penalize the intercept term, I'm not sure if this is what we 
    want for our experiments. Need to figure this our (or preferably, try both)
    
    Note:
        - (This is unrelated to our goal but may be interesting): 
        performing QR decomposition before solving the normal equations did not
        provide any speedup even for large datasets. (We had just learned about
        this from Optimization course and I wanted to benchmark it)
    """
    n_samples, n_features = X.shape
    X_design = np.c_[np.ones((n_samples, 1)), X]
    
    LHS = X_design.T.dot(X_design) + lmbda*np.eye(n_features + 1)
    RHS = X_design.T.dot(y)
    
    return np.linalg.solve(LHS, RHS)

# Running the experiments

In [82]:
beta_dict = {
    lmbda: get_thetas(X, y, lmbda) for lmbda in LAMBDAS
}

In [None]:
df = pd.DataFrame(beta_dict).T

df.reset_index(inplace=True)
df.rename(columns={"index":"lambda", 0: 'intercept', 1: 'beta_1', 2: 'beta_2'}, 
          inplace=True)

df["L1_norm"] = df[['intercept', 'beta_1', 'beta_2']].apply(lambda x: 
    np.linalg.norm(x, ord=1), axis=1)
df["L2_norm"] = df[['intercept', 'beta_1', 'beta_2']].apply(lambda x: 
    np.linalg.norm(x, ord=2), axis=1)	


Unnamed: 0,lambda,intercept,beta_1,beta_2,L1_norm,L2_norm
0,0.000100,-0.001231,3.000149,1.998213,4.999593,3.604684
1,0.000104,-0.001231,3.000149,1.998213,4.999593,3.604684
2,0.000108,-0.001231,3.000149,1.998213,4.999593,3.604684
3,0.000112,-0.001231,3.000149,1.998213,4.999593,3.604684
4,0.000116,-0.001231,3.000149,1.998213,4.999593,3.604684
...,...,...,...,...,...,...
495,8627.237292,-0.000001,2.763865,1.840705,4.604572,3.320715
496,8951.664721,0.000038,2.755704,1.835265,4.591007,3.310907
497,9288.292250,0.000078,2.747286,1.829654,4.577019,3.300790
498,9637.578664,0.000119,2.738606,1.823869,4.562594,3.290359


# Plotting the results

In [97]:

fig = px.scatter(df, x='lambda', y=['L2_norm'], 
                 title='L2 norm of the coefficients (huge range for lambdas)', 
                 labels={'value': 'Norm', 'variable': 'Lambda'})
fig.show()

In [101]:
# set xlim to 0, 1
fig.update_layout(title='L2 norm of the coefficients (small range for lambdas)')

fig.update_xaxes(range=[0, 1])
fig.update_yaxes(range=[3.6, 3.65])
fig.show()

# Conclusion

This DGP doesn't really let us observe any interesting results, I'll try it out with 
more complex DGPs and see if we can observe anything