# Imports, config and setting global variables

In [1]:
import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# from sklearn.linear_model import LinearRegression, Lasso, Ridge

In [2]:
import logging

if not os.path.exists('logs'):
    os.makedirs('logs')

LOGGING_FILE = os.path.join('logs', 'experiments.log')

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', 
                    datefmt='%Y-%m-%d %H:%M:%S', 
                    handlers=[logging.StreamHandler(), 
                              logging.FileHandler(LOGGING_FILE)])


In [188]:
N_SAMPLES = 10_000

MIN_LAMBDA_LOG_10 = -6
MAX_LAMBDA_LOG_10 = 6
NUM_LAMBDAS = 100

logging.info(f'Generating {N_SAMPLES} samples')
logging.info(f'''Using {NUM_LAMBDAS} lambdas equally spaced in log space from 10^({MIN_LAMBDA_LOG_10}) to 10^{MAX_LAMBDA_LOG_10}''')

LAMBDAS = np.logspace(MIN_LAMBDA_LOG_10, MAX_LAMBDA_LOG_10, NUM_LAMBDAS)


2024-12-08 00:43:02 - INFO - Generating 10000 samples
2024-12-08 00:43:02 - INFO - Using 100 lambdas equally spaced in log space from 10^(-6) to 10^6


## Helper functions

In [189]:
def empirical_risk(X, y, theta):
    # In the slides we don't divide by num samples, but it did not matter
    return np.mean((y - X @ theta)**2)


In [190]:
def filter_lambda_closest(df, target_lambda):
    """Filter the dataframe to the row with the lambda closest to the target_lambda
    
    We need this for replicating slide 327 (theta getting pulled to the origin)
    """
    
    closest_row = df.iloc[(df['lambda'] - target_lambda).abs().argsort()[:1]]
    return closest_row

# Defining Data Generating Processes 

## (Slide 327/337) y = 3x_1 - 2x_2 + epsilon. epsilon ~ N(0, 1)

In [191]:
# 2 d uniform random variables
X = np.random.rand(N_SAMPLES, 2)

theta_true = np.array([3, -2])

y = X @ theta_true + np.random.randn(N_SAMPLES)

# Contour plots

### Contours of the unregularized loss function

In [192]:
min_theta, max_theta = -5, 5
values_for_each_theta = 50

theta_1_vals = np.linspace(min_theta, max_theta, values_for_each_theta)
theta_2_vals = np.linspace(min_theta, max_theta, values_for_each_theta)

theta_pairs = np.array(np.meshgrid(theta_1_vals, theta_2_vals)).T.reshape(-1, 2)

df_unreg_risk = pd.DataFrame(theta_pairs, columns=['theta_1', 'theta_2'])

In [193]:
df_unreg_risk["emp_risk"] = df_unreg_risk[['theta_1', 'theta_2']].apply(lambda x:
    empirical_risk(X, y, x), axis=1)

In [181]:
fig_cont_plot = go.Figure(data=go.Contour(
    z=df_unreg_risk.pivot(index='theta_2', columns='theta_1', values='emp_risk').values,
    x=theta_1_vals,
    y=theta_2_vals,
    colorscale='Viridis',
    showscale=False  # Remove the color bar
))

fig_cont_plot.update_layout(
    title='Contour plot of empirical risk',
    xaxis_title='theta_1',
    yaxis_title='theta_2'
)

# add horizontal and vertical lines at 0s
fig_cont_plot.add_shape(
    type='line',
    x0=min_theta, y0=0, x1=max_theta, y1=0,
    line=dict(color='white', width=1)
)

fig_cont_plot.add_shape(
    type='line',
    x0=0, y0=min_theta, x1=0, y1=max_theta,
    line=dict(color='white', width=1)
)

# add a point for the true theta
fig_cont_plot.add_trace(go.Scatter(
    x=[theta_true[0]], y=[theta_true[1]],
    mode='markers',
    marker=dict(size=10, color='red'),
    name=f'theta_true = {theta_true}'
))

# fig_cont_plot.add_trace(go.Scatter(
#     x=[],
#     mode='markers',
#     marker=dict(size=10, color='white'),
#     name=f'theta_true = {theta_true}'
# ))


# fig.show()

# Finding thetas

In [182]:
def get_thetas(X, y, lmbda, add_intercept=False):
    """
    Computes the optimal thetas for the linear regression problem with L2 regularization.
    
    Args:
        X (np.ndarray): Data (intercept column will be added later)
        y (np.ndarray): Target
        lmbda (float): Regularization parameter
        add_intercept (bool): If True, an intercept column will be added to X
    
    ToDO:
    - Here we also penalize the intercept term, I'm not sure if this is what we 
    want for our experiments. Need to figure this our (or preferably, try both)
    
    Note:
        - (This is unrelated to our goal but may be interesting): 
        performing QR decomposition before solving the normal equations did not
        provide any speedup even for large datasets. (We had just learned about
        this from Optimization course and I wanted to benchmark it)
    """
    if add_intercept:
        X = np.c_[np.ones((n_samples, 1)), X]

    n_samples, n_features = X.shape

    LHS = X.T.dot(X) + lmbda*np.eye(n_features)
    RHS = X.T.dot(y)
    
    return np.linalg.solve(LHS, RHS)

In [183]:
theta_analytic = get_thetas(X, y, 0, add_intercept=False)

# assert dist between theta_analytic and theta_true is small
assert np.linalg.norm(theta_analytic - theta_true) < 1e-1, f'Analytic solution is quite far {theta_analytic} != {theta_true}'

# Running the experiments

In [184]:
theta_dict = {
    lmbda: get_thetas(X, y, lmbda) for lmbda in LAMBDAS
}

In [185]:
df_thetas_per_lambda = pd.DataFrame(theta_dict).T

df_thetas_per_lambda.reset_index(inplace=True)


df_thetas_per_lambda.rename(columns={"index": "lambda", 0: 'theta_1', 1: 'theta_2'}, 
          inplace=True)
df_thetas_per_lambda["L1_norm"] = df_thetas_per_lambda[['theta_1', 'theta_2']].apply(lambda x: 
    np.linalg.norm(x, ord=1), axis=1)
df_thetas_per_lambda["L2_norm"] = df_thetas_per_lambda[['theta_1', 'theta_2']].apply(lambda x: 
    np.linalg.norm(x, ord=2), axis=1)	


### Adding thetas to the contour plot

In [186]:
lambdas_do_display = [0.1, 1, 2.5, 5, 10, 20, 100, 1000]

for lmbda in lambdas_do_display:
    df_thetas_per_lambda_lmbda = filter_lambda_closest(df_thetas_per_lambda, lmbda)
    fig_cont_plot.add_trace(go.Scatter(
        x=df_thetas_per_lambda_lmbda['theta_1'],
        y=df_thetas_per_lambda_lmbda['theta_2'],
        mode='markers',
        marker=dict(size=10, color='green'),
        name=f'lambda={lmbda}'
    ))
    
fig_cont_plot.show()

# Plotting the results

In [187]:

fig_rel = px.scatter(df_thetas_per_lambda, x='lambda', y=['L2_norm'], 
                 title='L2 norm of the coefficients (huge range for lambdas)', 
                 labels={'value': 'Norm', 'variable': 'Lambda'},
                 log_x=True)
fig_rel.show()

### Relationship type

In [103]:
df_thetas_per_lambda

Unnamed: 0,lambda,theta_1,theta_2,L1_norm,L2_norm
0,0.001000,2.977975,-1.985650,4.963625,3.579265
1,0.001150,2.977974,-1.985649,4.963624,3.579265
2,0.001322,2.977974,-1.985649,4.963623,3.579264
3,0.001520,2.977973,-1.985648,4.963622,3.579263
4,0.001748,2.977973,-1.985648,4.963620,3.579262
...,...,...,...,...,...
95,572.236766,1.922913,-1.027402,2.950315,2.180172
96,657.933225,1.832132,-0.949165,2.781298,2.063401
97,756.463328,1.739218,-0.870188,2.609406,1.944764
98,869.749003,1.644935,-0.791334,2.436269,1.825383


It doesn't make much sense to look into Spearman and Kendall Tao since the relationship would always be monotonic 

In [141]:
import numpy as np
from sklearn.metrics import r2_score
from scipy.stats import linregress, pearsonr

def evaluate_relationships(X, Y, relationships=["linear", "quadratic", "cubic", "log", "exp", "sqrt"]):
    """
    Evaluate the strength of various relationships between two features using R^2 scores.
    
    Returns:
        dict: A dictionary where keys are relationship names and values are R^2 scores.
    """
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.linregress.html
    results = {}
    
    def fit_and_score(transform_X=None, transform_Y=None, degree=None):
        transformed_X = transform_X(X) if transform_X else X
        transformed_Y = transform_Y(Y) if transform_Y else Y
                
        if degree:  # Polynomial fit
            coeffs = np.polyfit(transformed_X, transformed_Y, degree)
            Y_pred = np.polyval(coeffs, transformed_X)
            pearson, p_val = pearsonr(transformed_X, transformed_Y)

        else:  # Linear fit
            slope, intercept, pearson, p_val, _ = linregress(transformed_X, transformed_Y)
            Y_pred = slope * transformed_X + intercept

        return {"R2": r2_score(Y, Y_pred), "pearson": pearson, "p_val": p_val}


    transformations = {
        "linear": {"transform_X": None, "transform_Y": None, "degree": None},
        "quadratic": {"transform_X": None, "transform_Y": None, "degree": 2},
        "cubic": {"transform_X": None, "transform_Y": None, "degree": 3},
        "log": {"transform_X": np.log, "transform_Y": None, "degree": None},
        "exp": {"transform_X": None, "transform_Y": np.log, "degree": None},
        "sqrt": {"transform_X": np.sqrt, "transform_Y": None, "degree": None},
    }

    for relationship, params in transformations.items():
        results[relationship] = fit_and_score(**params)

    return results


In [142]:
pd.DataFrame(evaluate_relationships(df_thetas_per_lambda['lambda'], df_thetas_per_lambda['L2_norm'])).T

Unnamed: 0,R2,pearson,p_val
linear,0.962134,-0.980884,1.768141e-71
quadratic,0.998526,-0.980884,1.768141e-71
cubic,0.999944,-0.980884,1.768141e-71
log,0.464395,-0.681465,6.029588e-15
exp,-25.271641,-0.994129,1.823606e-96
sqrt,0.968878,-0.984316,1.1809450000000002e-75


In [143]:
fig_rel

# Conclusion


This DGP doesn't really let us observe any interesting results, I'll try it out with 
more complex DGPs and see if we can observe anything