In [None]:
import numpy as np
import pandas as pd
import pymc as pm
import scipy.stats as stats

In [6]:
# Load the NFL standings table from the course repository (needs network access).
df = pd.read_csv('https://raw.githubusercontent.com/Jazli14/sta365/main/nfl_standings.csv')

# Recode the outcome labels as 1/0 and assign the result back to the column.
# Calling ``df['playoffs'].replace(..., inplace=True)`` operates on an
# intermediate Series: pandas warns about that pattern, and with copy-on-write
# enabled it no longer modifies ``df`` at all, leaving the labels as strings.
df['playoffs'] = df['playoffs'].replace({'Playoffs': 1, 'No Playoffs': 0})

# Single-column DataFrames, kept under their original names.
X = df[['wins']]
y = df[['playoffs']]

# The model is given plain 1-D arrays so the linear predictor and the observed
# outcome have the same, unambiguous shape.  ``dtype=int`` fails loudly if any
# label other than 'Playoffs' / 'No Playoffs' was left unrecoded.
wins = X['wins'].to_numpy(dtype=float)
made_playoffs = y['playoffs'].to_numpy(dtype=int)

with pm.Model() as model:
    # Standard normal priors for the intercept and the slope on wins
    b0 = pm.Normal('b0', mu=0, sigma=1)
    b1 = pm.Normal('b1', mu=0, sigma=1)

    # Linear predictor on the log-odds scale
    mu = b0 + b1 * wins

    # Bernoulli likelihood; ``logit_p`` applies the inverse-logit link to ``mu``
    Y_obs = pm.Bernoulli('Y_obs', logit_p=mu, observed=made_playoffs)

with model:
    # Draw posterior samples with PyMC's default sampler settings
    idata = pm.sample()


# Part 2

## Ridge Regression

$$
\text{Posterior} \propto \text{Prior} \times \text{Likelihood}
$$

$$
\Rightarrow \prod_{i=1}^{n} \frac{1}{\sqrt{2\pi}\,s_i}e^{\frac{-1}{2}(\frac{\beta_i}{s_i})^2} \prod_{k=1}^n \frac{1}{\sqrt{2\pi}} e^{\frac{-1}{2}(y_k - x_k^{\top}\beta)^2}
$$


$$
= (\frac{1}{2\pi})^n \prod_{i=1}^{n} \frac{1}{s_i} e^{\frac{-1}{2}(\frac{\beta_i}{s_i})^2} \prod_{k=1}^{n} e^{\frac{-1}{2} (y_k - x_k^{\top}\beta)^2}
$$

$$\text{Apply log to the equation}$$

$$
\propto \sum_{i=1}^n \log{[e^{\frac{-1}{2}(\frac{\beta_i}{s_i})^2}]} + \sum_{k=1}^n \log(e^{\frac{-1}{2}(y_k - x_k^{\top}\beta)^2})
$$

$$
\propto \sum_{i=1}^n -\frac{1}{2}(\frac{1}{s_i})^2 \beta_i^2 + \sum_{k=1}^n -\frac{1}{2} (y_k-x_k^\top\beta)^2
$$

$$
\propto - \sum_{k=1}^n \frac{1}{2}(y_k -x_k^\top\beta)^2 - \frac{1}{2} \sum_{i=1}^n (\frac{1}{s_i})^2 \beta_i^2
$$
$$\text{The negative of this term resembles the ridge objective } \sum_{k=1}^n \frac{1}{2}(y_k-x_k^\top\beta)^2 + \lambda\sum_{i=1}^{n}\beta_i^2
$$
$$\text{As we treat } \frac{1}{2}\big(\frac{1}{s_i}\big)^2 \text{ (with a common prior scale } s_i = s \text{) to be equal to } \lambda$$


## Lasso Regression

$$
\text{Posterior} \propto \text{Prior} \times \text{Likelihood}
$$

$$
\Rightarrow \prod_{i=1}^{n} \frac{1}{2s_i}e^{-\frac{|\beta_i|}{s_i}} \prod_{k=1}^n \frac{1}{\sqrt{2\pi}} e^{\frac{-1}{2}(y_k - x_k^{\top}\beta)^2}
$$

$$
\propto \sum_{i=1}^n \log\big({\frac{1}{2s_i}e^\frac{-|\beta_i|}{s_i}}\big) + \sum_{k=1}^n \log(\frac{1}{\sqrt{2\pi}} e^{\frac{-1}{2}(y_k - x_k^{\top}\beta)^2})
$$

$$
\propto \sum_{i=1}^n [\log(\frac{1}{2s_i}) + \frac{-|\beta_i|}{s_i}]+ \sum_{k=1}^{n} [\log(\frac{1}{\sqrt{2\pi}}) -\frac{1}{2}(y_k-x_k^\top\beta)^2]
$$

$$
\propto \sum_{i=1}^n \log(\frac{1}{2s_i}) + \sum_{i=1}^n -\frac{|\beta_i|}{s_i} - \sum_{k=1}^n \frac{1}{2} (y_k - x_k^\top\beta)^2
$$


$$\text{The negative of this term now resembles the lasso objective } \sum_{k=1}^n \frac{1}{2}(y_k-x_k^\top\beta)^2 + \lambda\sum_{i=1}^{n}|\beta_i|
$$
$$\text{As the term } \frac{1}{s_i}\text{ (with a common prior scale } s_i = s \text{, after negating the log-posterior) is equal to }\lambda
$$

## Explain "Bayesians do not optimize posterior distributions, they sample from them; but, the posterior distributions are nonetheless 'regularizations' of the likelihood through the prior."

In Bayesian statistics, the emphasis lies in estimating the complete posterior distribution of parameters based on observed data, rather than solely seeking a single point estimate as in frequentist statistics.

The posterior distribution combines the likelihood (the data) and the prior, being proportional to their product. The prior therefore acts as a regularizer of the likelihood: it pulls the parameters toward values that are plausible a priori, just as the penalty term does in ridge or lasso regression. By tuning the parameter $\lambda$ (equivalently, the prior scale $s_i$), we can control the extent to which the prior shapes the posterior distribution.