# Simple Linear Regression Example

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import t

## Part 1

Synthetic Data

In [2]:
x = np.array([1.1, 0.3, 1, 0.75, 1.8])
y = np.array([4.5, 2, 5, 2.4, 4.8])

(1) Estimate the correlation between $x$ and $y$.

$$Corr(x,y) = \dfrac{Cov(x,y)}{\sqrt{Var(x)Var(y)}}$$

$$Cov(x,y) = E(xy) - E(x) E(y)$$

$$Cov(x,y) = \dfrac{1}{n-1} [ \Sigma_{i-1}^n x_i y_i - n E(x) E(y)$$
where

$$E(X) = \dfrac{1}{n} \Sigma_{i-1}^n x_i $$

In [3]:
n = len(x)

In [4]:
cov_xy = (1/(n-1))*( np.sum(x*y) - n*np.mean(x)*np.mean(y) )
cov_xy

0.619250000000001

In [5]:
corr_xy = cov_xy / np.sqrt(np.var(x, ddof = 1) * np.var(y, ddof = 1))
corr_xy

0.7932494903728418

Approach 2: Numpy's *corrcoef*

In [6]:
r = np.corrcoef(x, y)
r

array([[1.        , 0.79324949],
       [0.79324949, 1.        ]])

Approach 3: Scipy

In [7]:
res = stats.pearsonr(x, y)
res

(0.7932494903728404, 0.10928342976611785)

(2) Find the estimators of $a$ and $b$.

In [8]:
b_hat = (np.sum(x*y) - n*np.mean(x)*np.mean(y))/ (  np.sum(x**2)  - n*np.mean(x)**2)
b_hat

2.0607321131447605

In [9]:
a = np.mean(y) - b_hat*np.mean(x)
a

1.699875207986687

(2a) Find the estimators of $a$ and $b$ using linear algebra.

In [10]:
X = np.stack((np.ones(len(x)),x), axis=1)
X

array([[1.  , 1.1 ],
       [1.  , 0.3 ],
       [1.  , 1.  ],
       [1.  , 0.75],
       [1.  , 1.8 ]])

In [11]:
Beta = np.linalg.inv(  X.T @ X) @ X.T @ y
Beta

array([1.69987521, 2.06073211])

(3) Find the estimators of $\sigma^2$.

In [12]:
y_pred = a+b_hat*x
var_xy = (1/(n-2)) * np.sum((y - y_pred)**2)
var_xy

1.002522185246811

(4) What is the distribution of $\hat b$?

$$ Var(\hat b) = \dfrac{\sigma^2}{\Sigma_{i-1}^n x_i^2 - n\bar x^2}$$

In [13]:
np.sum(x**2)  - n*np.mean(x)**2

1.2020000000000008

$$ Var(\hat b) = \dfrac{\sigma^2}{1.2}$$

(5) Assuming $\sigma$ is known with $\sigma^2 = 0.8$, test the hypothesis $H0 : b = 0$ with significance level $\alpha = 0.05$. Find the p-value.

In [14]:
sigma2 = 0.8
b_null = 0

In [15]:
var_b = sigma2 / 1.2
z = (b_hat - b_null) / np.sqrt(var_b)
z

2.5238710868859973

In [16]:
p = 2 * (1 - stats.norm.cdf(z))

$p < \alpha$ So, reject $H_o$

## Part 2

(1) Test the hypothesis $H0 : b = 0$ with $\sigma$ unknown and find the value of the $t$ statistic.

In [17]:
y_pred = a+b_hat*x
var_xy = (1/(n-2)) * np.sum((y - y_pred)**2)
var_b = var_xy / (np.sum(x**2)  - n*np.mean(x)**2)
var_b

0.8340450792402747

In [18]:
tv = (b_hat - b_null) / np.sqrt(var_b)
tv

2.256455518568256

$2.52 < t_{0.05, 3}$ so do not reject $H_o$