### Import Libraries

In [33]:
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.linear_model import LinearRegression

### Read Data

# Load the homework dataset (path is relative to the notebook's working directory);
# later cells read its 'y', 'm' and 'x' columns.
df = pd.read_csv('HW3_data.csv')
# Preview the first rows as a quick sanity check of the load.
df.head()

In [7]:
# Pull the y, m and x columns out of the frame as NumPy arrays in one pass.
yi, mi, xi = (df[name].to_numpy() for name in ('y', 'm', 'x'))

1) What is the null hypothesis of the Sobel test?

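The null hypothesis of the Sobel test is that there is no mediation effect, i.e. *m* does not mediate the effect of *x* on *y*: $H_0: \beta_{22}\beta_{32} = 0$. This holds when $\beta_{22} = 0$ or $\beta_{32} = 0$, and it is equivalent to $\beta_{12} = \beta_{33}$ (the total effect of *x* equals its direct effect). The test statistic is: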

\begin{align}
Z = \frac{\hat\beta_{12}-\hat\beta_{33}}{\sqrt{\hat\beta_{22}^2 \, Var(\hat\beta_{32}) + 
            \hat\beta_{32}^2 \, Var(\hat\beta_{22})}}
\end{align}

2) Fit the three regression models and find the estimators.

In [30]:
def get_linear_params(x, y):
    """
    Fit a simple linear regression of y on x from sample moments.

    The slope is the sample covariance of (x, y) divided by the sample
    variance of x; the intercept makes the line pass through the point
    of means.

    Returns
    -------
    (slope, intercept) -- note the order: slope first.
    """
    mean_x, mean_y = np.mean(x), np.mean(y)

    # E[xy] - E[x]E[y] and E[x^2] - E[x]^2
    covariance_xy = np.mean(x * y) - mean_x * mean_y
    variance_x = np.mean(x**2) - mean_x**2

    slope = covariance_xy / variance_x
    intercept = mean_y - slope * mean_x

    return slope, intercept

In [85]:
def find_estimators(y, *args):
    """
    Returns the ordinary least-squares coefficient estimates.

    Each positional argument after ``y`` is one column of the design
    matrix X (pass a column of ones to include an intercept). The
    estimates solve the normal equations (X'X) b = X'y and are returned
    in the same order as the columns were passed.

    Raises
    ------
    numpy.linalg.LinAlgError
        If X'X is singular (e.g. duplicated or collinear columns).
    """
    X = np.stack(args, axis=1)
    # Solve the normal equations directly instead of forming (X'X)^-1:
    # same estimates, but cheaper and numerically more accurate.
    return np.linalg.solve(X.T @ X, X.T @ y)

def h_hat_matrix(*args):
    """
    Returns the hat (projection) matrix H = X (X'X)^-1 X'.

    Each positional argument is one column of the design matrix X (pass
    a column of ones to include an intercept), so any number of
    regressors is supported. For n observations the result is n x n.

    Raises
    ------
    numpy.linalg.LinAlgError
        If X'X is singular (e.g. duplicated or collinear columns).
    """
    X = np.stack(args, axis=1)
    # (X'X)^-1 X' is obtained by solving (X'X) A = X' rather than by
    # inverting X'X explicitly, which is numerically more accurate.
    return X @ np.linalg.solve(X.T @ X, X.T)

def get_residual_SS(y, *args):
    """
    Residual sum of squares of y regressed on the given design columns.

    Computed as the quadratic form y' (I - H) y, where H is the hat
    matrix built from ``args`` (one design-matrix column per argument).
    """
    hat = h_hat_matrix(*args)
    # I - H maps y onto its residuals
    residual_maker = np.identity(hat.shape[0]) - hat
    return (y.T @ residual_maker) @ y

\begin{align}
\hat\sigma^2 =  \frac{1}{n-p} \hat e' \hat e
\end{align}

---------

\begin{align}
\text{Model #1} \hspace{5mm}
y_i = \beta_{11} + \beta_{12}x_i+\epsilon_i
\end{align}

In [78]:
# Option 1: closed-form simple-regression formulas.
# get_linear_params returns (slope, intercept), hence B12 is unpacked before B11.
B12, B11 = get_linear_params(x=xi, y=yi)
print(B11, B12)

# Option 2: normal equations, with a column of ones supplying the intercept.
find_estimators(yi, np.ones(len(yi)), xi)

2.9772904369906783 1.7841228898300225


array([2.97729044, 1.78412289])

\begin{align}
y_i = 2.9772 + 1.7841x_i+\epsilon_i
\end{align}

---------

\begin{align}
\text{Model #2} \hspace{5mm}
m_i = \beta_{21} + \beta_{22}x_i+\epsilon_i
\end{align}

In [32]:
# Closed-form fit of m on x; the helper returns (slope, intercept).
B22, B21 = get_linear_params(x=xi, y=mi)
print(B21, B22)

0.9740536557783886 0.9104199975762058


\begin{align}
m_i = 0.9740 + 0.9104x_i+\epsilon_i
\end{align}

---------

\begin{align}
\text{Model #3} \hspace{5mm}
y_i = \beta_{31} + \beta_{32}m_i + \beta_{33}x_i + \epsilon_i
\end{align}

In [79]:
# Option 1: normal equations with design columns (1, m, x),
# so the estimates come back ordered as (B31, B32, B33).
find_estimators(yi, np.ones(len(yi)), mi, xi)

array([1.1786478 , 1.84655397, 0.10298323])

In [80]:
# Option 2: cross-check the Model 3 estimates with scikit-learn.

# Predictor columns are ordered (m, x), so coef_ is (B32, B33).
mlr = LinearRegression().fit(np.column_stack((mi, xi)), yi)
print("B32 and B33 respectively = ", mlr.coef_)
print("B31 = ", mlr.intercept_)

B32 and B33 respectively =  [1.84655397 0.10298323]
B31 =  1.1786477956297094


\begin{align}
y_i = 1.1786 + 1.8465 m_i + 0.1029 x_i + \epsilon_i
\end{align}

# 

3) Find the variance of B_22 and B_32.

\begin{align}
Cov \hat \beta = \sigma^2 ( X'X)^{-1}
\end{align}

\begin{align}
\hat\sigma^2 =  \frac{1}{n-p} \hat e' \hat e
\end{align}

In [104]:
# Model 2 (m ~ 1 + x): estimated coefficient covariance, sigma_hat^2 * (X'X)^-1.
X_mod_2 = X = np.stack((np.ones(yi.shape), xi), axis=1)
# n observations and p estimated coefficients are read from the design matrix
# instead of being hard-coded, so the estimate stays correct if the data changes.
n_mod_2, p_mod_2 = X_mod_2.shape
mod_2_res = get_residual_SS(mi, np.ones(yi.shape), xi)
# Unbiased error-variance estimate: RSS / (n - p)
variance_mod_2 = (1 / (n_mod_2 - p_mod_2)) * mod_2_res
cov_mod_2 = variance_mod_2 * np.linalg.inv(np.matmul(X_mod_2.T, X_mod_2))
# Columns are (1, x), so entry [1, 1] is the variance of the slope on x.
var_B_22 = cov_mod_2[1, 1]
print('Variance of B22: ', var_B_22)

Variance of B22:  0.005728784301207686


In [105]:
# Model 3 (y ~ 1 + m + x): estimated coefficient covariance, sigma_hat^2 * (X'X)^-1.
X_mod_3 = X = np.stack((np.ones(yi.shape), mi, xi), axis=1)
# n observations and p estimated coefficients are read from the design matrix
# instead of being hard-coded, so the estimate stays correct if the data changes.
n_mod_3, p_mod_3 = X_mod_3.shape
mod_3_res = get_residual_SS(yi, np.ones(yi.shape), mi, xi)
# Unbiased error-variance estimate: RSS / (n - p)
variance_mod_3 = (1 / (n_mod_3 - p_mod_3)) * mod_3_res
cov_mod_3 = variance_mod_3 * np.linalg.inv(np.matmul(X_mod_3.T, X_mod_3))
# Columns are (1, m, x), so entry [1, 1] is the variance of the coefficient on m.
var_B_32 = cov_mod_3[1, 1]
print('Variance of B32: ', var_B_32)

Variance of B32:  0.004953382792054966


# 

4) Find the z test statistic.

In [109]:
# Re-estimate Model 3 (columns: 1, m, x) so the statistic uses the full-precision
# coefficients instead of rounded values pasted from an earlier cell's output.
B31, B32, B33 = find_estimators(yi, np.ones(yi.shape), mi, xi)

# Sobel statistic: (total effect - direct effect) over the standard error
# of the indirect effect B22 * B32.
Z = (B12 - B33) / np.sqrt((B22**2) * var_B_32 + (B32**2) * var_B_22)
Z

10.934146977829494

# 

5) Find the p-value and the conclusion of the test.

In [108]:
import scipy.stats

In [111]:
# Two-sided p-value: twice the upper-tail standard-normal probability beyond |Z|.
2 * scipy.stats.norm.sf(abs(Z))

7.914680445902254e-28

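Since the p-value ($\approx 7.9 \times 10^{-28}$) is far below any conventional significance level, we reject the null hypothesis of no mediation effect: there is very strong evidence that *m* mediates the effect of *x* on *y*.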