## 0. Setup

In [1]:
# import libraries we need
import pandas as pd
import numpy as np
import scipy.stats as stats

## 1. Create t_test function

In [4]:
# Create t-test function
def t_test(num_var, bin_var):
    """Run a pooled-variance (Student's) independent-samples t-test.

    Compares the mean of ``num_var`` between the rows where ``bin_var == 1``
    and the rows where ``bin_var == 0``. Rows with any other ``bin_var`` value
    are ignored, and rows whose ``num_var`` is missing are dropped so that the
    sample sizes match the observations actually used for the means and
    variances.

    Parameters
    ----------
    num_var : pandas.Series
        Continuous outcome. Its ``name`` is reported in the result.
    bin_var : pandas.Series
        Grouping variable coded 0/1, used as a boolean mask on ``num_var``.
        Its ``name`` is reported in the result.

    Returns
    -------
    pandas.DataFrame
        A single row with the variable names, total sample size, mean
        difference (group 1 minus group 0), its standard error, the degrees
        of freedom, the t-statistic, the two-sided p-value and the test name.
        The p-value is a string formatted to three decimals with leading
        zeros stripped, so values below 0.0005 appear as ".000".

    Raises
    ------
    ZeroDivisionError
        If either group has no non-missing observations.
    """
    # Drop missing outcomes up front: Series.mean()/var() skip NaN, so counting
    # those rows in the sample sizes would inflate the degrees of freedom.
    group1 = num_var[bin_var == 0].dropna()
    group2 = num_var[bin_var == 1].dropna()
    mean_diff = group2.mean() - group1.mean()

    # Sample sizes and (ddof=1) sample variances of the two groups
    n1 = len(group1)
    n2 = len(group2)
    s2_1 = group1.var()
    s2_2 = group2.var()

    # Degrees of freedom for the equal-variance test
    DF = n1 + n2 - 2

    # Pooled standard deviation: variances weighted by each group's own DF
    sp = np.sqrt(((n1 - 1) * s2_1 + (n2 - 1) * s2_2) / DF)

    # Standard error of the mean difference
    SE_mean_diff = sp * np.sqrt(1 / n1 + 1 / n2)

    t_statistic = mean_diff / SE_mean_diff

    # Two-sided p-value. The survival function keeps precision in the far
    # tail, where 1 - cdf would round to exactly zero.
    p_value = 2 * stats.t.sf(np.abs(t_statistic), DF)

    # Put the results into a DataFrame
    results = pd.DataFrame({
        "Continuous Variable": [num_var.name],
        "Binary Variable": [bin_var.name],
        "Total Sample Size": [n1 + n2],
        "Mean Difference": [round(mean_diff, 2)],
        "SE of Mean Difference": [round(SE_mean_diff, 2)],
        "DF": [DF],
        "t-statistic": [round(t_statistic, 3)],
        "P-value": [("%.3f" % p_value).lstrip('0')],
        "Test": ["Independent samples t-test"],
    })

    return results


## 2. Compare the result

In [5]:
# Load the GED dataset into a DataFrame.
# NOTE(review): `path` is an absolute, machine-specific Windows folder, so this
# cell only runs where that folder exists — consider a configurable data dir.
path = r"C:\Users\nicho\Desktop\1 Modern Data Structures - GR5072\QMSS-GR5072_Spring2024\Week 5\Activity"
csv_file = path + r"\3 ged_data.csv"
data = pd.read_csv(csv_file)
data

Unnamed: 0.1,Unnamed: 0,STU_ID,SCH_ID,F3ERN2011,F3C02,F3EVRGED,F3EVERDO,BYMOTHED,BYS14,BYRACE,...,high_school_grad,ged,BYRACE2,Other,Asian,Black,Hispanic,White,post_sec_edu,income_log
0,2,101104,1011,37000,50,0,0,2,2,7,...,1,0,7,0,0,0,0,1,4,10.518673
1,5,101107,1011,35000,40,0,0,2,1,4,...,1,0,4,0,0,0,1,0,0,10.463103
2,7,101109,1011,68000,40,0,0,2,1,7,...,1,0,7,0,0,0,0,1,4,11.127263
3,10,101112,1011,18000,1,0,0,6,1,3,...,1,0,3,0,0,1,0,0,0,9.798127
4,18,101120,1011,1000,40,1,1,2,1,7,...,0,1,7,0,0,0,0,1,0,6.907755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5971,16179,461202,4612,20000,38,0,1,1,2,5,...,0,0,4,0,0,0,1,0,0,9.903488
5972,16181,461205,4612,100,6,0,1,2,2,5,...,0,0,4,0,0,0,1,0,0,4.605170
5973,16182,461207,4612,29000,40,0,0,3,2,5,...,1,0,4,0,0,0,1,0,2,10.275051
5974,16185,461214,4612,15000,44,0,1,2,1,5,...,0,0,4,0,0,0,1,0,0,9.615805


In [6]:
# Reference result from SciPy: GED holders are passed first so the sign of the
# statistic matches t_test, which reports group 1 minus group 0.
output1 = stats.ttest_ind(
    data.loc[data['ged'] == 1, 'income_log'],
    data.loc[data['ged'] == 0, 'income_log'],
)

# Same comparison through the hand-written t_test function
output2 = t_test(num_var=data["income_log"], bin_var=data["ged"])

print(output1)

Ttest_indResult(statistic=-7.457945705706778, pvalue=1.0040955835124666e-13)


In [7]:
# Display the one-row summary returned by t_test for income_log by ged
output2

Unnamed: 0,Continuous Variable,Binary Variable,Total Sample Size,Mean Difference,SE of Mean Difference,DF,t-statistic,P-value,Test
0,income_log,ged,5976,-0.48,0.06,5974,-7.458,0.0,Independent samples t-test


## 3.  Result Analysis

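The results of the t-test indicate that holding a General Educational Development (GED) credential is associated with lower income. On average, the log income of GED holders is 0.48 lower than that of non-holders, which corresponds to roughly 38% lower income, since exp(-0.48) ≈ 0.62 (SE = 0.06, t(5974) = -7.458, p < .001).  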

## 4. Three Additional Tests

In [8]:
# Three further comparisons, each given as (continuous outcome, binary grouping):
#   Test 1: income by high-school graduation
#   Test 2: income by female
#   Test 3: post-secondary education by female
results1, results2, results3 = (
    t_test(num_var=data[outcome], bin_var=data[grouping])
    for outcome, grouping in (
        ("income_log", "high_school_grad"),
        ("income_log", "female"),
        ("post_sec_edu", "female"),
    )
)

In [9]:
# Display Test 1: income_log by high_school_grad
results1

Unnamed: 0,Continuous Variable,Binary Variable,Total Sample Size,Mean Difference,SE of Mean Difference,DF,t-statistic,P-value,Test
0,income_log,high_school_grad,5976,0.5,0.05,5974,11.131,0.0,Independent samples t-test


In [10]:
# Display Test 2: income_log by female
results2

Unnamed: 0,Continuous Variable,Binary Variable,Total Sample Size,Mean Difference,SE of Mean Difference,DF,t-statistic,P-value,Test
0,income_log,female,5976,-0.21,0.02,5974,-8.685,0.0,Independent samples t-test


In [11]:
# Display Test 3: post_sec_edu by female
results3

Unnamed: 0,Continuous Variable,Binary Variable,Total Sample Size,Mean Difference,SE of Mean Difference,DF,t-statistic,P-value,Test
0,post_sec_edu,female,5976,0.66,0.06,5974,11.628,0.0,Independent samples t-test


## 5.  Linear Regression

### Result Comparison

The linear regression results are identical to the t-test results: both test the same hypothesis and share the same assumptions.

In [12]:
# Simple linear regression of log income on the GED indicator.
# With a 0/1 predictor the slope is the difference in group means.
x = data["ged"]
y = data["income_log"]
slope, intercept, r, p, std_err = stats.linregress(x, y)

# The t-statistic is not returned directly, so derive it as slope / SE
print(f"Slope: {slope!s}")
print(f"SE: {std_err!s}")
print(f"t-statistic: {slope / std_err!s}")
print(f"p-value: {p!s}")

Slope: -0.4763000664486296
SE: 0.06386478063043147
t-statistic: -7.457945705706744
p-value: 1.0040955835127352e-13


#### Question: Why is the linear regression approach from the sklearn library different?

In [13]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

In [14]:
# Fit income_log on ged with statsmodels.
# sm.OLS does not add an intercept by itself, and the add_constant call below is
# disabled, so this model is fit without a constant term. That is why the
# summary reports "R-squared (uncentered)".
X = data["ged"]
#X = sm.add_constant(X)
y = data["income_log"]

lg = sm.OLS(y, X)
est = lg.fit()
print(est.summary())

                                 OLS Regression Results                                
Dep. Variable:             income_log   R-squared (uncentered):                   0.034
Model:                            OLS   Adj. R-squared (uncentered):              0.034
Method:                 Least Squares   F-statistic:                              212.2
Date:                Sun, 11 Feb 2024   Prob (F-statistic):                    2.88e-47
Time:                        10:08:02   Log-Likelihood:                         -22201.
No. Observations:                5976   AIC:                                  4.440e+04
Df Residuals:                    5975   BIC:                                  4.441e+04
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------