In [1]:
#Kaylena Mann
#ADEC7430
#Homework_1

#importing libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import sklearn
from sklearn.model_selection import train_test_split

In [2]:
# STEP 1. Simulate 10,000 observations of four independent random predictors

n = 10000           # number of simulated rows
np.random.seed(39)  # fix the global NumPy seed so every draw below is reproducible

# x1, x2: normal draws with mean 0 and SD 1 (x1 is drawn first, then x2)
x1, x2 = (np.random.normal(0, 1, n) for _ in range(2))
# x3, x4: uniform draws between 0 and 1 (x3 is drawn first, then x4)
x3, x4 = (np.random.uniform(0, 1, n) for _ in range(2))
# element-wise square of x2, kept as its own variable for the quadratic term
x2_sq = np.square(x2)

In [3]:
# STEP 2. Build the outcome "Y" as a linear combination of the predictors plus noise.

# Coefficient values are taken from this study:
# https://digitalcommons.coastal.edu/cgi/viewcontent.cgi?article=1220&context=etd!
# b0: intercept | b1: Covid Cases | b2: Enrollment Intensity
# b3: Campus Setting | b4: Political affiliation
b0, b1, b2, b3, b4 = -1.32, 0.23, -0.01, 0.74, 0.13
sq_term = -0.3  # coefficient on the squared x2 term

# noise: one normal draw per row, mean 0 and SD 0.5
error = np.random.normal(loc=0, scale=0.5, size=n)

# deterministic part first (same term order as the model equation), then the noise
y_signal = b0 + b1*x1 + b2*x2 + b3*x3 + b4*x4 + sq_term*x2_sq
y = y_signal + error

# gather predictors and outcome into a single DataFrame and preview the first rows
data = pd.DataFrame(dict(x1=x1, x2=x2, x3=x3, x4=x4, x2_sq=x2_sq, y=y))
print(data.head())

         x1        x2        x3        x4     x2_sq         y
0  1.404840 -0.412341  0.701333  0.909778  0.170025  1.043773
1  0.221121 -1.026576  0.751686  0.521624  1.053858 -1.611487
2 -0.145327 -1.585903  0.074105  0.851362  2.515089 -1.509581
3  0.123199 -0.358885  0.444786  0.973354  0.128799 -1.516847
4  0.606027 -1.320395  0.856817  0.499827  1.743443 -0.639308


In [4]:
# STEP 3. Hold out 30% of the rows for testing; the remaining 70% are used for training

train, test = train_test_split(data, test_size=0.30, random_state=39)

# Separate each partition into its predictor columns and its outcome column
predictor_cols = ['x1', 'x2', 'x3', 'x4', 'x2_sq']
X_train, y_train = train[predictor_cols], train['y']
X_test, y_test = test[predictor_cols], test['y']

# Prepend the constant column that acts as the intercept term in the regression
X_train_sm, X_test_sm = (sm.add_constant(X) for X in (X_train, X_test))

In [5]:
# STEP 4. Fit OLS on the training split, then compute the MSE on both splits

model = sm.OLS(y_train, X_train_sm)
results = model.fit()

# Predictions from the training-split fit, for the training rows and the held-out rows
yhat_train = results.predict(X_train_sm)
yhat_test = results.predict(X_test_sm)

# MSE = mean of the squared differences between observed and predicted outcome
resid_train = y_train - yhat_train
resid_test = y_test - yhat_test
MSE_train = np.mean(resid_train**2)
MSE_test = np.mean(resid_test**2)

# Report the regression table followed by both error metrics
print(results.summary())
print("Training MSE:", MSE_train)
print("Test MSE:", MSE_test)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.529
Model:                            OLS   Adj. R-squared:                  0.529
Method:                 Least Squares   F-statistic:                     1573.
Date:                Wed, 17 Sep 2025   Prob (F-statistic):               0.00
Time:                        22:27:53   Log-Likelihood:                -5015.8
No. Observations:                7000   AIC:                         1.004e+04
Df Residuals:                    6994   BIC:                         1.008e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.3462      0.016    -82.353      0.0

In [6]:
# STEP 5. Use bootstrapping to create 10 other samples from the data I created

B = 10          # number of bootstrap resamples
coef_rows = []  # one Series of fitted coefficients per resample

# Each pass draws as many rows as the full dataset has, with replacement (seeded per
# pass so the resamples are reproducible), refits the same OLS specification, and
# stores the fitted parameters in a fixed order.
# NOTE(review): the resamples are drawn from the full dataset, whereas the STEP 4 model
# is fit on the training split only - keep that in mind when comparing their spreads.
for b in range(B):
    # size comes from the data itself instead of a hard-coded 10000
    bootstrap = data.sample(n=len(data), replace=True, random_state=33 + b)
    # separate name so the STEP 4 `model` object is not overwritten by a fitted result
    boot_fit = smf.ols('y ~ x1 + x2 + x3 + x4 + x2_sq', data=bootstrap).fit()
    coef_rows.append(boot_fit.params.reindex(['Intercept', 'x1', 'x2', 'x3', 'x4', 'x2_sq']))

In [7]:
# STEP 6. Estimate the linear regression coefficients using OLS for each of the 10 bootstrap samples

# The fits were run inside the bootstrap loop; here the collected parameter Series are
# stacked into one table with a row per resample and a column per coefficient.
bootstrap_sample_coefs = pd.DataFrame(data=coef_rows)

In [8]:
# STEP 7. Mean and standard deviation of every coefficient across the bootstrap fits

coef_means = bootstrap_sample_coefs.mean()
coef_stds = bootstrap_sample_coefs.std(ddof=1)  # sample SD (n - 1 in the denominator)

# One row per coefficient, with a 'mean' column and a 'std' column
bootstrap_summary = coef_means.to_frame('mean').assign(std=coef_stds)
print(bootstrap_summary)
print(bootstrap_sample_coefs)

               mean       std
Intercept -1.327371  0.010017
x1         0.232018  0.002619
x2        -0.004880  0.003324
x3         0.729014  0.011309
x4         0.165079  0.011594
x2_sq     -0.304516  0.004015
   Intercept        x1        x2        x3        x4     x2_sq
0  -1.310553  0.233044 -0.006173  0.713527  0.162566 -0.303298
1  -1.314000  0.232813 -0.004744  0.708460  0.168787 -0.307747
2  -1.339257  0.235744 -0.011322  0.734219  0.176820 -0.303338
3  -1.324174  0.234400 -0.008626  0.723928  0.168417 -0.312357
4  -1.332990  0.228568 -0.003446  0.741371  0.149775 -0.298286
5  -1.319219  0.228544 -0.005636  0.726845  0.149709 -0.299715
6  -1.335273  0.229142 -0.001836  0.740411  0.154047 -0.307150
7  -1.330690  0.231361 -0.000029  0.727077  0.165447 -0.304431
8  -1.329514  0.234802 -0.004699  0.739621  0.169416 -0.303558
9  -1.338043  0.231763 -0.002284  0.734685  0.185805 -0.305282


In [9]:
# STEP 8. What can you say about the coefficients in STEP #4 when looking at STEP #7?
# Both tables are printed again so the conclusion can be read next to the numbers.
# NOTE(review): the bootstrap resamples all 10,000 rows while the STEP 4 fit uses the
# 7,000-row training split, so the two spreads are not measured at the same sample size.
print(bootstrap_summary)
print(results.summary())

# The sentence explaining *why* the estimates are stable was corrected: the simulated
# noise has SD 0.5 and the fit's R-squared is about 0.53, so the noise is not tiny;
# the stability comes from the large sample.
print('''CONCLUSION
The bootstrap results very closely match the estimates from our original OLS validation training model. On average, the bootstrap means were off by about 0.01-0.02. The standard deviations from bootstrapping are small and approximate the standard error values from the original regression, indicating high precision in coefficients and good reliability in our  standard errors. Overall, this makes us more confident in the original results provided from validation and indicates that our model is less susceptible to sampling variability, performing consistently across resamples. This makes sense because the sample is large (10,000 simulated rows), even though the added noise (standard deviation of 0.5) accounts for roughly half of the variance in y (R-squared of about 0.53). Not only did our model generalize well with new test data without evidence of overfitting, but it also demonstrated stability across resampling. ''')

               mean       std
Intercept -1.327371  0.010017
x1         0.232018  0.002619
x2        -0.004880  0.003324
x3         0.729014  0.011309
x4         0.165079  0.011594
x2_sq     -0.304516  0.004015
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.529
Model:                            OLS   Adj. R-squared:                  0.529
Method:                 Least Squares   F-statistic:                     1573.
Date:                Wed, 17 Sep 2025   Prob (F-statistic):               0.00
Time:                        22:28:02   Log-Likelihood:                -5015.8
No. Observations:                7000   AIC:                         1.004e+04
Df Residuals:                    6994   BIC:                         1.008e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
