In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm

# Loading Data

In [2]:
ciga = pd.read_csv("./data/2-supervisor.csv")
ciga.head()

Unnamed: 0,Y,X1,X2,X3,X4,X5,X6
0,43,51,30,39,61,92,45
1,63,64,51,54,63,73,47
2,71,70,68,69,76,86,48
3,61,63,45,47,54,84,35
4,81,78,56,66,71,83,47


# Problem (a)

## Fitting FM

In [3]:
target = ciga[['Y']]
full_x_data = ciga[['X1', 'X2', 'X3', 'X4', 'X5', 'X6']]

In [4]:
full_x_data = sm.add_constant(full_x_data, has_constant="add")
full_model = sm.OLS(target, full_x_data)
fitted_full_model = full_model.fit()
fitted_full_model.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.733
Model:,OLS,Adj. R-squared:,0.663
Method:,Least Squares,F-statistic:,10.5
Date:,"Thu, 29 Apr 2021",Prob (F-statistic):,1.24e-05
Time:,08:57:20,Log-Likelihood:,-97.25
No. Observations:,30,AIC:,208.5
Df Residuals:,23,BIC:,218.3
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,10.7871,11.589,0.931,0.362,-13.187,34.761
X1,0.6132,0.161,3.809,0.001,0.280,0.946
X2,-0.0731,0.136,-0.538,0.596,-0.354,0.208
X3,0.3203,0.169,1.901,0.070,-0.028,0.669
X4,0.0817,0.221,0.369,0.715,-0.376,0.540
X5,0.0384,0.147,0.261,0.796,-0.266,0.342
X6,-0.2171,0.178,-1.218,0.236,-0.586,0.152

0,1,2,3
Omnibus:,2.386,Durbin-Watson:,1.795
Prob(Omnibus):,0.303,Jarque-Bera (JB):,1.255
Skew:,-0.081,Prob(JB):,0.534
Kurtosis:,2.011,Cond. No.,1340.0


## Calculating SSE(FM)

In [5]:
fitted = fitted_full_model.predict(full_x_data)
fm_sse = 0
target_val = ciga['Y']
for i in range(ciga.shape[0]):
    fm_sse += (fitted[i] - target_val[i])**2
fm_sse

1149.0003248267374

# Problem (b)

## Fitting RM (exclude X2, X5)

In [6]:
reduced_x_data1 = ciga[['X1', 'X3', 'X4', 'X6']]
reduced_x_data1 = sm.add_constant(reduced_x_data1, has_constant="add")
reduced_model1 = sm.OLS(target, reduced_x_data1)
fitted_reduced_model1 = reduced_model1.fit()
fitted_reduced_model1.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.729
Model:,OLS,Adj. R-squared:,0.685
Method:,Least Squares,F-statistic:,16.77
Date:,"Thu, 29 Apr 2021",Prob (F-statistic):,8.44e-07
Time:,08:57:20,Log-Likelihood:,-97.477
No. Observations:,30,AIC:,205.0
Df Residuals:,25,BIC:,212.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,11.9917,8.241,1.455,0.158,-4.981,28.964
X1,0.5811,0.144,4.027,0.000,0.284,0.878
X3,0.2999,0.158,1.895,0.070,-0.026,0.626
X4,0.1062,0.205,0.518,0.609,-0.316,0.528
X6,-0.2289,0.168,-1.365,0.185,-0.574,0.117

0,1,2,3
Omnibus:,2.402,Durbin-Watson:,1.878
Prob(Omnibus):,0.301,Jarque-Bera (JB):,1.285
Skew:,-0.12,Prob(JB):,0.526
Kurtosis:,2.015,Cond. No.,781.0


## Calculating SSE(RM1)

In [7]:
fitted = fitted_reduced_model1.predict(reduced_x_data1)
rm_sse1 = 0
target_val = ciga['Y']
for i in range(ciga.shape[0]):
    rm_sse1 += (fitted[i] - target_val[i])**2
rm_sse1

1166.5651045239122

## Calculating F value

In [8]:
F_stats = ((rm_sse1-fm_sse)/(6-4))/(fm_sse/(ciga.shape[0]-7))
F_stats

0.1758006174175537

In [9]:
from scipy.stats import f
f_dist = f(2, 27)
1-f_dist.cdf(F_stats)

0.8397376163900425

# Problem (d)

## Fitting RM(exclude X3)

In [10]:
reduced_x_data2 = ciga[['X1', 'X2', 'X4', 'X5', 'X6']]
reduced_x_data2 = sm.add_constant(reduced_x_data2, has_constant="add")
reduced_model2 = sm.OLS(target, reduced_x_data2)
fitted_reduced_model2 = reduced_model2.fit()
fitted_reduced_model2.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.691
Model:,OLS,Adj. R-squared:,0.626
Method:,Least Squares,F-statistic:,10.71
Date:,"Thu, 29 Apr 2021",Prob (F-statistic):,1.69e-05
Time:,08:57:20,Log-Likelihood:,-99.439
No. Observations:,30,AIC:,210.9
Df Residuals:,24,BIC:,219.3
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,14.2546,12.052,1.183,0.248,-10.619,39.128
X1,0.7055,0.162,4.365,0.000,0.372,1.039
X2,-0.0378,0.142,-0.267,0.792,-0.330,0.254
X4,0.1685,0.228,0.738,0.468,-0.303,0.639
X5,-0.0163,0.152,-0.107,0.915,-0.330,0.297
X6,-0.0994,0.176,-0.565,0.577,-0.463,0.264

0,1,2,3
Omnibus:,5.583,Durbin-Watson:,2.098
Prob(Omnibus):,0.061,Jarque-Bera (JB):,2.285
Skew:,-0.337,Prob(JB):,0.319
Kurtosis:,1.828,Cond. No.,1230.0


# Problem (e)

## Fitting RM(include X1, X3, X6)

In [11]:
reduced_x_data3 = ciga[['X1', 'X3', 'X6']]
reduced_x_data3 = sm.add_constant(reduced_x_data3, has_constant="add")
reduced_model3 = sm.OLS(target, reduced_x_data3)
fitted_reduced_model3 = reduced_model3.fit()
fitted_reduced_model3.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.726
Model:,OLS,Adj. R-squared:,0.694
Method:,Least Squares,F-statistic:,22.92
Date:,"Thu, 29 Apr 2021",Prob (F-statistic):,1.81e-07
Time:,08:57:20,Log-Likelihood:,-97.638
No. Observations:,30,AIC:,203.3
Df Residuals:,26,BIC:,208.9
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,13.5777,7.544,1.800,0.084,-1.929,29.084
X1,0.6227,0.118,5.271,0.000,0.380,0.866
X3,0.3124,0.154,2.026,0.053,-0.005,0.629
X6,-0.1870,0.145,-1.291,0.208,-0.485,0.111

0,1,2,3
Omnibus:,2.856,Durbin-Watson:,1.938
Prob(Omnibus):,0.24,Jarque-Bera (JB):,1.394
Skew:,-0.121,Prob(JB):,0.498
Kurtosis:,1.972,Cond. No.,605.0


# Problem (f)

## Fitting RM (include X3)

In [12]:
reduced_x_data4 = ciga[['X3']]
reduced_x_data4 = sm.add_constant(reduced_x_data4, has_constant="add")
reduced_model4 = sm.OLS(target, reduced_x_data4)
fitted_reduced_model4 = reduced_model4.fit()
fitted_reduced_model4.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.389
Model:,OLS,Adj. R-squared:,0.367
Method:,Least Squares,F-statistic:,17.82
Date:,"Thu, 29 Apr 2021",Prob (F-statistic):,0.000231
Time:,08:57:21,Log-Likelihood:,-109.65
No. Observations:,30,AIC:,223.3
Df Residuals:,28,BIC:,226.1
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,28.1741,8.815,3.196,0.003,10.118,46.230
X3,0.6468,0.153,4.222,0.000,0.333,0.961

0,1,2,3
Omnibus:,2.077,Durbin-Watson:,1.541
Prob(Omnibus):,0.354,Jarque-Bera (JB):,1.269
Skew:,-0.5,Prob(JB):,0.53
Kurtosis:,3.131,Cond. No.,287.0
