# Loading Data & Library

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm

In [3]:
superv = pd.read_csv("./data/2-supervisor.csv")
superv

Unnamed: 0,Y,X1,X2,X3,X4,X5,X6
0,43,51,30,39,61,92,45
1,63,64,51,54,63,73,47
2,71,70,68,69,76,86,48
3,61,63,45,47,54,84,35
4,81,78,56,66,71,83,47
5,43,55,49,44,54,49,34
6,58,67,42,56,66,68,35
7,71,75,50,55,70,66,41
8,72,82,72,67,71,83,31
9,67,61,45,47,62,80,41


# Problem (a)

## Fitting FM

In [4]:
target = superv[['Y']]
full_data1 = superv[['X1', 'X3']]

In [5]:
full_data1 = sm.add_constant(full_data1, has_constant="add")
full_model1 = sm.OLS(target, full_data1)
fitted_full_model1 = full_model1.fit()
fitted_full_model1.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.708
Model:,OLS,Adj. R-squared:,0.686
Method:,Least Squares,F-statistic:,32.74
Date:,"Thu, 29 Apr 2021",Prob (F-statistic):,6.06e-08
Time:,10:26:26,Log-Likelihood:,-98.569
No. Observations:,30,AIC:,203.1
Df Residuals:,27,BIC:,207.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9.8709,7.061,1.398,0.174,-4.618,24.359
X1,0.6435,0.118,5.432,0.000,0.400,0.887
X3,0.2112,0.134,1.571,0.128,-0.065,0.487

0,1,2,3
Omnibus:,6.448,Durbin-Watson:,1.958
Prob(Omnibus):,0.04,Jarque-Bera (JB):,1.959
Skew:,-0.041,Prob(JB):,0.375
Kurtosis:,1.751,Cond. No.,503.0


### ANOVA Table for FM 

In [7]:
concat_fulldata1 = pd.concat([target, full_data1], axis=1)
formula_model1 = sm.OLS.from_formula('Y ~ X1 + X3', concat_fulldata1)
fitted_formula_model1 = formula_model1.fit()
anova_lm(fitted_formula_model1)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
X1,1.0,2927.584253,2927.584253,63.001506,1.564999e-08
X3,1.0,114.733443,114.733443,2.46906,0.1277539
Residual,27.0,1254.648971,46.46848,,


### Calculating SSE(FM)

In [8]:
fitted = fitted_full_model1.predict(full_data1)
fm_sse = 0
target_val = superv['Y']
for i in range(superv.shape[0]):
    fm_sse += (fitted[i] - target_val[i])**2
fm_sse

1254.648970592878

## Fitting RM

### Calculating constant coefficient of RM

In [7]:
target_mean = float(np.mean(target))
x1_mean = float(np.mean(superv[['X1']]))
x3_mean = float(np.mean(superv[['X3']]))
const_coef = target_mean - 0.5*x1_mean - 0.5*x3_mean
const_coef

3.1500000000000092

### Calculating SSE(RM)

In [8]:
rm_sse = 0
for i, row in superv.iterrows():
    rm_sse += (row['Y'] - const_coef - 0.5*row['X1'] - 0.5*row['X3'])**2
rm_sse

1469.575

## Calculating F value

In [9]:
from scipy.stats import f

In [10]:
F_stats = ((rm_sse-fm_sse)/2)/(fm_sse/(superv.shape[0]-3))
F_stats

2.312600149526332

### get p-value

In [11]:
f_dist = f(2, 27)
1-f_dist.cdf(F_stats)

0.11829366059475344

# Problem (b)

## Fitting FM

In [12]:
target = superv[['Y']]
full_data2 = superv[['X1', 'X2', 'X3']]

In [13]:
full_data2 = sm.add_constant(full_data2, has_constant="add")
full_model2 = sm.OLS(target, full_data2)
fitted_full_model2 = full_model2.fit()
fitted_full_model2.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.715
Model:,OLS,Adj. R-squared:,0.682
Method:,Least Squares,F-statistic:,21.74
Date:,"Thu, 29 Apr 2021",Prob (F-statistic):,2.94e-07
Time:,10:29:35,Log-Likelihood:,-98.206
No. Observations:,30,AIC:,204.4
Df Residuals:,26,BIC:,210.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,11.2583,7.318,1.538,0.136,-3.785,26.301
X1,0.6824,0.129,5.296,0.000,0.418,0.947
X2,-0.1033,0.129,-0.799,0.432,-0.369,0.163
X3,0.2380,0.139,1.707,0.100,-0.049,0.525

0,1,2,3
Omnibus:,4.366,Durbin-Watson:,1.793
Prob(Omnibus):,0.113,Jarque-Bera (JB):,1.641
Skew:,-0.018,Prob(JB):,0.44
Kurtosis:,1.855,Cond. No.,606.0


### ANOVA Table for FM

In [14]:
concat_fulldata2 = pd.concat([target, x_data2], axis=1)
model22 = sm.OLS.from_formula('Y ~ X1 + X2 + X3', concat_fulldata2)
fitted_model22 = model22.fit()
anova_lm(fitted_model22)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
X1,1.0,2927.584253,2927.584253,62.155947,2.324167e-08
X2,1.0,7.518562,7.518562,0.159628,0.6927628
X3,1.0,137.247472,137.247472,2.91392,0.09973547
Residual,26.0,1224.616381,47.10063,,


### Calculating SSE(FM)

In [15]:
fitted = fitted_model2.predict(x_data2)
fm_sse2 = 0
target_val = superv['Y']
for i in range(superv.shape[0]):
    fm_sse2 += (fitted[i] - target_val[i])**2
fm_sse2

1224.6163806588245

## Fitting RM

### new_target = target - 0.5X1 - 0.5X3

In [16]:
new_target = pd.DataFrame(columns=['Y'])
for idx in range(superv.shape[0]):
    new_target = new_target.append(pd.DataFrame([target.loc[idx] - superv.loc[idx, 'X1']*0.5 - superv.loc[idx, 'X3']*0.5], 
                                                columns=['Y']), ignore_index=True)
new_target.head()

Unnamed: 0,Y
0,-2.0
1,4.0
2,1.5
3,6.0
4,9.0


### Fitting new target onto X2

In [17]:
x_data3 = superv[['X2']]
x_data3 = sm.add_constant(x_data3, has_constant="add")
model3 = sm.OLS(new_target, x_data3)
fitted_model3 = model3.fit()
fitted_model3.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.04
Model:,OLS,Adj. R-squared:,0.006
Method:,Least Squares,F-statistic:,1.169
Date:,"Thu, 29 Apr 2021",Prob (F-statistic):,0.289
Time:,08:57:14,Log-Likelihood:,-100.33
No. Observations:,30,AIC:,204.7
Df Residuals:,28,BIC:,207.5
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9.3378,5.869,1.591,0.123,-2.684,21.359
X2,-0.1165,0.108,-1.081,0.289,-0.337,0.104

0,1,2,3
Omnibus:,1.366,Durbin-Watson:,1.491
Prob(Omnibus):,0.505,Jarque-Bera (JB):,1.111
Skew:,0.255,Prob(JB):,0.574
Kurtosis:,2.207,Cond. No.,247.0


### Calculating SSE(RM)

In [18]:
data33 = pd.concat([new_target, x_data3], axis=1)
model33 = sm.OLS.from_formula('Y ~ X2', data33)
fitted_model33 = model33.fit()
anova_lm(fitted_model33)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
X2,1.0,58.881337,58.881337,1.1687,0.288892
Residual,28.0,1410.693663,50.381917,,


In [19]:
fitted = fitted_model3.predict(x_data3)
rm_sse2 = 0
target_val = new_target['Y']
for i in range(superv.shape[0]):
    rm_sse2 += (fitted[i] - target_val[i])**2
rm_sse2

1410.6936634317126

## Calculating F value

In [20]:
F_stats2 = ((rm_sse2-fm_sse2)/2)/(fm_sse2/(superv.shape[0]-3))
F_stats2

2.051289985262609

### get p-value

In [21]:
f_dist = f(2, 27)
1-f_dist.cdf(F_stats2)

0.14813552061172064