# Package

In [1]:
from pandas import read_csv
from pandas import DataFrame
from sklearn.datasets import load_diabetes
from numpy import insert
import statsmodels.api as sm
from scipy.stats import t
from sklearn.linear_model import LinearRegression

# Load Data

In [2]:
# Load scikit-learn's bundled diabetes dataset; its .data, .feature_names and
# .target attributes are used below to build the working DataFrame.
diabetes = load_diabetes()

In [3]:
# Assemble features and response into a single frame: one column per feature
# name, plus the response stored under "target".
df = DataFrame(diabetes.data, columns=diabetes.feature_names).assign(
    target=diabetes.target
)

# Covariates with intercept

In [4]:
# Feature matrix: every column of df except the response.
X = df.drop(columns="target")

In [5]:
# Prepend a constant column ("const", all ones) to the features so that a
# model fitted on X_tilde would estimate an intercept.
X_tilde = sm.add_constant(X)

In [6]:
# Display the augmented design matrix (pandas elides the middle rows).
X_tilde

Unnamed: 0,const,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,1.0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,1.0,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,1.0,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,1.0,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,1.0,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...,...
437,1.0,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,1.0,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,1.0,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,1.0,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [7]:
# Preview the first five rows of the augmented design matrix.
X_tilde.head()

Unnamed: 0,const,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,1.0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,1.0,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,1.0,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,1.0,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,1.0,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


# Linear model without intercept

In [19]:
# Response vector. It is defined in this cell because no earlier cell creates
# `Y`, so the fit would otherwise raise a NameError on a fresh kernel.
Y = df["target"]

# Ordinary least squares on X (not X_tilde): there is no constant column, so
# the fitted model has no intercept, as this section's title states.
model = sm.OLS(Y, X).fit()

# Let's test whether each regression coefficient is significant

Consider the coefficient $\theta$ of one independent variable in the linear
regression model. The test below is applied to each coefficient in turn.

**Hypotheses :**
- Null Hypothesis ($H_0$) : $\theta = 0$ (The coefficient is not statistically
significant).
- Alternative Hypothesis ($H_1$) : $\theta \neq 0$ (The coefficient is statistically
significant).

**Test Statistic :** $t = \hat{\theta} / \mathrm{SE}(\hat{\theta})$

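**Critical Value :** $t_{\alpha/2}$ for a two-tailed test, taken from the Student $t$ distribution with $n - p$ degrees of freedom ($n$ observations, $p$ estimated coefficients).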

**Decision Rule :** If $|t| > t_{\alpha/2}$, reject $H_0$ in favor of $H_1$.

**Conclusion :** Based on the test, we conclude whether the coefficient $\theta$ is
statistically significant or not.

In [20]:
# Full regression report: per-coefficient estimate, standard error,
# t statistic, p-value and 95% confidence interval, plus fit diagnostics.
model.summary()

0,1,2,3
Dep. Variable:,target,R-squared (uncentered):,0.106
Model:,OLS,Adj. R-squared (uncentered):,0.085
Method:,Least Squares,F-statistic:,5.1
Date:,"Fri, 13 Jun 2025",Prob (F-statistic):,4.72e-07
Time:,11:15:51,Log-Likelihood:,-2873.9
No. Observations:,442,AIC:,5768.0
Df Residuals:,432,BIC:,5809.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,-10.0099,179.967,-0.056,0.956,-363.729,343.709
sex,-239.8156,184.404,-1.300,0.194,-602.255,122.624
bmi,519.8459,200.401,2.594,0.010,125.964,913.728
bp,324.3846,197.053,1.646,0.100,-62.917,711.687
s1,-792.1756,1255.052,-0.631,0.528,-3258.944,1674.593
s2,476.7390,1021.170,0.467,0.641,-1530.341,2483.819
s3,101.0433,640.151,0.158,0.875,-1157.155,1359.242
s4,177.0632,486.370,0.364,0.716,-778.883,1133.009
s5,751.2737,517.768,1.451,0.148,-266.384,1768.931

0,1,2,3
Omnibus:,1.506,Durbin-Watson:,0.223
Prob(Omnibus):,0.471,Jarque-Bera (JB):,1.404
Skew:,0.017,Prob(JB):,0.496
Kurtosis:,2.726,Cond. No.,21.7


In [32]:
# t statistic of each coefficient, indexed by feature name.
model.tvalues

age   -0.055621
sex   -1.300494
bmi    2.594032
bp     1.646180
s1    -0.631189
s2     0.466856
s3     0.157843
s4     0.364051
s5     1.450985
s6     0.340266
dtype: float64

In [28]:
# Decision rule
# The critical value is computed in this cell because `t_score_95` is
# otherwise only assigned further down the notebook, which makes this cell
# fail on a fresh kernel. It is the two-sided 5% quantile of Student's t with
# the residual degrees of freedom of the fit (observations minus parameters).
alpha = 0.05
t_score_95 = t.ppf(1 - alpha / 2, df=model.df_resid)

for feature_name, t_val in model.tvalues.items():
    # Reject H0 (coefficient equals zero) when |t| exceeds the critical value.
    if abs(t_val) > t_score_95:
        print("Feature with significant coefficient :", feature_name)

Feature with significant coefficient is : bmi


# p_values sorted

In [41]:
# Two-sided p-value of each coefficient, indexed by feature name.
model.pvalues

age    0.955670
sex    0.194125
bmi    0.009808
bp     0.100454
s1     0.528251
s2     0.640839
s3     0.874654
s4     0.715998
s5     0.147510
s6     0.733822
dtype: float64

In [46]:
# Series.items() is a lazy, one-shot iterator; materialise it so `p_values`
# remains usable after sorting instead of being left exhausted.
p_values = list(model.pvalues.items())

# (feature, p-value) pairs ordered from the smallest to the largest p-value.
p_values_sorted = sorted(p_values, key=lambda pair: pair[1])

# Coefficient with the smallest p-value.
p_values_sorted[0]

('bmi', 0.009808118174027873)

# Significant Variables 

In [None]:
# Let's calculate the critical value of the two-sided t test
confidence_interval = 0.95  # confidence level of the test
alpha = 1 - confidence_interval
# The t statistic of an OLS coefficient follows Student's t with the residual
# degrees of freedom (observations minus estimated parameters), not with the
# number of variables.
t_score_95 = t.ppf(1 - (alpha / 2), df=model.df_resid)
t_score_95

In [47]:
# Decision rule _ method 1
# Reject H0 for every coefficient whose p-value is below alpha. The loop
# variable is named `p_value` so it does not overwrite the notebook-level
# `p_values` defined in an earlier cell.
for feature_name, p_value in p_values_sorted:
    if p_value < alpha:
        print("Feature with significant coefficient :", feature_name)

Feature with significant coefficient : bmi


In [48]:
# Decision rule _ method 2
# Flag every coefficient whose |t| exceeds the critical value, keeping the
# original feature order.
exceeds_critical = model.tvalues.abs() > t_score_95
for feature_name in model.tvalues.index[exceeds_critical]:
    print("Feature with significant coefficient :", feature_name)

Feature with significant coefficient : bmi
