In [1]:
## Import Libraries

import pandas as pd # type: ignore
import statsmodels.api as sm # type: ignore

from statsmodels.tools.tools import add_constant # type: ignore

In [2]:
## Build dataframe from Excel
# Reads "Squid.xlsx" (a path relative to the notebook's working directory) into `df`.
# The bare `df` expression below makes the notebook display the loaded table.

df = pd.read_excel("Squid.xlsx")
df

## Column legend:
## 1. x1: rostral length
## 2. x2: fin length (in inches)
## 3. x3: length from rostral to tail (in inches)
## 4. x4: length from tail to fin (in inches)
## 5. x5: width (in inches)
## 6. x6: weight (in pounds)

Unnamed: 0,x1,x2,x3,x4,x5,x6
0,1.31,1.07,0.44,0.75,0.35,1.95
1,1.55,1.49,0.53,0.9,0.47,2.9
2,0.99,0.84,0.34,0.57,0.32,0.72
3,0.99,0.83,0.34,0.54,0.27,0.81
4,1.01,0.9,0.36,0.64,0.3,1.09
5,1.09,0.93,0.42,0.61,0.31,1.22
6,1.08,0.9,0.4,0.51,0.31,1.02
7,1.27,1.08,0.44,0.77,0.34,1.93
8,0.99,0.85,0.36,0.56,0.29,0.64
9,1.34,1.13,0.45,0.77,0.37,2.08


In [3]:
## Correlation matrix to detect relationships between variables
# Pairwise correlations between every pair of columns in `df` (x1..x6).
# NOTE: in the output recorded below every coefficient is above 0.92, so the
# predictors are strongly correlated with each other as well as with x6;
# with such collinearity the individual p-values of a joint regression
# should be read with care.

df.corr()

Unnamed: 0,x1,x2,x3,x4,x5,x6
x1,1.0,0.971222,0.957112,0.974533,0.953498,0.959261
x2,0.971222,1.0,0.968355,0.972323,0.946946,0.938739
x3,0.957112,0.968355,1.0,0.94159,0.97191,0.956024
x4,0.974533,0.972323,0.94159,1.0,0.922871,0.941132
x5,0.953498,0.946946,0.97191,0.922871,1.0,0.97237
x6,0.959261,0.938739,0.956024,0.941132,0.97237,1.0


In [4]:
## First model: regress x6 on all the remaining columns plus an intercept

y = df.loc[:, "x6"]
X = sm.add_constant(df.drop("x6", axis=1))

# Fit ordinary least squares with statsmodels and print the full report
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

## Jarque-Bera test (its statistic and Prob(JB) are part of the summary)
# H0: the residuals are normally distributed
# H1: the residuals are not normally distributed

# J-B = 1.668
# Prob(JB): p-value of the test

                            OLS Regression Results                            
Dep. Variable:                     x6   R-squared:                       0.963
Model:                            OLS   Adj. R-squared:                  0.952
Method:                 Least Squares   F-statistic:                     83.56
Date:                Fri, 09 Aug 2024   Prob (F-statistic):           6.89e-11
Time:                        22:20:19   Log-Likelihood:                -20.039
No. Observations:                  22   AIC:                             52.08
Df Residuals:                      16   BIC:                             58.62
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -6.4443      0.910     -7.084      0.0

In [5]:
## Second model: keep only the terms of the first model with p-value <= 0.05: const + x5

y = df.loc[:, "x6"]
X = sm.add_constant(df.loc[:, "x5"])

# Fit the simple regression of x6 on x5 and print the report
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

## Fitted equation: y = -5.4843 + 20.8566 x5

                            OLS Regression Results                            
Dep. Variable:                     x6   R-squared:                       0.946
Model:                            OLS   Adj. R-squared:                  0.943
Method:                 Least Squares   F-statistic:                     347.0
Date:                Fri, 09 Aug 2024   Prob (F-statistic):           4.18e-14
Time:                        22:20:22   Log-Likelihood:                -24.334
No. Observations:                  22   AIC:                             52.67
Df Residuals:                      20   BIC:                             54.85
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -5.4843      0.545    -10.068      0.0

In [6]:
## FUNCTIONS FOR AUTOMATIC MODEL SELECTION: FORWARD & BACKWARD

# Function to build the best model through Forward Selection: add the most significant remaining variable one at a time until none of the remaining ones is significant
def fnForwardSelection(data, target, significance_level = 0.05):
    """Select OLS predictors by greedy forward stepwise selection on p-values.

    Starting from an intercept-only design, each round fits one OLS model per
    remaining candidate column (already selected columns + that candidate +
    a constant) and records the candidate's coefficient p-value. The
    candidate with the smallest p-value is added if that p-value is below
    ``significance_level``; otherwise the search stops.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate predictor columns (without the constant term).
    target : array-like
        Response variable passed to ``sm.OLS`` as the dependent variable.
    significance_level : float, default 0.05
        A candidate is only added while its p-value is strictly below this.

    Returns
    -------
    list
        Names of the selected columns, in the order they were added. Empty
        if no candidate is significant on its own.
    """
    selected_features = []
    remaining_features = list(data.columns)
    while remaining_features:
        # Explicit float dtype: a data-less Series built without a dtype is
        # float64 with a DeprecationWarning on pandas 1.x and object dtype on
        # pandas 2.x, which makes the min/idxmin reductions version-dependent.
        candidate_p_values = pd.Series(index=remaining_features, dtype=float)
        for feature in remaining_features:
            design = add_constant(data[selected_features + [feature]])
            model = sm.OLS(target, design).fit()
            candidate_p_values[feature] = model.pvalues[feature]
        min_p_value = candidate_p_values.min()
        # Written as "not <" so that a NaN minimum also ends the search.
        if not min_p_value < significance_level:
            break
        best_feature = candidate_p_values.idxmin()
        selected_features.append(best_feature)
        remaining_features.remove(best_feature)
    return selected_features

def fnBackwardSelection(data, target, significance_level = 0.05):
    """Select OLS predictors by backward elimination on p-values.

    Starts with every column of ``data`` (plus a constant), and repeatedly
    refits the model and drops the predictor with the highest p-value while
    that p-value is above ``significance_level``. The intercept is always
    kept and never considered for removal.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate predictor columns (without the constant term).
    target : array-like
        Response variable passed to ``sm.OLS`` as the dependent variable.
    significance_level : float, default 0.05
        Predictors are removed while the largest p-value exceeds this.

    Returns
    -------
    list
        Names of the columns that remain; may be empty.
    """
    features = list(data.columns)
    while features:
        model = sm.OLS(target, add_constant(data[features])).fit()
        # Look only at the predictors' p-values. model.pvalues also holds the
        # intercept ("const") added by add_constant; if that were the largest,
        # features.remove("const") would raise ValueError because the
        # intercept is never part of `features`.
        feature_p_values = model.pvalues.loc[features]
        max_p_value = feature_p_values.max()  # highest p-value among predictors
        # Written as "not >" so that a NaN maximum also ends the search.
        if not max_p_value > significance_level:
            break
        features.remove(feature_p_values.idxmax())
    return features

In [7]:
# Third model, chosen through the Forward Selection method

df = pd.read_excel("Squid.xlsx")

y = df.loc[:, "x6"]
X = df.drop("x6", axis=1)

# Let forward selection pick the predictors, then refit OLS on them plus an intercept
best_features = fnForwardSelection(data=X, target=y)
X_best = add_constant(X.loc[:, best_features])
model = sm.OLS(endog=y, exog=X_best).fit()

print(model.summary())

## The best features are {const, x4, x5}. The equation is y = -6.3351 + 4.1542 x4 + 15.0160 x5

                            OLS Regression Results                            
Dep. Variable:                     x6   R-squared:                       0.958
Model:                            OLS   Adj. R-squared:                  0.954
Method:                 Least Squares   F-statistic:                     218.9
Date:                Fri, 09 Aug 2024   Prob (F-statistic):           7.58e-14
Time:                        22:20:33   Log-Likelihood:                -21.359
No. Observations:                  22   AIC:                             48.72
Df Residuals:                      19   BIC:                             51.99
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -6.3351      0.601    -10.543      0.0

In [8]:
# Fourth model, chosen through the Backward Selection method

df = pd.read_excel("Squid.xlsx")

y = df.loc[:, "x6"]
X = df.drop("x6", axis=1)

# Let backward elimination pick the predictors, then refit OLS on them plus an intercept
best_features = fnBackwardSelection(data=X, target=y)
X_best = add_constant(X.loc[:, best_features])
model = sm.OLS(endog=y, exog=X_best).fit()

print(model.summary())

## The best features are {const, x4, x5}. The equation is y = -6.3351 + 4.1542 x4 + 15.0160 x5

                            OLS Regression Results                            
Dep. Variable:                     x6   R-squared:                       0.958
Model:                            OLS   Adj. R-squared:                  0.954
Method:                 Least Squares   F-statistic:                     218.9
Date:                Fri, 09 Aug 2024   Prob (F-statistic):           7.58e-14
Time:                        22:23:31   Log-Likelihood:                -21.359
No. Observations:                  22   AIC:                             48.72
Df Residuals:                      19   BIC:                             51.99
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -6.3351      0.601    -10.543      0.0

In [None]:
### IN THIS EXERCISE, BOTH METHODS YIELD THE SAME EQUATION