In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm


from sklearn.datasets import make_regression

# Generate a synthetic regression data set (1000 samples, 5 features) and
# wrap it in a DataFrame with named feature columns plus a "Target" column.
X_poly, y_poly = make_regression(
    n_samples=1000,
    n_features=5,
    noise=10,
    random_state=42,
)
df_poly = pd.DataFrame(
    X_poly,
    columns=[f"Feature_{i}" for i in range(1, 6)],
).assign(Target=y_poly)


# Expand the raw features into polynomial terms of the chosen degree.
# include_bias=False: the intercept column is added later via sm.add_constant.
degree = 2
poly = PolynomialFeatures(degree=degree, include_bias=False)
X_poly_transformed = poly.fit_transform(df_poly.drop(columns="Target"))
X_poly_df = pd.DataFrame(
    X_poly_transformed,
    columns=poly.get_feature_names_out(),
)

X = X_poly_df
y = df_poly["Target"]

# Hold out 20% of the rows for testing; only the training split is fitted below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train_const = sm.add_constant(X_train)


# Baseline: OLS on every polynomial term, before any feature selection.
model_all = sm.OLS(endog=y_train, exog=X_train_const).fit()
print(model_all.summary())


def backward_elimination(X, y, significance_level=0.05):
    """Select features by backward elimination on OLS p-values.

    Starts from the model containing every column of ``X`` plus an
    intercept, then repeatedly drops the feature with the largest p-value
    and refits, until every remaining feature has a p-value at or below
    ``significance_level``.

    The intercept column named ``"const"`` (as added by
    ``sm.add_constant``) is never a candidate for removal, so the final
    model always keeps its intercept.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictor columns (without an intercept column).
    y : array-like
        Response values aligned with the rows of ``X``.
    significance_level : float, default 0.05
        A feature is dropped while its p-value is strictly greater than
        this threshold.

    Returns
    -------
    model
        The fitted OLS results for the final design matrix.
    columns : pandas.Index
        Column labels of the final design matrix (including ``"const"``
        when it is present).
    """
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()
    while True:
        # Only real features compete for removal; the intercept is kept.
        candidate_p_values = model.pvalues.drop("const", errors="ignore")

        # Stop when nothing is left to remove, or when the worst remaining
        # feature is already significant (a NaN maximum also stops here).
        if candidate_p_values.empty:
            break
        if not candidate_p_values.max() > significance_level:
            break

        feature_to_remove = candidate_p_values.idxmax()
        X = X.drop(columns=[feature_to_remove])
        model = sm.OLS(y, X).fit()
    return model, X.columns

# Run backward elimination on the training split and print the final fit.
model_backward, selected_features_backward = backward_elimination(
    X=X_train,
    y=y_train,
)
print(model_backward.summary())


def forward_selection(X, y, significance_level=0.05):
    """Select features by forward selection on OLS p-values.

    Starting from an intercept-only design, each round fits one OLS model
    per remaining feature (current selection + that feature + intercept)
    and picks the feature whose own p-value is smallest. The feature is
    added if that p-value is strictly below ``significance_level``;
    otherwise the search stops.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictor columns (without an intercept column).
    y : array-like
        Response values aligned with the rows of ``X``.
    significance_level : float, default 0.05
        Entry threshold for a candidate feature's p-value.

    Returns
    -------
    best_model
        Fitted OLS results for the selected features plus intercept, or
        ``None`` if no feature was ever added.
    best_features : list or None
        Selected feature names in the order they were added, or ``None``
        if no feature was ever added.
    """
    initial_features = []
    remaining_features = list(X.columns)
    best_model = None
    best_features = None

    while remaining_features:
        round_feature = None
        round_p_value = None
        round_model = None

        for feature in remaining_features:
            X_temp = sm.add_constant(X[initial_features + [feature]])
            candidate_model = sm.OLS(y, X_temp).fit()
            candidate_p_value = candidate_model.pvalues[feature]
            # Running minimum; the first candidate wins ties, matching
            # min(..., key=...) over an insertion-ordered dict.
            if round_model is None or candidate_p_value < round_p_value:
                round_feature = feature
                round_p_value = candidate_p_value
                round_model = candidate_model

        # A NaN p-value compares False here, so the search stops on it.
        if not round_p_value < significance_level:
            break

        initial_features.append(round_feature)
        remaining_features.remove(round_feature)
        # The winning candidate fit already is the model for the enlarged
        # selection (same columns, same order), so no refit is needed.
        best_model = round_model
        best_features = initial_features[:]

    return best_model, best_features

# Run forward selection on the training split and print the final fit.
model_forward, selected_features_forward = forward_selection(X_train, y_train)
# forward_selection returns None for the model when no feature passes the
# significance threshold; calling .summary() on it would raise.
if model_forward is None:
    print("Forward selection did not select any feature.")
else:
    print(model_forward.summary())


def bidirectional_selection(X, y, significance_level=0.05):
    """Select features by stepwise (forward + backward) search on OLS p-values.

    Each round has two steps:

    1. Forward: fit one OLS model per remaining feature (current selection
       + that feature + intercept) and add the feature with the smallest
       p-value if it is strictly below ``significance_level``.
    2. Backward: refit on the current selection and move every selected
       feature whose p-value is strictly above ``significance_level`` back
       to the remaining pool. The intercept ``"const"`` is never removed.

    The search stops when no candidates remain, when a round neither adds
    nor removes a feature, or when a previously visited feature set comes
    up again. Without the last two conditions the loop would repeat the
    same state forever.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictor columns (without an intercept column).
    y : array-like
        Response values aligned with the rows of ``X``.
    significance_level : float, default 0.05
        Threshold used for both entry and removal.

    Returns
    -------
    best_model
        Fitted OLS results for the final selection plus intercept, or
        ``None`` if ``X`` has no columns.
    selected_features : list
        Names of the selected features.
    """
    selected_features = []
    remaining_features = list(X.columns)
    best_model = None
    # Feature sets already visited; revisiting one means the search cycles.
    seen_states = {frozenset()}

    while remaining_features:
        changed = False

        # Forward step: try to add the most significant remaining feature.
        p_values = {}
        for feature in remaining_features:
            X_temp = sm.add_constant(X[selected_features + [feature]])
            p_values[feature] = sm.OLS(y, X_temp).fit().pvalues[feature]

        best_feature = min(p_values, key=p_values.get)
        if p_values[best_feature] < significance_level:
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)
            changed = True

        # Backward step: drop selected features that are no longer significant.
        best_model = sm.OLS(y, sm.add_constant(X[selected_features])).fit()
        feature_p_values = best_model.pvalues.drop("const", errors="ignore")
        insignificant = [
            feature
            for feature, p_value in feature_p_values.items()
            if p_value > significance_level
        ]
        if insignificant:
            for feature in insignificant:
                selected_features.remove(feature)
                remaining_features.append(feature)
            # Refit only when the selection actually shrank.
            best_model = sm.OLS(y, sm.add_constant(X[selected_features])).fit()
            changed = True

        state = frozenset(selected_features)
        if not changed or state in seen_states:
            break
        seen_states.add(state)

    return best_model, selected_features

# Run bidirectional (stepwise) selection on the training split and print the final fit.
model_bidirectional, selected_features_bidirectional = bidirectional_selection(
    X=X_train,
    y=y_train,
)
print(model_bidirectional.summary())


                            OLS Regression Results                            
Dep. Variable:                 Target   R-squared:                       0.976
Model:                            OLS   Adj. R-squared:                  0.975
Method:                 Least Squares   F-statistic:                     1579.
Date:                Mon, 10 Mar 2025   Prob (F-statistic):               0.00
Time:                        23:33:50   Log-Likelihood:                -2995.6
No. Observations:                 800   AIC:                             6033.
Df Residuals:                     779   BIC:                             6132.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  -0.9013    