In [2]:
import itertools
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Sample data
np.random.seed(5)
X = pd.DataFrame(2+2*np.random.randn(100, 5), columns=['X1', 'X2', 'X3', 'X4', 'X5'])
y = 1.5*X['X1'] + X['X3'] + 2 * X['X5'] + np.random.randn(100)

In [5]:
# Define the function to calculate Mallows' Cp
def calculate_mallows_cp(X, y, model, residual_sum_of_squares, sigma_squared):
    n = len(y)
    p = X.shape[1]  # Number of predictors in the model
    cp = residual_sum_of_squares / sigma_squared   - (n - 2 * p) # sigma_squared - vay(y)
    return cp

# Best subset selection using Mallows' Cp
def best_subset_selection_mallows_cp(X, y):
    n_features = X.shape[1]
    model = LinearRegression()
    model.fit(X, y)
    n = len(y)
    predictions = model.predict(X)
    sigma_squared = np.sum((y - predictions) ** 2)/(n-n_features-1)
    best_cp = float('inf')
    best_subset = None
    all_combinations = []

    # Iterate over all possible feature subsets
    for k in range(1, n_features + 1):
        for subset in itertools.combinations(range(n_features), k):
            X_subset = X.iloc[:, list(subset)]
            model = LinearRegression()
            model.fit(X_subset, y)
            predictions = model.predict(X_subset)
            residual_sum_of_squares = np.sum((y - predictions) ** 2)
            cp = calculate_mallows_cp(X_subset, y, model, residual_sum_of_squares, sigma_squared)
            all_combinations.append((subset, cp))

            # Update best subset if the current Cp is better
            if cp < best_cp:
                print(cp-best_cp)
                best_cp = cp
                best_subset = subset

    return best_subset, best_cp, all_combinations

In [6]:
# Find the best subset using Mallows' Cp
best_subset, best_cp, all_combinations = best_subset_selection_mallows_cp(X, y)
# Output the best subset
print(f'Best subset: {best_subset}')
print(f'Best Mallows\' Cp: {best_cp}')

-inf
-621.5337367979432
-724.4866065373296
-410.03788025545
Best subset: (0, 2, 4)
Best Mallows' Cp: 1.9466260283789723
