# **This example demonstrates how to use AIC or BIC for model selection**
We first compute AIC and BIC manually, then look at a quicker way to estimate them.

Mount Google Drive in Colab.

In [None]:
# Mount Google Drive at /content/gdrive so files stored under MyDrive can be
# read by path from this Colab session.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Load the data

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import scipy
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score, log_loss
import matplotlib.pyplot as plt
scaler = StandardScaler()

# 1. Load dataset
df = pd.read_csv("/content/gdrive/MyDrive/ESI5685/Phoneme Recognition.txt")

# Keep only the two phoneme classes of interest. `.copy()` gives an independent
# frame, so the column assignment below does not write into a slice of the
# original (which triggers pandas' SettingWithCopyWarning).
df = df[df.g.isin(['aa', 'ao'])].copy()

# Encode the label explicitly as integers: 1 = 'ao', 0 = 'aa'.
# (pd.get_dummies returns boolean columns in recent pandas versions, so an
# explicit comparison + astype(int) keeps y as 0/1 regardless of version.)
df['g'] = (df['g'] == 'ao').astype(int)

target = 'g' # which is equivalent to t
features = [f'x.{i+1}' for i in range(256)]  # column names x.1 ... x.256
X, y = df[features].values, df[target].values # X size 1717X256, y is (1717,)

In [None]:
df.head(n=5)  # preview the first five rows of the current `df`

In [None]:
# Reload the raw file and preview its first rows.
# NOTE(review): this rebinds `df` to the unfiltered data, replacing the
# aa/ao-filtered frame built in the loading cell. X and y are unaffected
# because they were already extracted as arrays.
df = pd.read_csv("/content/gdrive/MyDrive/ESI5685/Phoneme Recognition.txt")
df.head()

1. Reduce the features in the model to the first p features by multiplying X with the projection matrix H.
2. Manually estimate the corresponding AIC and BIC
3. Determine which size of features leads to the lowest AIC and BIC, respectively.

In [None]:
# 2. Implement dimensionality reduction
def H_matrix(p, total_features):
    """Generate the projection matrix H that selects the first p features.

    H has shape (p, total_features) with ones on its main diagonal and zeros
    elsewhere, so for a data matrix X of shape (n, total_features) the product
    ``X @ H.T`` has shape (n, p) and equals ``X[:, :p]``.

    Parameters
    ----------
    p : int
        Number of leading features to keep (0 <= p <= total_features).
    total_features : int
        Number of columns in the original data matrix.

    Returns
    -------
    numpy.ndarray
        Float array of shape (p, total_features).

    Raises
    ------
    ValueError
        If p is negative or larger than total_features.
    """
    if not 0 <= p <= total_features:
        raise ValueError(
            f"p must be between 0 and total_features ({total_features}), got {p}"
        )
    # np.eye(N, M) builds an N x M matrix with ones on the main diagonal,
    # which is exactly "H[i, i] = 1 for i in range(p)".
    return np.eye(p, total_features)

total_features = 256 # all features in x1, x2... x256 in original dataset

# 3. Train a logistic regression model for each p and compute AIC and BIC
aic_values = {}  # p -> AIC
bic_values = {}  # p -> BIC
possible_p_values = range(1, 256)  # p = 1 ... 255 (the full 256-feature model is not evaluated)
n_samples = len(y)

for p in possible_p_values:
    # X: n x 256, H: p x 256  ->  X @ H.T: n x p (the first p columns of X)
    X_reduced = X @ H_matrix(p, total_features).T

    # This implementation will take quite a while. solver='lbfgs' reduces the
    # computational resources needed for logistic regression: it is a
    # quasi-Newton method that approximates the Broyden-Fletcher-Goldfarb-Shanno
    # (BFGS) algorithm using a limited amount of memory.
    # max_iter is raised to avoid a solver-convergence warning.
    # NOTE: scikit-learn applies L2 regularisation by default, so the
    # likelihood below is evaluated at the penalised fit, not the exact MLE.
    model = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_reduced, y)

    # Predicted probability of the positive class (label 1) for every sample
    probas = model.predict_proba(X_reduced)[:, 1]

    # Log-likelihood: ln(L) = sum_i [ y_i*log(p_i) + (1 - y_i)*log(1 - p_i) ].
    # sklearn's log_loss is the NEGATIVE of this; normalize=False returns the
    # sum over samples instead of the mean (same as multiplying the mean by n).
    loglikelihood = -log_loss(y, probas, normalize=False)

    # Number of estimated parameters: p coefficients + 1 intercept term.
    k = p + 1

    # AIC = 2k - 2ln(L), where L is the likelihood of the model.
    aic = 2 * k - 2 * loglikelihood
    aic_values[p] = aic
    # BIC = k * ln(n) - 2 * ln(L)
    bic = k * np.log(n_samples) - 2 * loglikelihood
    bic_values[p] = bic

# 4. Select optimal p using AIC
# aic_values and bic_values are dictionaries with p as keys. Iterating over a
# dict yields its keys, and key=<dict>.get tells min to compare those keys by
# their stored criterion value, so min returns the p with the smallest AIC/BIC.
best_p_aic = min(aic_values, key=aic_values.get)
# Select optimal p using BIC
best_p_bic = min(bic_values, key=bic_values.get)

fig, ax = plt.subplots(2, 1, figsize=(10, 12))

# One panel per criterion: (label, {p: value}, best p). Both panels are drawn
# by the same loop so the AIC and BIC plots cannot drift apart.
panels = [
    ('AIC', aic_values, best_p_aic),
    ('BIC', bic_values, best_p_bic),
]

for axis, (criterion, values, best_p) in zip(ax, panels):
    p_grid = list(values.keys())
    # Criterion value as a function of the number of features p
    axis.plot(p_grid, list(values.values()), marker='o', linestyle='-')
    # Highlight the minimiser in red, drawn on top of the line (zorder)
    axis.scatter(best_p, values[best_p], color='red', s=100, zorder=5)
    axis.set_title(f'{criterion} values vs. p')
    axis.set_xlabel('p')
    axis.set_ylabel(criterion)
    axis.grid(True)
    axis.set_xticks(p_grid)
    # Annotate each point with its (integer-truncated) criterion value
    for p_val, crit_val in values.items():
        axis.text(p_val, crit_val + 0.5, "%d" % crit_val, ha='center')

plt.tight_layout()
plt.show()

 # Inside the format string (f-string), expressions inside {}
# are evaluated at runtime and then formatted using the format string syntax.
print(f"The best p value according to AIC is: { } with an AIC of { }")
print(f"The best p value according to BIC is: { } with a BIC of { }")


# **Statsmodels library provides a quick way to estimate AIC and BIC**




In [None]:
import numpy as np
import statsmodels.api as sm

# Creating synthetic data
np.random.seed(42)
N = 100 # Number of samples
X = np.random.randn(N, 5) # 5 features X0, X1, X2...X4
# Ground truth: Only first 3 features X0, X1, X2 are related to y
y = X[:, 0] + 2 * X[:, 1] - X[:, 2] + np.random.normal(0, 0.5, N)

# Now we want to test if X0, X1... X4 should be included in the model.

# Add a column of ones to X so the regression includes an intercept,
# like np.column_stack((ones, X)). The constant is prepended, so the result is
# 100 x 6 with the intercept in column 0. It is built once, outside the loop,
# under a new name so X itself is not overwritten.
X_with_const = sm.add_constant(X)

# Comparing models with different subsets of features
for i in range(1, 6):
    # Fit a linear regression model using Ordinary Least Squares (OLS) on the
    # intercept plus the first i features (columns 0..i of X_with_const).
    # For a binary response, sm.Logit would be used in the same way.
    model = sm.OLS(y, X_with_const[:, :i + 1]).fit()

    # The fitted results object exposes the information criteria directly.
    print(f"Model with first {i} features: AIC = {model.aic:.2f}, BIC = {model.bic:.2f}")

# Question: What conclusion can you draw based on the AIC and BIC results?