Best Subset: (('X2', 'X3', 'X4'), 1.2506454789833872)


In [4]:
from sklearn.datasets import load_iris
import pandas as pd

# Fetch the bundled Iris dataset; the returned object is indexed by key below.
iris = load_iris()

# Build one frame holding the feature columns plus a trailing 'target' column.
data = pd.DataFrame(iris['data'], columns=iris['feature_names']).assign(
    target=iris['target']
)

# Separate the response from the predictors used by the selection routines.
y = data['target']
X = data.drop(columns='target')


In [5]:
import itertools
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def best_subset_selection(X, y):
    """Exhaustively score every non-empty feature subset and return the best one.

    Each combination of columns of ``X`` is evaluated by fitting
    ``LogisticRegression(max_iter=200)`` on the training part of a 70/30
    ``train_test_split`` (``random_state=42``) and measuring accuracy on the
    held-out part.

    Parameters
    ----------
    X : DataFrame
        Candidate feature columns.
    y : array-like
        Target labels, passed alongside ``X`` to ``train_test_split``.

    Returns
    -------
    tuple
        ``(combo, accuracy)`` where ``combo`` is a tuple of column names.
        When several subsets share the top accuracy, the first one generated
        wins; subsets are generated in order of increasing size.

    Raises
    ------
    ValueError
        If ``X`` has no columns, because there is nothing to take the max of.
    """
    def holdout_accuracy(columns):
        # Identical split parameters for every subset keep the scores comparable.
        train_X, test_X, train_y, test_y = train_test_split(
            X[list(columns)], y, test_size=0.3, random_state=42
        )
        classifier = LogisticRegression(max_iter=200)
        classifier.fit(train_X, train_y)
        return accuracy_score(test_y, classifier.predict(test_X))

    feature_names = X.columns
    subset_sizes = range(1, len(feature_names) + 1)
    scored = [
        (subset, holdout_accuracy(subset))
        for size in subset_sizes
        for subset in itertools.combinations(feature_names, size)
    ]
    return max(scored, key=lambda entry: entry[1])

# Run the exhaustive subset search on the predictors and report (subset, accuracy).
best_model = best_subset_selection(X=X, y=y)
print("Best Subset:", best_model)


Best Subset: (('petal length (cm)',), 1.0)


In [6]:
from sklearn.metrics import r2_score

def forward_selection(X, y):
    """Greedy forward feature selection scored by hold-out R^2.

    Starting from an empty set, each round tries adding every remaining
    column, fits a ``LinearRegression`` on the training part of a 70/30
    ``train_test_split`` (``random_state=42``) and scores it with
    ``r2_score`` on the held-out part. The best candidate is kept only if it
    strictly improves on the current score (which starts at 0.0); otherwise
    the search stops.

    NOTE(review): ``LinearRegression`` is not imported in this cell; it is
    presumably imported earlier in the notebook -- confirm.

    Parameters
    ----------
    X : DataFrame
        Candidate feature columns.
    y : array-like
        Response values, passed alongside ``X`` to ``train_test_split``.

    Returns
    -------
    tuple
        ``(selected, score)``: the list of chosen column names in the order
        they were added, and the hold-out R^2 of the last accepted model
        (0.0 if nothing was accepted).
    """
    chosen = []
    pool = list(X.columns)
    best_so_far = 0.0
    while pool:
        trials = []
        for name in pool:
            features = X[chosen + [name]]
            train_X, test_X, train_y, test_y = train_test_split(
                features, y, test_size=0.3, random_state=42
            )
            regressor = LinearRegression()
            regressor.fit(train_X, train_y)
            trials.append((r2_score(test_y, regressor.predict(test_X)), name))
        # Highest score first; equal scores fall back to comparing the names.
        top_score, top_name = sorted(trials, reverse=True)[0]
        if not best_so_far < top_score:
            break
        pool.remove(top_name)
        chosen.append(top_name)
        best_so_far = top_score
    return chosen, best_so_far

# Greedy forward search; `score` is the hold-out R^2 of the last accepted model.
selected_features, score = forward_selection(X=X, y=y)
print("Selected features by forward selection:", selected_features)


Selected features by forward selection: ['petal width (cm)', 'sepal width (cm)']


In [7]:
import statsmodels.api as sm

def backward_elimination(X, y, threshold=0.05):
    """Select features by p-value-based backward elimination with OLS.

    Starting from all columns of ``X`` plus an intercept, repeatedly fit
    ``sm.OLS`` and drop the feature with the largest p-value, until every
    remaining feature has a p-value at or below ``threshold`` or no feature
    is left.

    The intercept column added by ``sm.add_constant`` is never a candidate
    for removal and is not part of the returned names. Previously it was
    treated like any other feature, so it could be eliminated (turning the
    later fits into no-intercept regressions) or be reported as "selected".

    Parameters
    ----------
    X : DataFrame
        Candidate feature columns.
    y : array-like
        Response passed to ``sm.OLS`` as the endogenous variable.
    threshold : float, default 0.05
        A feature is removed while its p-value is strictly greater than this.

    Returns
    -------
    pandas.Index
        Names of the retained feature columns (intercept excluded).
    """
    intercept = 'const'  # name sm.add_constant gives the column it adds to a DataFrame
    design = sm.add_constant(X)
    while True:
        # errors='ignore': add_constant may skip adding a column when X
        # already contains a constant one, so 'const' can be absent.
        candidates = design.columns.drop(intercept, errors='ignore')
        if len(candidates) == 0:
            break
        # Only feature p-values compete for removal; the intercept stays in the fit.
        p_values = sm.OLS(y, design).fit().pvalues[candidates]
        worst = p_values.idxmax()
        if p_values[worst] > threshold:
            design = design.drop(worst, axis=1)
        else:
            break
    return design.columns.drop(intercept, errors='ignore')

# Prune predictors by OLS p-value and show the column names that remain.
selected_features = backward_elimination(X=X, y=y)
print("Selected features by backward elimination:", selected_features)


Selected features by backward elimination: Index(['sepal length (cm)', 'petal length (cm)', 'petal width (cm)'], dtype='object')
