# Feature selection

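The variance inflation factor (VIF) is used to check for multicollinearity among the features. A VIF of 5 or higher indicates that a feature is highly correlated with the other features. The feature with the highest VIF is removed and the VIFs are recomputed, repeating until every remaining feature has a VIF below 5.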

In [None]:
import statsmodels.api as sm
from sklearn.datasets import load_breast_cancer
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor


# Load the breast cancer dataset that ships with scikit-learn
breast_cancer = load_breast_cancer()
# Feature matrix: one row per sample, one column per feature
X = breast_cancer.data
# Plain Python list of column names, so entries can be popped in step with
# the columns removed from X further down
feature_names = breast_cancer.feature_names.tolist()

# Compute and return VIF values
def compute_vif(X):
    """Return the variance inflation factor of every column of ``X``.

    Parameters
    ----------
    X : 2-D array
        Feature matrix; ``X.shape[1]`` is taken as the number of features.

    Returns
    -------
    list
        One VIF per column, in column order.

    NOTE(review): ``X`` is handed to statsmodels exactly as received; no
    intercept column is added here. VIF is normally computed on a design
    matrix that includes a constant, so confirm this is intended.
    """
    n_features = X.shape[1]
    scores = []
    for col in range(n_features):
        scores.append(variance_inflation_factor(X, col))
    return scores

# Backward elimination: repeatedly drop the single feature with the largest
# VIF and recompute, stopping once every remaining feature scores below 5.
vifs = compute_vif(X)
max_vif = max(vifs)

while not (max_vif < 5):
    # Position of the worst offender in the current (already pruned) matrix
    max_index = vifs.index(max_vif)

    # Drop its name from the list and report what is being removed
    feature_to_remove = feature_names.pop(max_index)
    print(f"Removing: {feature_to_remove} with VIF = {max_vif}")

    # Drop the matching column so X and feature_names stay aligned
    X = np.delete(X, max_index, axis=1)

    # VIFs depend on which other columns are present, so recompute them all
    vifs = compute_vif(X)
    max_vif = max(vifs)

# Whatever survived the pruning is the selected feature set
print("\nBest Features:")
print(feature_names)


Removing: mean radius with VIF = 63306.17203588469
Removing: worst radius with VIF = 7573.943486033555
Removing: mean perimeter with VIF = 3901.901687119607
Removing: worst perimeter with VIF = 668.3854404127386
Removing: mean fractal dimension with VIF = 508.08682464149285
Removing: worst smoothness with VIF = 368.0533791867144
Removing: worst texture with VIF = 309.54444960438434
Removing: worst fractal dimension with VIF = 184.67972071700538
Removing: worst symmetry with VIF = 167.30971478504884
Removing: mean concavity with VIF = 142.29904340088856
Removing: radius error with VIF = 104.99215955661566
Removing: worst concave points with VIF = 100.94649021325061
Removing: mean smoothness with VIF = 86.99658368431041
Removing: mean compactness with VIF = 74.72314541276282
Removing: mean area with VIF = 67.47169344522449
Removing: worst compactness with VIF = 49.02308700997905
Removing: perimeter error with VIF = 43.72833047786977
Removing: mean symmetry with VIF = 36.0757931560618
Rem

# Selected Features

Four features were chosen based on VIF. After repeatedly removing the feature with the highest VIF, the following were the only features left once every remaining VIF value was below 5:

- texture error (index 11)
- area error (index 13)
- concavity error (index 16)
- worst concavity (index 26)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Column indices of the features kept by the VIF analysis:
# texture error (11), area error (13), concavity error (16), worst concavity (26)
features = (11, 13, 16, 26)

# Load data
breast_cancer = load_breast_cancer()

# Keep only the VIF-selected feature columns
X = breast_cancer.data[:, list(features)]
y = breast_cancer.target

avgScore = 0
runs = 20

for i in range(runs):
    # Split data into 75:25. Seeding each split with the run index gives 20
    # different splits while keeping the reported average reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, random_state=i
    )

    # Scale data: fit the scaler on the training split only, then apply the
    # same mean/std to the test split. Re-fitting on the test split would leak
    # test statistics and scale the two splits inconsistently.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train the logistic regression model
    # NOTE(review): max_iter=15 is a small iteration budget; check whether the
    # solver emits a ConvergenceWarning and raise it if so.
    model = LogisticRegression(max_iter=15)
    model.fit(X_train, y_train)

    # Evaluate the model on the held-out test data
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    avgScore += score

# Average accuracy across all runs
print("Accuracy:", avgScore/runs)

Accuracy: 0.9395104895104895
