# Feature selection

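We use the variance inflation factor (VIF) to check for multicollinearity among the features. A VIF of 5 or higher indicates that a feature is highly correlated with the other features. The feature with the highest VIF is removed, the VIFs are recomputed, and this repeats until every remaining feature has a VIF below 5.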

In [2]:
import statsmodels.api as sm
from sklearn.datasets import load_breast_cancer
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor


# Load the breast cancer dataset that ships with scikit-learn.
breast_cancer = load_breast_cancer()

# Feature matrix plus a plain-list copy of the column names. The list is
# shortened in step with X whenever a column is dropped further down.
X, feature_names = breast_cancer.data, breast_cancer.feature_names.tolist()

# Compute and return the VIF of every column of X
def compute_vif(X):
    """Return a list with the variance inflation factor of each column of X.

    Entry ``i`` of the result is ``variance_inflation_factor(X, i)``, so the
    list is ordered like the columns of ``X``.

    NOTE(review): X is handed to statsmodels exactly as given -- no intercept
    column is appended here. Confirm that is what this analysis intends.
    """
    column_vifs = []
    for column in range(X.shape[1]):
        column_vifs.append(variance_inflation_factor(X, column))
    return column_vifs

# Backward elimination: keep dropping the column with the largest VIF until
# the largest remaining VIF is below 5.
current_vifs = compute_vif(X)
worst_vif = max(current_vifs)

while not (worst_vif < 5):
    # Position of the worst offender (first occurrence if there is a tie)
    worst_position = current_vifs.index(worst_vif)
    print(f"Removing: {feature_names[worst_position]} with VIF = {worst_vif}")

    # Drop the column and its name together so positions stay aligned
    X = np.delete(X, worst_position, axis=1)
    del feature_names[worst_position]

    # VIFs depend on the remaining columns, so recompute after every removal
    current_vifs = compute_vif(X)
    worst_vif = max(current_vifs)

# Whatever is left passed the VIF threshold
print("\nBest Features:")
print(feature_names)


Removing: mean radius with VIF = 63306.17203588469
Removing: worst radius with VIF = 7573.943486033555
Removing: mean perimeter with VIF = 3901.901687119607
Removing: worst perimeter with VIF = 668.3854404127386
Removing: mean fractal dimension with VIF = 508.08682464149285
Removing: worst smoothness with VIF = 368.0533791867144
Removing: worst texture with VIF = 309.54444960438434
Removing: worst fractal dimension with VIF = 184.67972071700538
Removing: worst symmetry with VIF = 167.30971478504884
Removing: mean concavity with VIF = 142.29904340088856
Removing: radius error with VIF = 104.99215955661566
Removing: worst concave points with VIF = 100.94649021325061
Removing: mean smoothness with VIF = 86.99658368431041
Removing: mean compactness with VIF = 74.72314541276282
Removing: mean area with VIF = 67.47169344522449
Removing: worst compactness with VIF = 49.02308700997905
Removing: perimeter error with VIF = 43.72833047786977
Removing: mean symmetry with VIF = 36.0757931560618
Rem

# Selected Features

Four features were chosen based on VIF. After repeatedly removing the feature with the highest VIF, the following were the only features left with a VIF below 5:

- texture error (index 11)
- area error (index 13)
- concavity error (index 16)
- worst concavity (index 26)

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Indices of the features kept by the VIF selection above:
# texture error (11), area error (13), concavity error (16), worst concavity (26)
features = (11, 13, 16, 26)

# Load data
breast_cancer = load_breast_cancer()

# Keep only the VIF-selected columns
X = breast_cancer.data[:, list(features)]
y = breast_cancer.target

# Split data into 75:25. The split is seeded (same seed as the random forest
# cells) so the reported accuracy is reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=123)

# Scale data. The scaler is fitted on the training split only and those same
# statistics are applied to the test split; refitting on the test split would
# leak test information and put the two splits on different scales.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the logistic regression model
# NOTE(review): max_iter=15 is well below scikit-learn's default of 100;
# confirm the solver converges without a ConvergenceWarning.
model = LogisticRegression(max_iter=15)
model.fit(X_train, y_train)

# Evaluate the model on the held-out test data
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)

print("Accuracy:", score)

Accuracy: 0.9790209790209791


## Random Forest

In [27]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# referencing https://www.geeksforgeeks.org/feature-selection-using-random-forest/

# Baseline: random forest on all features with default hyperparameters

# Loading breast cancer data
breast_cancer = load_breast_cancer()

X = breast_cancer.data
y = breast_cancer.target

# Split data into 75:25
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=123)

# Scale data. Fit the scaler on the training split only and reuse it for the
# test split; refitting on the test split would standardise the two splits
# with different means/variances, so the forest's learned thresholds would no
# longer line up with the test values.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Training random forest model
model1 = RandomForestClassifier(random_state=123)
model1.fit(X_train, y_train)

print("Accuracy (before feature selection): ", model1.score(X_test, y_test))





Accuracy (before feature selection):  0.993006993006993


In [127]:
# Rank features by importance, keep the top n, then grid-search hyperparameters


# Loading breast cancer data
breast_cancer = load_breast_cancer()

X = breast_cancer.data
y = breast_cancer.target

# Split data into 75:25
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=123)

# Default forest fitted on the training split only; used purely to rank features
temp_model = RandomForestClassifier(random_state=123)
temp_model.fit(X_train, y_train)

# Retrieve importance values (Gini Importance)
importance_values = temp_model.feature_importances_

# Column indices ordered from most to least important
indicesOfSortedValues = np.argsort(importance_values)[::-1]

# Number of top-ranked features to keep
n = 13

# Select top n features
selectedFeaturesIndicies = indicesOfSortedValues[:n]


feature_names = breast_cancer.feature_names
top_feature_names = feature_names[selectedFeaturesIndicies]
# The label is built from n so it cannot drift from the number actually selected
print(f"Top {n} features:", top_feature_names)


# Train and test matrices restricted to the selected features
new_X_train = X_train[:, selectedFeaturesIndicies]
new_X_test = X_test[:, selectedFeaturesIndicies]

# Hyperparameter values to try (every combination is evaluated)
param_grid = {
    'max_features': [1, 2, 4, 6, 8, 10, 'sqrt', 'log2'],
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

model2 = RandomForestClassifier(random_state=123)

# grid search
grid_search = GridSearchCV(estimator=model2,
                           param_grid=param_grid,
                           cv=5,          # 5-fold cross-validation
                           n_jobs=-1,
                           verbose=1,
                           scoring='accuracy')

# Cross-validate every parameter combination on the training split only
result = grid_search.fit(new_X_train, y_train)

# Best parameters and score
print("Best Params:", result.best_params_)

# Mean cross-validated accuracy of the best parameter combination
print("Best CV Accuracy:", result.best_score_)

# Evaluate the refitted best model on the held-out test set
best_model = result.best_estimator_
print("Test Accuracy:", best_model.score(new_X_test, y_test))


Top 13 features: ['worst perimeter' 'worst radius' 'worst concave points' 'worst area'
 'mean concave points' 'mean concavity' 'mean area' 'area error'
 'mean perimeter' 'mean radius' 'worst concavity' 'worst texture'
 'mean texture']
Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Best Params: {'max_depth': 10, 'max_features': 6, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV Accuracy: 0.9694938440492477
Test Accuracy: 0.993006993006993


In [109]:
from math import sqrt

def confusion_matrix(y_test, y_pred):
  """Count binary confusion-matrix cells, treating label 0 as the negative class.

  Walks the positions of ``y_test`` and compares each true label with the
  prediction at the same position. Any true label other than 0 is counted
  as positive.

  Returns:
    A ``(tp, fp, fn, tn)`` tuple of counts.
  """
  counts = {"tp": 0, "fp": 0, "fn": 0, "tn": 0}

  for idx in range(len(y_test)):
    actual = y_test[idx]
    is_correct = actual == y_pred[idx]

    if actual == 0:
      # True negative sample: either correctly rejected or a false alarm
      cell = "tn" if is_correct else "fp"
    else:
      # True positive sample: either correctly detected or missed
      cell = "tp" if is_correct else "fn"

    counts[cell] += 1

  return counts["tp"], counts["fp"], counts["fn"], counts["tn"]


def MCC(tp, fp, fn, tn):
  """Return the Matthews correlation coefficient for binary confusion counts.

  Computes ``(tp*tn - fp*fn) / sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn))``.

  Args:
    tp: number of true positives.
    fp: number of false positives.
    fn: number of false negatives.
    tn: number of true negatives.

  Returns:
    The coefficient as a float. When any of the four marginal sums is zero
    (for example every prediction falls in one class) the denominator is
    zero and the coefficient is undefined; 0.0 is returned in that case
    instead of raising ZeroDivisionError.
  """
  numerator = (tp * tn) - (fp * fn)
  denom = sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
  if denom == 0:
    # Degenerate confusion matrix: report "no correlation" rather than crash
    return 0.0
  return numerator / denom

def _safe_ratio(numerator, denominator):
  """Return numerator / denominator, or 0.0 when the denominator is zero."""
  if denominator == 0:
    return 0.0
  return numerator / denominator


def calculate_metrics(y_test, y_pred):
  """Print binary classification metrics and return the accuracy.

  Uses ``confusion_matrix`` and ``MCC`` from this cell, and ``accuracy_score``,
  which must already have been imported from ``sklearn.metrics`` by an
  earlier cell.

  Args:
    y_test: true labels; label 0 is the negative class.
    y_pred: predicted labels, aligned position-by-position with ``y_test``.

  Returns:
    The accuracy as returned by ``accuracy_score(y_test, y_pred)``.

  A rate whose denominator is zero (e.g. precision when nothing was
  predicted positive) is reported as 0.0 instead of raising
  ZeroDivisionError.
  """
  tp, fp, fn, tn = confusion_matrix(y_test, y_pred)
  mcc = MCC(tp, fp, fn, tn)
  ppv = _safe_ratio(tp, tp + fp)  # precision
  tpr = _safe_ratio(tp, tp + fn)  # sensitivity / recall
  tnr = _safe_ratio(tn, tn + fp)  # specificity
  fpr = _safe_ratio(fp, fp + tn)
  fnr = _safe_ratio(fn, fn + tp)

  # Compute once; the same value is printed and returned
  accuracy = accuracy_score(y_test, y_pred)

  print("Accuracy:           ", accuracy)
  print("True positive rate: ", tpr)
  print("True negative rate: ", tnr)
  print("False positive rate:", fpr)
  print("False negative rate:", fnr)
  print("Precision:          ", ppv)
  print("MCC:                ", mcc)

  return accuracy

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Column indices of the 13 features selected in the grid-search cell above
selectedFeaturesIndicies = [22, 20, 27, 23, 7, 6, 3, 13, 2, 0, 26, 21, 1]

# Load the dataset and keep only the selected columns
breast_cancer = load_breast_cancer()
y = breast_cancer.target
X = breast_cancer.data[:, selectedFeaturesIndicies]

# Split data into 75:25 (seeded)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=123)

# Standardise both splits with statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Final model, configured with the best parameters reported by the grid search
best_params = {
    "max_features": 6,
    "n_estimators": 200,
    "max_depth": 10,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
}
model3 = RandomForestClassifier(random_state=123, **best_params)
model3.fit(X_train, y_train)

# Predict on the held-out split
y_pred = model3.predict(X_test)

# Print the metric report; the returned accuracy is the cell's displayed value
calculate_metrics(y_test, y_pred)



indicies:  [22 20 27 23  7  6  3 13  2  0 26 21  1]
Accuracy:            0.993006993006993
True positive rate:  1.0
True negative rate:  0.9814814814814815
False positive rate: 0.018518518518518517
False negative rate: 0.0
Precision:           0.9888888888888889
MCC:                 0.9851782233115197


0.993006993006993