In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score, confusion_matrix, roc_auc_score

data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.3, random_state=42)
data.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [2]:
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
rfe = RFE(lr, n_features_to_select=2)
rfe = rfe.fit(X_train, y_train)

X_train = X_train[:, rfe.support_]
X_test = X_test[:, rfe.support_]
X_train.shape

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(398, 2)

In [3]:
lr = LogisticRegression(solver='lbfgs', max_iter=10000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Able to classify model correctly ~86%
print("Accuracy:", accuracy_score(y_test, y_pred))
# Able to predict correctly ~84%
print("Precision:", precision_score(y_test, y_pred))
# Able to correctly identify ~95% all positive instances
print("Recall:", recall_score(y_test, y_pred))
# Performs well between precision and recall ~89%
print("F1:", f1_score(y_test, y_pred))
# Able to identify ~83% correctly, therfore 33% above guessing
print("AUC:", roc_auc_score(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8596491228070176
Precision: 0.8442622950819673
Recall: 0.9537037037037037
F1: 0.8956521739130434
AUC: 0.8260582010582009
Confusion Matrix:
[[ 44  19]
 [  5 103]]


In [4]:
print('True/False Selection of features:', rfe.support_)

True/False Selection of features: [False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False  True  True False False]


In [5]:
feature_indices = np.arange(30)
display(feature_indices)
feature_indices = feature_indices[rfe.support_]
display(feature_indices)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

array([26, 27])

In [12]:
plt.scatter(X_train[:,feature_indices[26]], X_train[:,feature_indices[27]], c = y_train)
plt.xlabel(data.feature_names[feature_indices[26]])
plt.ylabel(data.feature_names[feature_indices[27]])

IndexError: index 26 is out of bounds for axis 0 with size 2

In [9]:
print('The best features are : ', np.array(data.feature_names)[rfe.support_])

The best features are :  ['worst concavity' 'worst concave points']
