In [1]:
# Common imports
import csv
import os
import sys
# Scikit-Learn ≥0.20
import sklearn
# numpy matplotlib
import numpy as np
import matplotlib.pyplot as plt 

In [4]:
with open('./breast_cancer_data/breast_cancer.data') as csv_file:
    bc_data_norm = np.loadtxt(csv_file, delimiter=",")

print("bc_data_norm shape, dtype = ",
      bc_data_norm.shape,
      bc_data_norm.dtype
)

bc_data_norm shape, dtype =  (569, 31) float64


In [5]:
# extract the first 30 columns as features X
X = bc_data_norm[:, :-1]
print("X.shape, X.dtype = ", X.shape, X.dtype)
print("X[0] = ", X[0])

# extract the last column as label y
y = bc_data_norm[:, -1:].astype(np.int)
# a 1d array was expected. Please change the shape of y to (n_samples, )
y = y.ravel()
print("y.shape, y[:5], y.dtype = ", y.shape, y[:5], y.dtype)
print("=== X and y extraction done ===")

X.shape, X.dtype =  (569, 30) float64
X[0] =  [1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
 1.471e-01 2.419e-01 7.870e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
 6.400e-03 4.900e-02 5.370e-02 1.590e-02 3.000e-02 6.200e-03 2.538e+01
 1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
 4.601e-01 1.189e-01]
y.shape, y[:5], y.dtype =  (569,) [0 0 0 0 0] int64
=== X and y extraction done ===


In [6]:
# Class Distribution: 212 - Malignant, 357 - Benign
np.testing.assert_equal(np.count_nonzero(y==1), 357)
# benign class is 1
np.testing.assert_equal(np.count_nonzero(y==0), 212)
# malignant class is 0

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt

# split the train and test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
print("X_train.shape, X_test.shape = ", X_train.shape, X_test.shape)
print("y_train.shape, y_test.shape = ", y_train.shape, y_test.shape)

X_train.shape, X_test.shape =  (455, 30) (114, 30)
y_train.shape, y_test.shape =  (455,) (114,)


In [11]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(X, y)

print("x_test's first 3 predicted probabilities = \n", clf.predict_proba(X_test)[:3])

# get actual predicted class
y_pred = clf.predict(X_test)

# show the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("cm =\n", cm)
target_names = ['Benign', 'Malignant']
print(classification_report(y_test, y_pred, target_names=target_names))

def regular_accuracy(y_true, y_pred):
    regular_accuracy = np.sum(y_true == y_pred) / len(y_true)
    return regular_accuracy

print("LogReg regular accuracy:", regular_accuracy(y_test, y_pred))

x_test's first 3 predicted probabilities = 
 [[1.92681816e-01 8.07318184e-01]
 [1.00000000e+00 7.17433091e-12]
 [9.99704446e-01 2.95554106e-04]]
cm =
 [[41  2]
 [ 1 70]]
              precision    recall  f1-score   support

      Benign       0.98      0.95      0.96        43
   Malignant       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

LogReg regular accuracy: 0.9736842105263158


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
