In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import zero_one_loss

# Load the raw table and print its first rows as a quick sanity check.
df = pd.read_csv('./breast-cancer-wisconsin.csv')
print(df.head())

# Discard every row in which any cell is the literal '?', then cast all
# remaining columns to int.
as_text = df.astype(str)
has_placeholder = (as_text == '?').any(axis=1)
df = df.loc[~has_placeholder].astype(int)

# features and class
# The first column (sample code number) and the last column are excluded from
# the features; the label is read by name.
X = df.iloc[:, 1:-1]
y = df['Class']
# split training set and test set
# Split BEFORE normalizing so the scaling statistics never see the held-out
# rows; computing them on the full table leaks test information into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
# feature normalization
# Standardize with the mean/std of the training rows only. A zero std
# (constant column) is replaced by 1 so the division cannot produce NaN/inf.
train_mean = X_train.mean(axis=0)
train_std = X_train.std(axis=0).replace(0, 1)
X_train = (X_train - train_mean) / train_std
X_test = (X_test - train_mean) / train_std
# Keep X standardized for any later use, on the same training-derived scale.
X = (X - train_mean) / train_std

# Logistic regression prediction
# L2-penalised model fitted on the training partition with the lbfgs solver.
# NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
# releases — confirm against the installed version before upgrading.
regr = LogisticRegression(
    penalty='l2',
    C=1000,
    solver='lbfgs',
    multi_class='multinomial',
    random_state=0,
).fit(X_train, y_train)

# Predicted labels for the training and test partitions, in that order.
y_train_predict, y_test_predict = (
    regr.predict(part) for part in (X_train, X_test)
)

# evaluate error
# Boolean masks marking where the prediction differs from the true label.
train_err = (y_train != y_train_predict)
test_err = (y_test != y_test_predict)

# Number of misclassified rows in each partition.
train_errs = train_err.sum()
test_errs = test_err.sum()

# Positions (within each partition, not DataFrame labels) of the mistakes.
train_positions = np.where(train_err)
test_positions = np.where(test_err)

print('\nIn training set, made {} errors out of {}, on instances {}'.format(
    train_errs, len(y_train), train_positions))
print('\nIn testing set, made {} errors out of {}, on instances {}'.format(
    test_errs, len(y_test), test_positions))

# error rate
# Fraction of misclassified samples in each partition.
train_err_rate = zero_one_loss(y_train, y_train_predict)
test_err_rate = zero_one_loss(y_test, y_test_predict)

print('\nError rates in training set: {:0.3f}'.format(train_err_rate))
print('\nError rates in testing set: {:0.3f}'.format(test_err_rate))

   Sample code number  Clump Thickness  Uniformity of Cell Size  \
0             1000025                5                        1   
1             1002945                5                        4   
2             1015425                3                        1   
3             1016277                6                        8   
4             1017023                4                        1   

   Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  \
0                         1                  1                            2   
1                         4                  5                            7   
2                         1                  1                            2   
3                         8                  1                            3   
4                         1                  3                            2   

  Bare Nuclei  Bland Chromatin  Normal Nucleoli  Mitoses  Class  
0           1                3                1        1