In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
# Load the dataset object; later cells read its data, target, target_names and feature_names fields.
bc = load_breast_cancer()

### データセットの中身確認

In [2]:
print(bc.keys())  # available fields, e.g. dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

print(bc.data.shape)  # (subjects, features): 569 subjects, 30 measured features

print(bc.data[:5, :3])  # only the first 3 features of the first 5 subjects

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [9]:
print(bc.feature_names)  # names of the measured features: radius, texture, perimeter, area, ...

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [10]:
# Pair each class name with the number of samples carrying that class's label.
print(dict(zip(bc.target_names, np.bincount(bc.target))))

{'malignant': 212, 'benign': 357}


In [13]:
print(bc.target[:5])  # labels of the first 5 samples

[0 0 0 0 0]


### データセットをトレーニング用とテスト用に分割

In [14]:
from sklearn.model_selection import train_test_split
# Hold out part of the data for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    bc.data,
    bc.target,
    random_state=1,
)

In [15]:
print(X_train.shape)  # training features: (samples, features)
print(y_test.shape)  # test labels: (samples,)

(426, 30)
(143,)


### 前処理: 正規化（各項目の値域を 0~1 に統一）

In [16]:
# Per-feature minimum and maximum over the training set, for the first 3 features only.
print("Min:{}".format(X_train[:, :3].min(axis=0)))
print("Max:{}".format(X_train[:, :3].max(axis=0)))

Min:[ 6.981  9.71  43.79 ]
Max:[ 28.11  39.28 188.5 ]


In [18]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training set only and rescale the training data with it.
scaler = MinMaxScaler()
X_train_s = scaler.fit_transform(X_train)
# The test set reuses the training-set min/max, so no test information enters the preprocessing.
X_test_s = scaler.transform(X_test)

# Sanity check on the first 3 scaled training features.
print("Min:{}".format(X_train_s[:, :3].min(axis=0)))
print("Max:{}".format(X_train_s[:, :3].max(axis=0)))

Min:[0. 0. 0.]
Max:[1. 1. 1.]


### ロジスティック回帰/トレーニング

In [19]:
from sklearn.linear_model import LogisticRegression as LR

In [21]:
# L1-regularised logistic regression (C=1) fitted on the scaled training data.
# The solver is named explicitly: scikit-learn >= 0.22 defaults to "lbfgs", which
# rejects penalty="l1" with a ValueError. "liblinear" supports L1 and was the
# default solver in earlier releases, so the model itself is unchanged.
logred = LR(C=1, penalty="l1", solver="liblinear").fit(X_train_s, y_train)



### テスト

In [22]:
# Predict a label for every sample in the scaled test set.
y_pred = logred.predict(X_test_s)

### 評価：混同行列，正答率，適合率，再現率

In [23]:
from sklearn.metrics import confusion_matrix as CM

In [24]:
# Confusion matrix of true labels (y_test) against predicted labels (y_pred) on the test set.
print(CM(y_test, y_pred))

[[49  6]
 [ 1 87]]


In [25]:
from sklearn.metrics import classification_report as CR
# Per-class precision, recall and F1 on the test set, labelled with the dataset's class names.
print(CR(y_test, y_pred, target_names=bc.target_names))

              precision    recall  f1-score   support

   malignant       0.98      0.89      0.93        55
      benign       0.94      0.99      0.96        88

   micro avg       0.95      0.95      0.95       143
   macro avg       0.96      0.94      0.95       143
weighted avg       0.95      0.95      0.95       143

