<h2>乳がんデータをもとにロジスティック回帰を使ってモデルの構築を行う</h2>

In [7]:
import numpy as np 
import numpy.random as random 
import scipy as sp 
from pandas import Series,DataFrame 
import pandas as pd 

import matplotlib.pyplot as plt 
import matplotlib as mpl 
import seaborn as sns 
%matplotlib inline 

import sklearn 

%precision 3

import requests,zipfile 
import io 

from sklearn.datasets import load_breast_cancer 

data = load_breast_cancer()



In [8]:
df = pd.DataFrame(data.data,columns=data.feature_names)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
mean radius                569 non-null float64
mean texture               569 non-null float64
mean perimeter             569 non-null float64
mean area                  569 non-null float64
mean smoothness            569 non-null float64
mean compactness           569 non-null float64
mean concavity             569 non-null float64
mean concave points        569 non-null float64
mean symmetry              569 non-null float64
mean fractal dimension     569 non-null float64
radius error               569 non-null float64
texture error              569 non-null float64
perimeter error            569 non-null float64
area error                 569 non-null float64
smoothness error           569 non-null float64
compactness error          569 non-null float64
concavity error            569 non-null float64
concave points error       569 non-null float64
symmetry error             569 

説明変数
1. mean radius 平均半径
2. mean texture テクスチャをグレースケールにした際の平均
3. mean perimeter 平均外周の長さ
4. mean area 平均面積
5. mean smoothness 平均なめらかさ（半径の分散）
6. mean compactness 外周長さ^2 / 面積 - 1.0で示すコンパクトさ平均
7. mean concavity 輪郭の凹部の重要度の平均
8. mean concave points 輪郭の凹部の数の平均
9. mean symmetry 対称性
10. mean fractal dimension フラクタル次元の平均
11. radius error 半径誤差
12. texture error テクスチャの誤差
13. perimeter error 外周の誤差
14. area error 面積の誤差
15. smoothness error なめらかさの誤差
16. compactness error コンパクトさの誤差
17. concavity error 輪郭の凹部の重要度の誤差
18. concave points error 輪郭の凹部の数の誤差
19. symmetry error 対称性の誤差
20. fractal dimension error フラクタル次元の誤差
21. worst radius 半径最悪値
22. worst texture テクスチャ最悪値
23. worst perimeter 外周の長さ最悪値
24. worst area 面積の最悪値
25. worst smoothness なめらかさの最悪値
26. worst compactness コンパクトさの最悪値
27. worst concavity 輪郭の凹部の重要度の最悪値
28. worst concave points 輪郭の凹部の数の最悪値
29. worst symmetry 対称性の最悪値
30. worst fractal dimension フラクタル次元の最悪値

In [10]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [11]:
df_target = pd.DataFrame(data.target,columns=['target'])

In [12]:
df_target

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [15]:
df_target.groupby('target').size()

target
0    212
1    357
dtype: int64

In [24]:
df.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')

In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 

X = df[['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension']]
y = df_target['target']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.5)

In [26]:
#標準化sc 
sc =StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std =sc.transform(X_test)

In [30]:
model = LogisticRegression()
model.fit(X_train_std,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [28]:
model.score(X_train_std,y_train)

0.9964788732394366

In [29]:
model.score(X_test_std,y_test)

0.9649122807017544