In [2]:
import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series, DataFrame
import pandas as pd
import time

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

import sklearn

%precision 3

'%.3f'

In [1]:
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV

In [63]:
cancer = load_breast_cancer()
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y = pd.Series(cancer.target, name='target')

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

model = LogisticRegression()
clf = model.fit(X_train_std, y_train)
print(clf.__class__.__name__)
print('train:', clf.score(X_train_std, y_train))
print('test:', clf.score(X_test_std, y_test))

pred_y = clf.predict(X_test_std)
confusion_m = confusion_matrix(y_test, pred_y)
print('Confusion matrix:\n{}'.format(confusion_m))

LogisticRegression
train: 0.9906103286384976
test: 0.958041958041958
Confusion matrix:
[[50  3]
 [ 3 87]]


In [5]:
X.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


パラメータ多すぎる気がするから、penaltyをl1にして不要な特徴を消す

In [40]:
model = LogisticRegression(penalty='l1', C=2.0, solver='liblinear')
clf = model.fit(X_train_std, y_train)
print(clf.__class__.__name__)
print('train:', clf.score(X_train_std, y_train))
print('test:', clf.score(X_test_std, y_test))

pred_y = clf.predict(X_test_std)
confusion_m = confusion_matrix(y_test, pred_y)
print('Confusion matrix:\n{}'.format(confusion_m))

LogisticRegression
train: 0.9929577464788732
test: 0.965034965034965
Confusion matrix:
[[51  2]
 [ 3 87]]


↓解答のコピペ

In [59]:
from sklearn.cluster import KMeans
kmeans_pp = KMeans(n_clusters=5)
kmeans_pp.fit(X_train_std)
y_train_cl = kmeans_pp.fit_predict(X_train_std)

y_test_cl = kmeans_pp.fit_predict(X_test_std)

cl_train_data = pd.DataFrame(y_train_cl, columns=['cl_nm']).astype(str)
cl_train_data_dummy = pd.get_dummies(cl_train_data)
cl_train_data_dummy.head()

Unnamed: 0,cl_nm_0,cl_nm_1,cl_nm_2,cl_nm_3,cl_nm_4
0,1,0,0,0,0
1,0,1,0,0,0
2,0,0,1,0,0
3,0,0,0,1,0
4,1,0,0,0,0


In [60]:
cl_test_data = pd.DataFrame(y_test_cl, columns=['cl_nm']).astype(str)
cl_test_data_dummy = pd.get_dummies(cl_test_data)
cl_test_data_dummy.head()

Unnamed: 0,cl_nm_0,cl_nm_1,cl_nm_2,cl_nm_3,cl_nm_4
0,0,1,0,0,0
1,0,0,1,0,0
2,0,0,1,0,0
3,0,1,0,0,0
4,0,0,1,0,0


In [84]:
#y_train
pd.DataFrame(list(y_train), columns=['flg'])

Unnamed: 0,flg
0,1
1,0
2,1
3,1
4,1
...,...
421,0
422,1
423,1
424,1


In [85]:
merge_train_data = pd.concat([
        pd.DataFrame(X_train_std),
        cl_train_data_dummy,
        pd.DataFrame(list(y_train), columns=['flg'])
    ], axis=1)
merge_test_data = pd.concat([
        pd.DataFrame(X_test_std),
        cl_test_data_dummy,
        pd.DataFrame(list(y_test), columns=['flg'])
    ], axis=1)
merge_train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,cl_nm_0,cl_nm_1,cl_nm_2,cl_nm_3,cl_nm_4,flg
0,-0.500746,-0.629604,-0.510598,-0.508655,-0.32677,-0.678037,-0.702917,-0.67329,-0.323201,-0.513532,...,-0.494471,-0.429224,-0.46502,-0.447715,1,0,0,0,0,1
1,0.948356,0.01107,0.931367,0.814498,-0.473158,0.297845,0.19152,0.649428,-1.114571,-1.117685,...,0.387699,1.175397,0.053685,-0.302163,0,1,0,0,0,0
2,-1.005023,-0.151387,-1.005709,-0.884654,0.755356,-0.706644,-0.840513,-0.798055,-1.203323,0.466252,...,-0.915127,-0.748055,-1.142683,-0.316267,0,0,1,0,0,1
3,-1.63426,0.326831,-1.551415,-1.243587,-0.159571,0.500562,0.556308,-0.699663,1.533191,2.838587,...,1.303103,-0.546019,0.712943,3.642956,0,0,0,1,0,1
4,-0.254149,-0.789772,-0.314642,-0.325885,-0.801097,-0.976997,-1.115819,-1.166748,-0.648624,-0.542097,...,-1.272052,-1.350424,-0.409803,-0.009932,1,0,0,0,0,1


In [86]:
from sklearn.metrics import confusion_matrix

model = LogisticRegression()
X_train_data = merge_train_data.drop('flg', axis=1)
X_test_data = merge_test_data.drop('flg', axis=1)

y_train_data = merge_train_data['flg']
y_test_data = merge_test_data['flg']

from sklearn.decomposition import PCA

best_score = 0
best_num = 0

for num_com in range(8):
    pca = PCA(n_components=num_com+1)
    pca.fit(X_train_data)
    X_train_pca = pca.transform(X_train_data)
    X_test_pca = pca.transform(X_test_data)
    
    logistic_model = model.fit(X_train_pca, y_train_data)
    
    train_score = logistic_model.score(X_train_pca, y_train_data)
    test_score = logistic_model.score(X_test_pca, y_test_data)
    
    if best_score < test_score:
        best_score = test_score
        best_num = num_com+1

print('best score:', best_score)
print('best num components:', best_num)

best score: 0.965034965034965
best num components: 8
