# 機械学習をPythonで実践する-5　　～ ロジスティック回帰 ～

In [60]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import seaborn as sns
from itertools import product
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
# from sklearn.preprocessing import StandardScaler, PolynomialFeatures
# import statsmodels.api as sma
from sklearn.model_selection import train_test_split ,LeaveOneOut, cross_val_score, KFold, RepeatedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, log_loss
# from sklearn.pipeline import Pipeline
# from sklearn.neighbors import KNeighborsRegressor


%matplotlib inline
import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## ○ ロジスティック回帰で分類器を学習して評価する
* sklearn.linear_model.LogisticRegression　　※ロジスティック回帰は線形モデルの一種
    * 使い方は他のモデル同様
    * 引数
        * penalty: ‘l1’, ‘l2’, ‘elasAcnet’, ‘none’  →正則化項
        * solver: ‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga'  →最適化アルゴリズム
        * mulA_class: ‘ovr’, ‘auto’, ‘mulAnomial’  →多クラス分類用の引数
        * .predict(X)でラベル(クラス)の分類結果を取得
        * .predict_proba(X)でラベル(クラス)の確率$p(X)$を取得
* logloss
    * sklearn.metrics.log_loss
    * log_loss(y_true, y_pred_proba)
    * y_pred_proba には.predict_proba(X)の戻り値、つまり確率を入れる

In [8]:
# タイタニックのdataset読み込み。
df = sns.load_dataset('titanic')

In [9]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


タイタニック号沈没を題材に、乗客が生存したかどうかのデータセット。※実際のタイタニック号のデータではない

In [10]:
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


ageに欠損値(NaN)がある模様

In [16]:
# NaNを全て落とす
df = df.dropna()
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [29]:
# 'survived'および'alive'を目的変数とするので、これらを除いたものを特徴量とする
X = df.loc[: , (df.columns != 'survived') & (df.columns != 'alive')]
X

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,False
3,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,False
6,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,True
10,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,False
11,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,False
872,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,True
879,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,False
887,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,True


In [30]:
# 質的変数をダミー変数にする
X = pd.get_dummies(X, drop_first=True)
X

Unnamed: 0,pclass,age,sibsp,parch,fare,adult_male,alone,sex_male,embarked_Q,embarked_S,...,who_man,who_woman,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Queenstown,embark_town_Southampton
1,1,38.0,1,0,71.2833,False,False,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,1,35.0,1,0,53.1000,False,False,0,0,1,...,0,1,0,1,0,0,0,0,0,1
6,1,54.0,0,0,51.8625,True,True,1,0,1,...,1,0,0,0,0,1,0,0,0,1
10,3,4.0,1,1,16.7000,False,False,0,0,1,...,0,0,0,0,0,0,0,1,0,1
11,1,58.0,0,0,26.5500,False,True,0,0,1,...,0,1,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,47.0,1,1,52.5542,False,False,0,0,1,...,0,1,0,0,1,0,0,0,0,1
872,1,33.0,0,0,5.0000,True,True,1,0,1,...,1,0,1,0,0,0,0,0,0,1
879,1,56.0,0,1,83.1583,False,False,0,0,0,...,0,1,0,1,0,0,0,0,0,0
887,1,19.0,0,0,30.0000,False,True,0,0,1,...,0,1,1,0,0,0,0,0,0,1


In [32]:
# 目的変数を作成
y = df['survived']
y

1      1
3      1
6      0
10     1
11     1
      ..
871    1
872    0
879    1
887    1
889    1
Name: survived, Length: 182, dtype: int64

In [35]:
# 0,1以外の値が入っていないか確認
y.unique()

array([1, 0])

In [37]:
# hold-outで学習データとテストデータを分ける
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(len(X_train), len(X_test))

127 55


In [39]:
# ロジスティック回帰モデルのインスタンス生成と学習
model = LogisticRegression()
model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


デフォルトのsolverはlbfgsであり、収束しなかったという警告が出ている。  
収束させるにはイテレーションを増やしたりする。ある程度は最適解に近づいていると考えて、今回はそのまま進める。

In [56]:
# パラメタの確認
model.coef_

array([[-0.29620396, -0.02131131,  0.62294616, -0.3739691 ,  0.00478609,
        -0.88715882,  0.17502377,  0.38867403, -0.4046351 , -0.02314913,
        -0.45329361, -0.4048151 , -0.88715882,  1.21203173, -0.11131045,
        -1.20769917, -0.14170395,  0.61111292, -0.13073247, -0.55173314,
        -0.4046351 , -0.02314913]])

In [57]:
# バイアス項を確認
model.intercept_

array([2.14355034])

In [59]:
# 特徴量の名前を確認。coef_と合わせるとどの特徴量に対応するパラメタなのか確認するのに便利
model.feature_names_in_

array(['pclass', 'age', 'sibsp', 'parch', 'fare', 'adult_male', 'alone',
       'sex_male', 'embarked_Q', 'embarked_S', 'class_Second',
       'class_Third', 'who_man', 'who_woman', 'deck_B', 'deck_C',
       'deck_D', 'deck_E', 'deck_F', 'deck_G', 'embark_town_Queenstown',
       'embark_town_Southampton'], dtype=object)

In [53]:
# テストデータに対して予測。（確率を計算した後、２値のどちら寄りかが返ってくる）
y_pred = model.predict(X_test)
y_pred

array([1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1])

In [49]:
X_test.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,adult_male,alone,sex_male,embarked_Q,embarked_S,...,who_man,who_woman,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Queenstown,embark_town_Southampton
484,1,25.0,1,0,91.0792,True,False,1,0,0,...,1,0,1,0,0,0,0,0,0,0
110,1,47.0,0,0,52.0,True,True,1,0,1,...,1,0,0,1,0,0,0,0,0,1
195,1,58.0,0,0,146.5208,False,True,0,0,0,...,0,1,1,0,0,0,0,0,0,0
496,1,54.0,1,0,78.2667,False,False,0,0,0,...,0,1,0,0,1,0,0,0,0,0
889,1,26.0,0,0,30.0,True,True,1,0,0,...,1,0,0,1,0,0,0,0,0,0


In [50]:
# 正解を見てみる
y

1      1
3      1
6      0
10     1
11     1
      ..
871    1
872    0
879    1
887    1
889    1
Name: survived, Length: 182, dtype: int64

割とあっていそうな感じはある。

In [51]:
# p(X_test)の値（確率）を見てみる
# ０列目が0,１列目が1の確率
y_pred_proba = model.predict_proba(X_test)
y_pred_proba

array([[0.29391013, 0.70608987],
       [0.79738438, 0.20261562],
       [0.06989546, 0.93010454],
       [0.0592785 , 0.9407215 ],
       [0.72740262, 0.27259738],
       [0.55988935, 0.44011065],
       [0.23461532, 0.76538468],
       [0.11840954, 0.88159046],
       [0.02387534, 0.97612466],
       [0.05005434, 0.94994566],
       [0.72374981, 0.27625019],
       [0.05473768, 0.94526232],
       [0.08546959, 0.91453041],
       [0.05005434, 0.94994566],
       [0.50642995, 0.49357005],
       [0.20945738, 0.79054262],
       [0.48461463, 0.51538537],
       [0.12854399, 0.87145601],
       [0.19895646, 0.80104354],
       [0.64540661, 0.35459339],
       [0.13486385, 0.86513615],
       [0.17325155, 0.82674845],
       [0.05864188, 0.94135812],
       [0.14777465, 0.85222535],
       [0.29495916, 0.70504084],
       [0.77584395, 0.22415605],
       [0.51415213, 0.48584787],
       [0.33575284, 0.66424716],
       [0.36620834, 0.63379166],
       [0.67718045, 0.32281955],
       [0.

In [55]:
# Log Lossで評価。
log_loss(y_test, y_pred_proba)

0.411158697819259

ロジスティック回帰の精度指標としてはLogLossはあまり使わない。この数値の解釈については後述。  
また、今回は特徴量が多く多次元なので決定境界は簡単に視覚化できない。  