# Overview
- ロジスティック回帰モデルによって分類を試みた。
- nb001と同様、*Name*、*Ticket*、*Cabin*は、ひとまず特徴量から抜いた。
- *lr_1*はnb001と同様に欠損値を平均で補完、*lr_2*は欠損値を持つデータ行を削除して訓練した。

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
train_data = pd.read_csv('../data/train.csv')
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Passengerid, Name, Ticket, Cabin列を除いた特徴量を取得
X = train_data.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# Sex, Embarked列をone-hot encordし、それぞれ1列を削除する
X = pd.get_dummies(X)
X = X.drop(['Sex_male', 'Embarked_S'], axis=1)
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Embarked_C,Embarked_Q
0,3,22.0,1,0,7.2500,0,0,0
1,1,38.0,1,0,71.2833,1,1,0
2,3,26.0,0,0,7.9250,1,0,0
3,1,35.0,1,0,53.1000,1,0,0
4,3,35.0,0,0,8.0500,0,0,0
...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,0,0,0
887,1,19.0,0,0,30.0000,1,0,0
888,3,,1,2,23.4500,1,0,0
889,1,26.0,0,0,30.0000,0,1,0


In [40]:
y = train_data['Survived']
y = pd.DataFrame(y)
y

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [39]:
# 欠損値を平均値で補完する
from sklearn.impute import SimpleImputer
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(X)
X_imputed = imr.transform(X)
X_imputed.shape
#X_imputed[888, 1]
#np.mean(X_imputed[:, 1])

(891, 8)

In [27]:
# 訓練用、テスト用にデータ分割する
from sklearn.model_selection import train_test_split
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_imputed, y, test_size=0.3, random_state=21, stratify=y)
print('Label counts in y: [0 1] =', np.bincount(y))
print('Label counts in y_train: [0 1] =', np.bincount(y_train_1))
print('Label counts in y_test: [0 1] =', np.bincount(y_test_1))

Label counts in y: [0 1] = [549 342]
Label counts in y_train: [0 1] = [384 239]
Label counts in y_test: [0 1] = [165 103]


In [28]:
# 特徴量を標準化する
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train_1)
X_train_1_std = sc.transform(X_train_1)
X_test_1_std = sc.transform(X_test_1)

X_train_1_std[0:5]

array([[ 0.84227343, -1.34363494,  0.48796888, -0.49461107, -0.42174203,
        -0.75188206,  2.07961206, -0.30806386],
       [-0.35575476,  0.22628875, -0.46802047,  2.18489066, -0.12720514,
         1.32999582, -0.48085892, -0.30806386],
       [-0.35575476,  0.67483838, -0.46802047, -0.49461107, -0.12720514,
        -0.75188206, -0.48085892, -0.30806386],
       [-1.55378295,  1.57193763, -0.46802047, -0.49461107, -0.11622859,
        -0.75188206, -0.48085892, -0.30806386],
       [-1.55378295, -0.0204795 ,  0.48796888, -0.49461107,  0.39168654,
         1.32999582, -0.48085892, -0.30806386]])

In [95]:
# ロジスティック回帰で分類モデル作成
# (lr_1: 欠損値データ平均補完ver.)
# ==================================

from sklearn.linear_model import LogisticRegression
lr_1 = LogisticRegression(C=0.1, random_state=21, solver='lbfgs', max_iter=100, multi_class='auto')
lr_1.fit(X_train_1_std, y_train_1)

# X_testで分類予測
y_pred_1 = lr_1.predict(X_test_1_std)
# 分類の正解率を計算
from sklearn.metrics import accuracy_score
print('Accuracy: %.3f' % accuracy_score(y_test_1, y_pred_1))

Accuracy: 0.799


In [86]:
# 欠損値を含む行を削除する
X_y = X.join(y)
X_y_dropna = X_y.dropna(axis=0)
X_dropna = X_y_dropna.drop('Survived', axis=1)
y_dropna = X_y_dropna['Survived']
X_dropna.shape   # X.shape = (891, 8)

(714, 8)

In [89]:
# 訓練用、テスト用にデータ分割する
from sklearn.model_selection import train_test_split
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_dropna, y_dropna, test_size=0.3, random_state=21, stratify=y_dropna)
X_train_2

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Embarked_C,Embarked_Q
594,2,37.0,1,0,26.0000,0,0,0
111,3,14.5,1,0,14.4542,1,1,0
784,3,25.0,0,0,7.0500,0,0,0
642,3,2.0,3,2,27.9000,1,0,0
865,2,42.0,0,0,13.0000,1,0,0
...,...,...,...,...,...,...,...,...
733,2,23.0,0,0,13.0000,0,0,0
487,1,58.0,0,0,29.7000,0,1,0
449,1,52.0,0,0,30.5000,0,0,0
668,3,43.0,0,0,8.0500,0,0,0


In [90]:
# 特徴量を標準化する
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train_2)
X_train_2_std = sc.transform(X_train_2)
X_test_2_std = sc.transform(X_test_2)

X_train_2_std[0:5]

array([[-0.28071855,  0.47216654,  0.45691925, -0.52147681, -0.15873424,
        -0.77749701, -0.47860635, -0.19344776],
       [ 0.90638785, -1.05906166,  0.45691925, -0.52147681, -0.38113078,
         1.28617857,  2.08939977, -0.19344776],
       [ 0.90638785, -0.3444885 , -0.56551441, -0.52147681, -0.52375135,
        -0.77749701, -0.47860635, -0.19344776],
       [ 0.90638785, -1.909744  ,  2.50178656,  1.77118335, -0.12213622,
         1.28617857, -0.47860635, -0.19344776],
       [-0.28071855,  0.81243948, -0.56551441, -0.52147681, -0.40914175,
         1.28617857, -0.47860635, -0.19344776]])

In [94]:
# ロジスティック回帰で分類モデル作成
# (lr_2: 欠損値データ削除ver.)
# ==================================

from sklearn.linear_model import LogisticRegression
lr_2 = LogisticRegression(C=0.1, random_state=21, solver='lbfgs', max_iter=100, multi_class='auto')
lr_2.fit(X_train_1_std, y_train_1)

# X_testで分類予測
y_pred_2 = lr_2.predict(X_test_2_std)
# 分類の正解率を計算
from sklearn.metrics import accuracy_score
print('Accuracy: %.3f' % accuracy_score(y_test_2, y_pred_2))

Accuracy: 0.837
