# Overview
- パーセプトロンモデルによって分類を試みた。
- *Name*、*Ticket*、*Cabin*は、ひとまず特徴量から抜いた。

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [9]:
train_data = pd.read_csv('data/train.csv')
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [80]:
# Passengerid, Name, Ticket, Cabin列を除いた特徴量を取得
X = train_data.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# Sex, Embarked列をone-hot encordし、それぞれ1列を削除する
X = pd.get_dummies(X)
X = X.drop(['Sex_male', 'Embarked_S'], axis=1)
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Embarked_C,Embarked_Q
0,3,22.0,1,0,7.2500,0,0,0
1,1,38.0,1,0,71.2833,1,1,0
2,3,26.0,0,0,7.9250,1,0,0
3,1,35.0,1,0,53.1000,1,0,0
4,3,35.0,0,0,8.0500,0,0,0
...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,0,0,0
887,1,19.0,0,0,30.0000,1,0,0
888,3,,1,2,23.4500,1,0,0
889,1,26.0,0,0,30.0000,0,1,0


In [81]:
y = train_data['Survived']
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [76]:
# 欠損値を平均値で補完する
from sklearn.impute import SimpleImputer
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(X)
X_imputed = imr.transform(X)
X_imputed.shape

(891, 8)

In [95]:
# 訓練用、テスト用にデータ分割する
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.3, random_state=21, stratify=y)
print('Label counts in y: [0 1] =', np.bincount(y))
print('Label counts in y_train: [0 1] =', np.bincount(y_train))
print('Label counts in y_test: [0 1] =', np.bincount(y_test))

Label counts in y: [0 1] = [549 342]
Label counts in y_train: [0 1] = [384 239]
Label counts in y_test: [0 1] = [165 103]


In [96]:
# 特徴量を標準化する
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

X_train_std[0:5]

array([[ 0.84227343, -1.34363494,  0.48796888, -0.49461107, -0.42174203,
        -0.75188206,  2.07961206, -0.30806386],
       [-0.35575476,  0.22628875, -0.46802047,  2.18489066, -0.12720514,
         1.32999582, -0.48085892, -0.30806386],
       [-0.35575476,  0.67483838, -0.46802047, -0.49461107, -0.12720514,
        -0.75188206, -0.48085892, -0.30806386],
       [-1.55378295,  1.57193763, -0.46802047, -0.49461107, -0.11622859,
        -0.75188206, -0.48085892, -0.30806386],
       [-1.55378295, -0.0204795 ,  0.48796888, -0.49461107,  0.39168654,
         1.32999582, -0.48085892, -0.30806386]])

In [97]:
# パーセプトロンで分類モデル作成
# ===============================

from sklearn.linear_model import Perceptron
ppn = Perceptron(eta0=0.01, random_state=21)
ppn.fit(X_train_std, y_train)

# X_testで分類予測
y_pred = ppn.predict(X_test_std)
# 分類の正解率を計算
from sklearn.metrics import accuracy_score
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.746
