# Overview
- SVMモデルによって分類を試みた。
- nb001, nb002と同様、*Name*、*Ticket*、*Cabin*は、ひとまず特徴量から抜いた。データも欠損値平均補完と、欠損値削除の両方を用意した。
- *svc_1*はlinear SVC: 欠損値平均補完、*svc_2*はlinear SVC: 欠損値削除、*svc_3*はkernel SVC(rbf): 欠損値削除として訓練した。

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# Read the training CSV from its relative path and preview the first five rows.
train_data_raw = pd.read_csv('../data/train.csv')
train_data_raw.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
# Build the working table: every column except PassengerId, Name, Ticket and Cabin.
train_data = train_data_raw.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# One-hot encode Sex and Embarked; drop_first=True removes one dummy column
# per encoded variable.
# NOTE(review): get_dummies encodes a missing category as all zeros, so a row
# with a missing Embarked value would look identical to the dropped first
# level and would not be removed by a later dropna -- confirm whether
# Embarked contains missing values.
train_data = pd.get_dummies(
    train_data,
    columns=['Sex', 'Embarked'],
    drop_first=True,
)

# Remember the column names so they can be re-attached after sklearn transforms.
train_data_columns = train_data.columns.values
train_data

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.2500,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.9250,0,0,1
3,1,1,35.0,1,0,53.1000,0,0,1
4,0,3,35.0,0,0,8.0500,1,0,1
...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000,1,0,1
887,1,1,19.0,0,0,30.0000,0,0,1
888,0,3,,1,2,23.4500,0,0,1
889,1,1,26.0,0,0,30.0000,1,0,0


In [20]:
# Fill missing feature values with the column mean: train_data_imputed
# ====================================================================
from sklearn.impute import SimpleImputer

# Separate the label BEFORE imputing: X_imp, y_imp.
# The label must not go through the imputer: doing so would cast it to float,
# would silently mean-fill a missing label, and would tie the fitted imputer
# to a 'Survived' column that unlabeled data does not have.
y_imp = train_data['Survived']
feature_frame = train_data.drop(columns=['Survived'])

# NOTE(review): the imputer is still fitted on all rows before the train/test
# split, so the column means include the rows later held out for testing.
# Fit it on the training split only if a strict evaluation is required.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
X_imp = pd.DataFrame(
    imr.fit_transform(feature_frame),
    columns=feature_frame.columns,
    index=feature_frame.index,   # keep rows aligned with y_imp
)

# Full imputed table under its original name, with the original column order
# (label first, then the features); the label keeps its integer dtype.
train_data_imputed = pd.concat([y_imp, X_imp], axis=1)
X_imp

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3.0,22.000000,1.0,0.0,7.2500,1.0,0.0,1.0
1,1.0,38.000000,1.0,0.0,71.2833,0.0,0.0,0.0
2,3.0,26.000000,0.0,0.0,7.9250,0.0,0.0,1.0
3,1.0,35.000000,1.0,0.0,53.1000,0.0,0.0,1.0
4,3.0,35.000000,0.0,0.0,8.0500,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...
886,2.0,27.000000,0.0,0.0,13.0000,1.0,0.0,1.0
887,1.0,19.000000,0.0,0.0,30.0000,0.0,0.0,1.0
888,3.0,29.699118,1.0,2.0,23.4500,0.0,0.0,1.0
889,1.0,26.000000,0.0,0.0,30.0000,1.0,0.0,0.0


In [28]:
# Split the mean-imputed data into training and test sets
# (30% held out, stratified on the label, fixed seed).
from sklearn.model_selection import train_test_split
X_imp_train, X_imp_test, y_imp_train, y_imp_test = train_test_split(
    X_imp,
    y_imp,
    test_size=0.3,
    random_state=21,
    stratify=y_imp,
)

# Show the class counts of the full set and of each split.
print('Label counts in y_imp: [0 1] =', np.bincount(y_imp))
print('Label counts in y_imp_train: [0 1] =', np.bincount(y_imp_train))
print('Label counts in y_imp_test: [0 1] =', np.bincount(y_imp_test))

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_imp_train_std = sc.fit_transform(X_imp_train)
X_imp_test_std = sc.transform(X_imp_test)

Label counts in y_imp: [0 1] = [549 342]
Label counts in y_imp_train: [0 1] = [384 239]
Label counts in y_imp_test: [0 1] = [165 103]


In [23]:
# Remove every row that contains a missing value: train_data_dropna
# =================================================================
train_data_dropna = train_data.dropna()

# Separate the label from the features: X_dna, y_dna
y_dna = train_data_dropna['Survived']
X_dna = train_data_dropna.drop(columns=['Survived'])
X_dna   # fewer rows than the 891 in train_data (714 rows x 8 columns in the recorded run)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.2500,1,0,1
1,1,38.0,1,0,71.2833,0,0,0
2,3,26.0,0,0,7.9250,0,0,1
3,1,35.0,1,0,53.1000,0,0,1
4,3,35.0,0,0,8.0500,1,0,1
...,...,...,...,...,...,...,...,...
885,3,39.0,0,5,29.1250,0,1,0
886,2,27.0,0,0,13.0000,1,0,1
887,1,19.0,0,0,30.0000,0,0,1
889,1,26.0,0,0,30.0000,1,0,0


In [29]:
# Split the rows-dropped data into training and test sets
# (30% held out, stratified on the label, fixed seed).
from sklearn.model_selection import train_test_split
X_dna_train, X_dna_test, y_dna_train, y_dna_test = train_test_split(
    X_dna,
    y_dna,
    test_size=0.3,
    random_state=21,
    stratify=y_dna,
)

# Show the class counts of the full set and of each split.
print('Label counts in y_dna: [0 1] =', np.bincount(y_dna))
print('Label counts in y_dna_train: [0 1] =', np.bincount(y_dna_train))
print('Label counts in y_dna_test: [0 1] =', np.bincount(y_dna_test))

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
# NOTE: this rebinds `sc`, replacing any scaler previously stored under that name.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_dna_train_std = sc.fit_transform(X_dna_train)
X_dna_test_std = sc.transform(X_dna_test)

Label counts in y_dna: [0 1] = [424 290]
Label counts in y_dna_train: [0 1] = [296 203]
Label counts in y_dna_test: [0 1] = [128  87]


In [35]:
# Linear-kernel SVC classifier
# (svc_1: trained on the mean-imputed data)
# =========================================

from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

svc_1 = SVC(
    kernel='linear',
    C=0.1,
    random_state=21,
    max_iter=-1,
)
svc_1.fit(X_imp_train_std, y_imp_train)

# Predict the labels of the standardized test features
y_pred_1 = svc_1.predict(X_imp_test_std)

# Report the fraction of test rows classified correctly
print('Accuracy: %.3f' % accuracy_score(y_imp_test, y_pred_1))

Accuracy: 0.776


In [52]:
# Linear-kernel SVC classifier
# (svc_2: trained on the data with incomplete rows removed)
# =========================================================

from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

svc_2 = SVC(
    kernel='linear',
    C=0.1,
    random_state=21,
    max_iter=-1,
)
svc_2.fit(X_dna_train_std, y_dna_train)

# Predict the labels of the standardized test features
y_pred_2 = svc_2.predict(X_dna_test_std)

# Report the fraction of test rows classified correctly
print('Accuracy: %.3f' % accuracy_score(y_dna_test, y_pred_2))

Accuracy: 0.805


In [79]:
# RBF-kernel SVC classifier
# (svc_3: trained on the data with incomplete rows removed)
# =========================================================

from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

svc_3 = SVC(
    kernel='rbf',
    gamma=0.1,
    C=1.0,
    random_state=21,
    max_iter=-1,
)
svc_3.fit(X_dna_train_std, y_dna_train)

# Predict the labels of the standardized test features
y_pred_3 = svc_3.predict(X_dna_test_std)

# Report the fraction of test rows classified correctly
print('Accuracy: %.3f' % accuracy_score(y_dna_test, y_pred_3))

Accuracy: 0.823
