In [20]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')


In [21]:
# Load the labelled training set and the unlabelled test set from ./data.
trainingData = pd.read_csv(filepath_or_buffer='./data/train.csv')
testData = pd.read_csv(filepath_or_buffer='./data/test.csv')


In [22]:
from collections import Counter
import re

# Work on a copy of trainingData: the copy is the frame that the following
# cleaning steps modify, so the raw frame stays as loaded.
trainingDataCopy = trainingData.copy(deep=True)


def get_title(name):
    """Extract the honorific (e.g. ``Mr``, ``Mrs``, ``Miss``) from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, as in ``"Braund, Mr. Owen Harris"``.

    Parameters
    ----------
    name : str
        Full passenger name.

    Returns
    -------
    str
        The matched title without its trailing period, or ``""`` when no
        title is present or ``name`` is not a string (e.g. a missing value).
    """
    # A missing name reaches us as NaN/None rather than text; report "no title"
    # instead of letting re.search raise TypeError.
    if not isinstance(name, str):
        return ""
    # Raw string: "\." inside a normal literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    title_match = re.search(r' ([A-Za-z]+)\.', name)
    return title_match.group(1) if title_match else ""


# Add the honorific extracted from each name as a new 'Title' column.
trainingDataCopy['Title'] = trainingDataCopy['Name'].apply(get_title)

# One title per row (duplicates included); retained as a plain list in case
# other cells rely on it.
titles = list(trainingDataCopy['Title'])

# Mean age of the passengers holding each title, as a {title: mean_age} dict.
# A single groupby pass replaces the earlier loop over `titles`, which
# re-filtered the whole frame once per row. A title whose ages are all missing
# still maps to NaN.
title_ages = trainingDataCopy.groupby('Title')['Age'].mean().to_dict()

# Fill every missing age with the mean age of the passenger's title.
# The mask is taken from trainingDataCopy itself (the original selected rows
# with the mask of the raw trainingData frame), and Series.map replaces the
# row-wise apply.
age_is_missing = trainingDataCopy['Age'].isnull()
trainingDataCopy.loc[age_is_missing, 'Age'] = (
    trainingDataCopy.loc[age_is_missing, 'Title'].map(title_ages)
)

# NOTE: the test set needs the same Title / Age treatment before prediction,
# using the title_ages computed here from the training data.


In [23]:
# Cabin is dropped outright rather than imputed.
trainingDataCopy = trainingDataCopy.drop(columns='Cabin')

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form acts on a temporary object under
# pandas Copy-on-Write and leaves the frame unchanged.
# NOTE(review): 'S' is presumably the most frequent port in the training data - confirm.
trainingDataCopy['Embarked'] = trainingDataCopy['Embarked'].fillna('S')

# Remaining missing values per column.
trainingDataCopy.isnull().sum()


PassengerId    0
Perished       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Title          0
dtype: int64

In [24]:
# Encode Sex as an integer feature: male -> 0, female -> 1. The explicit cast
# keeps the column numeric regardless of how replace() infers the result dtype,
# and fails loudly if an unexpected value is present.
trainingDataCopy['Sex'] = (
    trainingDataCopy['Sex'].replace({'male': 0, 'female': 1}).astype(int)
)

# One-hot encode the port of embarkation, one column per distinct port value.
# dtype=int pins the indicator columns to 0/1 integers; newer pandas versions
# would otherwise return booleans.
embarked_ohe = pd.get_dummies(trainingDataCopy['Embarked'], dtype=int)

# Append the indicator columns and remove the original categorical column.
trainingDataCopy = pd.concat(
    [trainingDataCopy, embarked_ohe], axis=1).drop(columns='Embarked')

trainingDataCopy.head()


Unnamed: 0,PassengerId,Perished,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Title,C,Q,S
0,1,1,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,Mr,0,0,1
1,2,0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,Mrs,1,0,0
2,3,0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,Miss,0,0,1
3,4,0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,Mrs,0,0,1
4,5,1,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,Mr,0,0,1


In [25]:
# Remove the columns that are not used as model inputs.
trainingDataCopy = trainingDataCopy.drop(
    columns=['Name', 'Ticket', 'Fare', 'Title'])

# Select the target and the features by column name rather than by position,
# so a change in column order cannot silently shift the label or leak the
# identifier into X. With the current layout (PassengerId, Perished, then the
# features) this yields the same arrays as iloc[:, 2:] / iloc[:, 1].
X = trainingDataCopy.drop(columns=['PassengerId', 'Perished']).values
y = trainingDataCopy['Perished'].values


In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


# Hyper-parameter grid explored for the random forest.
# (A smaller grid for quick runs: max_depth [3, 5, 7], min_samples_leaf [1, 2, 4].)
param_grid = {'max_depth': [3, 5, 6, 7, 8, 9],
              'min_samples_leaf': [1, 2, 3, 4, 5]}

# 70/30 hold-out split; the classifiers fitted in the following cells are
# trained on X_train and scored on X_valid.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=42)

# 5-fold cross-validated grid search over param_grid.
# NOTE(review): the search is fitted on the full X / y, not on X_train, so the
# resulting forest has seen the X_valid rows - confirm this is intended.
rfc_gs = GridSearchCV(RandomForestClassifier(
    n_estimators=100, n_jobs=-1, random_state=42), param_grid, cv=5)
rfc_gs.fit(X, y)

# Keep a handle on the best forest (refitted by GridSearchCV on the data passed
# to fit) under the name `rfc`, so it can be used directly for prediction.
rfc = rfc_gs.best_estimator_

print('Best Parameters: {}'.format(rfc_gs.best_params_))
print('CV Score: {}'.format(round(rfc_gs.best_score_, 3)))


Best Parameters: {'max_depth': 6, 'min_samples_leaf': 1}
CV Score: 0.828


In [27]:
from sklearn.neural_network import MLPClassifier


# Multilayer perceptron with three hidden layers (100, 100 and 10 units),
# fitted on the training part of the hold-out split.
mlpc = MLPClassifier(random_state=0, hidden_layer_sizes=(100, 100, 10))
mlpc.fit(X_train, y_train)

# Report score() on the training split and on the validation split
# (the latter is labelled "Test" in the printed output).
print('Multilayer Perceptron \n')
for template, features, labels in (
        ('Train Score: {}', X_train, y_train),
        (' Test Score: {}', X_valid, y_valid)):
    print(template.format(round(mlpc.score(features, labels), 3)))


In [None]:
# Logistic regression baseline, fitted on the training part of the hold-out split.
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)

# Report score() on the training split and on the validation split
# (the latter is labelled "Test" in the printed output).
print('Logistic Regression \n')
for template, features, labels in (
        ('Train Score: {}', X_train, y_train),
        (' Test Score: {}', X_valid, y_valid)):
    print(template.format(round(lr.score(features, labels), 3)))


In [None]:
# Build the test feature matrix with the same steps that were applied to the
# training data, so that its columns line up with X.
# NOTE(review): assumes test.csv carries the training columns minus 'Perished' - confirm.
testDataCopy = testData.copy()
testDataCopy['Title'] = testDataCopy['Name'].apply(get_title)

# Impute missing ages from the training-set per-title means; a title with no
# usable training mean falls back to the overall training mean age.
title_mean_age = testDataCopy['Title'].map(title_ages).fillna(
    trainingDataCopy['Age'].mean())
testDataCopy['Age'] = testDataCopy['Age'].fillna(title_mean_age)

# Same integer encoding of Sex as for the training data.
testDataCopy['Sex'] = (
    testDataCopy['Sex'].replace({'male': 0, 'female': 1}).astype(int)
)

# One-hot encode Embarked against a fixed C/Q/S column set so the layout
# matches the training features even if a port is absent from the test data.
embarked_test = pd.get_dummies(
    testDataCopy['Embarked'].fillna('S'), dtype=int
).reindex(columns=['C', 'Q', 'S'], fill_value=0)

# Column order used for X: Pclass, Sex, Age, SibSp, Parch, C, Q, S.
X_test = pd.concat(
    [testDataCopy[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']], embarked_test],
    axis=1).values

# Use the best forest refitted by the grid search.
rfc = rfc_gs.best_estimator_

# Class-probability predictions of the three models on the test features.
rfc_pred = rfc.predict_proba(X_test)
lr_pred = lr.predict_proba(X_test)
mlpc_pred = mlpc.predict_proba(X_test)

# Soft voting: average the probabilities, then pick the most probable class.
# Indexing classes_ turns the column index back into the class label
# (identical to the bare argmax when the labels are 0 and 1).
pred_proba = (rfc_pred + lr_pred + mlpc_pred) / 3
pred = rfc.classes_[pred_proba.argmax(axis=1)]
