In [10]:
import pandas as pd
import numpy as np

train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [11]:
train["Age"].fillna(train["Age"].median(), inplace=True)
train["Embarked"].fillna(train["Embarked"].mode()[0], inplace=True)
train["Sex"].replace({"male":0, "female":1}, inplace=True)
train["Embarked"].replace({"S":0, "C":1, "Q":2}, inplace=True)
test["Age"].fillna(test["Age"].median(), inplace=True)
test["Embarked"].fillna(test["Embarked"].mode()[0], inplace=True)
test["Fare"].fillna(test["Fare"].median(), inplace=True)
test["Sex"].replace({"male": 0, "female": 1}, inplace=True)
test["Embarked"].replace({"S": 0, "C": 1, "Q": 2}, inplace=True)

In [12]:
# 最適な木の深さを探索する
from sklearn import tree

dependent_variables = train["Survived"].values
independent_variables = train[["Pclass", "Sex", "Age", "Fare"]].values

array_length = len(independent_variables)
for i in range(1, 10):
    score_to_mean = 0
    for j in range(0, 5):
        test_dependent_variables = dependent_variables[int(array_length * j / 5):int(array_length * (j + 1) / 5)]
        test_independent_variables = independent_variables[int(array_length * j / 5):int(array_length * (j + 1) / 5)]

        train_dependent_variables = np.concatenate([dependent_variables[0:int(array_length * j / 5)], dependent_variables[int(array_length * (j + 1) / 5):array_length]])
        train_independent_variables = np.concatenate([independent_variables[0:int(
            array_length * j / 5)], independent_variables[int(array_length * (j + 1) / 5):array_length]])


        tree_model = tree.DecisionTreeClassifier(max_depth=i)
        tree_model.fit(train_independent_variables, train_dependent_variables)
        
        prediction = tree_model.predict(test_independent_variables)

        score_to_mean += np.count_nonzero(prediction == test_dependent_variables,
                                          axis=0)/len(test_dependent_variables)

    print("max_depth:", i, "=>", "score:", score_to_mean * 100/5)



max_depth: 1 => score: 78.67428284476806
max_depth: 2 => score: 76.8765300357793
max_depth: 3 => score: 80.46889711882493
max_depth: 4 => score: 80.58000125541398
max_depth: 5 => score: 81.47573912497646
max_depth: 6 => score: 81.70045822610004
max_depth: 7 => score: 81.81407319063462
max_depth: 8 => score: 80.35402674031761
max_depth: 9 => score: 80.1305630531668


In [13]:
# 深さ7の木で学習する
# score: 0.77272
decision_tree = tree.DecisionTreeClassifier(max_depth=7).fit(
    independent_variables, dependent_variables)

result_independent_variables = test[["Pclass", "Sex", "Age", "Fare"]].values
prediction = decision_tree.predict(result_independent_variables)
PassengerId = np.array(test["PassengerId"])
result = pd.DataFrame({"PassengerId": PassengerId, "Survived": prediction})
result.to_csv("../output/result_tree.csv", index=False)


In [14]:
# 引数を変えて算出してみる
import itertools
from sklearn import tree
max = 0
max_tuple = ()
max_learning = 0

dependent_variables = train["Survived"].values
variables = ["Pclass", "Age",
             "Sex", "Fare", "SibSp", "Parch", "Embarked"]
# independent_variables = train[["Pclass", "Sex", "Age", "Fare"]].values
for n in range(1, len(variables)+1):
  for variable_tapples in itertools.combinations(variables, n):

    independent_variables = train[list(variable_tapples)].values
    array_length = len(independent_variables)

    for i in range(1, 10):
      score_to_mean = 0
      for j in range(0, 5):
          test_dependent_variables = dependent_variables[int(
              array_length * j / 5):int(array_length * (j + 1) / 5)]
          test_independent_variables = independent_variables[int(
              array_length * j / 5):int(array_length * (j + 1) / 5)]

          train_dependent_variables = np.concatenate([dependent_variables[0:int(
              array_length * j / 5)], dependent_variables[int(array_length * (j + 1) / 5):array_length]])
          train_independent_variables = np.concatenate([independent_variables[0:int(
              array_length * j / 5)], independent_variables[int(array_length * (j + 1) / 5):array_length]])

          tree_model = tree.DecisionTreeClassifier(max_depth=i)
          tree_model.fit(train_independent_variables, train_dependent_variables)

          prediction = tree_model.predict(test_independent_variables)

          score_to_mean += np.count_nonzero(prediction == test_dependent_variables,
                                            axis=0)/len(test_dependent_variables)

      if max < score_to_mean * 100/5:
        max = score_to_mean * 100/5
        max_tuple = variable_tapples
        max_learning = i

print("max:", max, "max_tuple:", max_tuple, "max_learning:", max_learning)

max: 82.37461552947084 max_tuple: ('Pclass', 'Age', 'Sex', 'SibSp', 'Embarked') max_learning: 7


In [15]:
# 学習する
#　大きくscoreが下がったので最適なパラメータの探索はもう少し工夫が必要だ
# score: 0.74401

dependent_variables = train["Survived"].values
independent_variables = train[[
    'Pclass', 'Age', 'Sex', 'SibSp', 'Embarked']].values

decision_tree = tree.DecisionTreeClassifier(max_depth=7).fit(
    independent_variables, dependent_variables)

result_independent_variables = test[[
    'Pclass', 'Age', 'Sex', 'SibSp', 'Embarked']].values
prediction = decision_tree.predict(result_independent_variables)
PassengerId = np.array(test["PassengerId"])
result = pd.DataFrame({"PassengerId": PassengerId, "Survived": prediction})
result.to_csv("../output/result_tree.csv", index=False)


In [16]:
# 引数の問題はランダムフォレストで解決しよう！
# 学習する
#　大きくscoreが下がったので最適なパラメータの探索はもう少し工夫が必要だ
# 深さ 7 と 5 で比べると 5 の方が良い
# 上記のような引数の決め方ではよくないということがわかった
# →引数は多く入れた方がいいみたい(選択されるから)
# きの深さは設定した方がいいが、5と7では0.05程度の差(設定しなければ0.04)
# score: 0.78708
from sklearn.ensemble import RandomForestClassifier

dependent_variables = train["Survived"].values
independent_variables = train[["Pclass", "Age",
                              "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values

model = RandomForestClassifier(max_depth=5).fit(
    independent_variables, dependent_variables)

result_independent_variables = test[["Pclass", "Age",
                                    "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
prediction = model.predict(result_independent_variables)
PassengerId = np.array(test["PassengerId"])
result = pd.DataFrame({"PassengerId": PassengerId, "Survived": prediction})
result.to_csv("../output/result_tree.csv", index=False)


In [79]:
train['Name'].str.split(' ', n=2).str.get(2)
train['family_name'] = train['Name'].str.replace('Mr. ', '').str.replace('Mrs. ', '').str.replace('Mr. ', '').str.replace(
    'Miss. ', '').str.replace('Master. ', '').str.replace('Dr. ', '').str.replace('Rev. ', '').str.split(' ', n=2).str.get(1)


  train['family_name'] = train['Name'].str.replace('Mr. ', '').str.replace('Mrs. ', '').str.replace('Mr. ', '').str.replace(


In [94]:
train['honorific'] = train['Name'].str.split(', ', n=1).str.get(1).str.split(
    ' ', n=1).str.get(0)
test['honorific'] = test['Name'].str.split(', ', n=1).str.get(1).str.split(
    ' ', n=1).str.get(0)


In [95]:
from sklearn.ensemble import RandomForestClassifier

dependent_variables = train["Survived"].values
independent_variables = train[["Pclass", "Age",
                              "Sex", "Fare", "SibSp", "Parch", "Embarked", "honorific"]].values

model = RandomForestClassifier(max_depth=5).fit(
    independent_variables, dependent_variables)

result_independent_variables = test[["Pclass", "Age",
                                    "Sex", "Fare", "SibSp", "Parch", "Embarked", "honorific"]].values
prediction = model.predict(result_independent_variables)
PassengerId = np.array(test["PassengerId"])
result = pd.DataFrame({"PassengerId": PassengerId, "Survived": prediction})
result.to_csv("../output/result_tree.csv", index=False)


ValueError: could not convert string to float: 'Mr.'