# Titanic: Machine Learning from Disaster

In [1]:
import numpy as np  
import pandas as pd  
from sklearn import tree

# 基本の描画ライブラリ（２つ）
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# 便利な設定

# pandasで全ての列を表示
pd.options.display.max_columns = None

# 図をipython notebook内で表示
%matplotlib inline

# DeplicatedWarningを避けるため
import warnings
#warnings.filterwarnings("ignore", category=DeprecationWarning)

## データの読み込み

In [13]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

## データ整形

目的：データの特徴を理解し、機械学習ができる形に整形する

In [14]:
df_train["Sex"][df_train["Sex"] == "male"] = 0
df_train["Sex"][df_train["Sex"] == "female"] = 1
### Sexを0と1のデータに変換

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [15]:
df_train["Embarked"] = df_train["Embarked"].fillna("S")
# NaN値のところはfillnaメソッドを使ってSで埋める

df_train["Embarked"][df_train["Embarked"] == "S"] = 0
df_train["Embarked"][df_train["Embarked"] == "C"] = 1
df_train["Embarked"][df_train["Embarked"] == "Q"] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### 説明変数と目的変数を作成

In [16]:
target = df_train["Survived"].values
# 目的変数

In [17]:
features_one = df_train[["Sex", "Embarked"]].values
# 説明変数（使う変数をデータフレームの中からカラム名で指定する）

In [18]:
my_tree_one = tree.DecisionTreeClassifier()
# DecisionTreeClassifierを初期化

In [19]:
my_tree_one = my_tree_one.fit(features_one, target)
# 説明変数と目的変数をnumpy.arrayで投げる
# 質問：clf.fit(X,Y)これってやっぱり説明変数がＸで目的変数がＹ？逆じゃだめ？

In [20]:
my_tree_one.feature_importances_

array([ 0.94313207,  0.05686793])

In [21]:
my_tree_one.score(features_one, target)

0.78675645342312006

### 正答率78%（´・ω・｀）でも前の分類器（76%）よりはいいのでKaggleにアップロードしてみる

### テストデータを使ってPredictしたデータで提出用のcsvファイルを作成

In [22]:
df_test["Sex"][df_test["Sex"] == "male"] = 0
df_test["Sex"][df_test["Sex"] == "female"] = 1
### Sexを0と1のデータに変換

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [23]:
df_test["Embarked"] = df_test["Embarked"].fillna("S")
# NaN値のところはfillnaメソッドを使ってSで埋める

df_test["Embarked"][df_test["Embarked"] == "S"] = 0
df_test["Embarked"][df_test["Embarked"] == "C"] = 1
df_test["Embarked"][df_test["Embarked"] == "Q"] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [25]:
# Extract the features from the test set: Pclass and Sex.
test_features = df_test[["Pclass", "Sex"]].values
test_features

array([[3, 0],
       [3, 1],
       [2, 0],
       [3, 0],
       [3, 1],
       [3, 0],
       [3, 1],
       [2, 0],
       [3, 1],
       [3, 0],
       [3, 0],
       [1, 0],
       [1, 1],
       [2, 0],
       [1, 1],
       [2, 1],
       [2, 0],
       [3, 0],
       [3, 1],
       [3, 1],
       [1, 0],
       [3, 0],
       [1, 1],
       [1, 0],
       [1, 1],
       [3, 0],
       [1, 1],
       [3, 0],
       [1, 0],
       [3, 0],
       [2, 0],
       [2, 0],
       [3, 1],
       [3, 1],
       [1, 0],
       [3, 0],
       [3, 1],
       [3, 1],
       [3, 0],
       [3, 0],
       [3, 0],
       [1, 0],
       [3, 0],
       [2, 1],
       [1, 1],
       [3, 0],
       [1, 0],
       [3, 0],
       [1, 1],
       [3, 1],
       [1, 0],
       [2, 0],
       [2, 1],
       [1, 1],
       [2, 0],
       [3, 0],
       [3, 0],
       [3, 0],
       [3, 0],
       [1, 1],
       [3, 0],
       [2, 0],
       [3, 0],
       [3, 1],
       [1, 0],
       [2, 1],
       [3,

In [26]:
# Make your prediction using the test set
my_prediction = my_tree_one.predict(test_features)
my_prediction

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1,

# なんでSurvivedが全部１なん・・・・・

In [18]:
# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions
PassengerId =np.array(df_test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(my_prediction, PassengerId, columns = ["Survived"])

In [19]:
# Check that your data frame has 418 entries
print(my_solution.shape)

(418, 1)


In [20]:
my_solution
# なんでSurvivedが全部１なん・・・・・

Unnamed: 0,Survived
892,1
893,1
894,1
895,1
896,1
897,1
898,1
899,1
900,1
901,1


In [39]:
# Output submmision data
# my_solution.to_csv("output/submission02.csv", index_label = ["PassengerId"])