In [1]:
# ライブラリ読み込み
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# グラフをノートブックに内に表示するおまじない
%matplotlib inline

In [2]:
# データを読み込み
train = pd.read_csv('train.csv')

# 読み込めたか確認
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# 欠損値があるか調べる
train.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [4]:
# 1.Ageの欠損を平均値で埋める
train['Age'].fillna(train['Age'].mean(), inplace=True)

# 2.Cabinを削除する
train.drop('Cabin',  axis=1, inplace = True)

# 3.Embarkedの欠損を一番多い値で埋める
train['Embarked'].fillna(train['Embarked'].mode()[0], inplace = True)

# 4.確認
print(train.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Age'].fillna(train['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Embarked'].fillna(train['Embarked'].mode()[0], inplace = True)


In [5]:
# testデータを読み込み
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
# 1. 欠損値を平均値で埋める
test['Age'].fillna(test['Age'].mean(), inplace = True)

# 2. Cabin列を削除する
test.drop('Cabin', axis = 1, inplace = True)

# 3. Fareの欠損もあるので、ここも平均値で埋める
test['Fare'].fillna(test['Fare'].mean(), inplace = True)

# 4. 念のためにEmbarkedもチェック！！

# 5. 確認
print(test.isnull().sum())

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Age'].fillna(test['Age'].mean(), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Fare'].fillna(test['Fare'].mean(), inplace = True)


In [7]:
# 性別を数値に変換
train['Sex'] = train['Sex'].apply(lambda x: 0 if str(x).lower() == 'male' else 1)

# 乗船港を数値に変換
train['Embarked'] = train['Embarked'].map({'C': 0, 'Q': 1, 'S': 2 })

# 確認
print(train[['Sex', 'Embarked']].head())

   Sex  Embarked
0    0         2
1    1         0
2    1         2
3    1         2
4    0         2


In [8]:
# 説明変数（特徴量）予想に使うデータ
X = train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

# 目的変数（正解ラベル）生存か死亡
Y = train['Survived']


In [9]:
# 1. 決定木（DecisionTreeClassfier）を使う。
from sklearn.tree import DecisionTreeClassifier
# 決定木作るよー

# 2. モデル作成
model = DecisionTreeClassifier()

# 3. モデルを学習
model.fit(X, Y)
#このデータ（model.fit(X,Y)）で学習してねーって合図

In [10]:
# 訓練データに対する予測
pred = model.predict(X)

# 正解率を出す
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(Y, pred)

print(f'正解率: {accuracy:3f}')

# predict(X) = 学習したデータで答えを予測する
# accuracy_acore(正解、予測結果) = どれくらい当たったか計算する

正解率: 0.982043


In [11]:
from sklearn.model_selection import train_test_split

# データ分割
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=0)

# train_test_split = データを「ランダムにして」分けてくれる。
# test_size=0.2 = 20％をテスト用(valid)にし、80％を訓練用(train)にするよー。
# random_state=0 = シャッフルの「種」を固定して、再現できるようにする。

In [12]:
# モデル作成(今までと同じDecisionTreeClassifierを使うよー)
model = DecisionTreeClassifier()

# モデル学習(今度はX_trainとY_trainで！！)
model.fit(X_train, Y_train)

In [13]:
# 検証データに対する予測
pred_valid = model.predict(X_valid)

# 正解率を計算
accuracy = accuracy_score(Y_valid, pred_valid)

print(f'検証データの正解率: {accuracy:3f}')

# 今度はX_validから予測するよ！
#そして、Y_validと比較してaccuracyを出す！

検証データの正解率: 0.770950


In [14]:
# データを読み込み
test = pd.read_csv('test.csv')

# 読み込めたか確認
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [15]:
# 欠損値があるか確認する
test.isnull().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Name,0
Sex,0
Age,86
SibSp,0
Parch,0
Ticket,0
Fare,1
Cabin,327


In [16]:

# testデータの'Age'と'Fare'の欠損を埋める（安全策でもう一度やる）
test['Age'].fillna(test['Age'].mean(), inplace = True)
test['Fare'].fillna(test['Fare'].mean(), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Age'].fillna(test['Age'].mean(), inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Fare'].fillna(test['Fare'].mean(), inplace = True)


In [17]:
# testデータの'Sex'と'Embarked'も変換する
test['Sex'] = test['Sex'].map({'male' : 0, 'female' : 1})
test['Embarked'] = test['Embarked'].map({'C' : 0, 'Q' : 1, 'S' : 2})

In [18]:
# testデータの特徴量だけ取り出す
X_test = test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

# 予測する
pred_test = model.predict(X_test)

# X_testには、testデータから必要な特徴量だけを取り出すよ！
# そして、model.predict(X_test)で予測するよ！

In [20]:
# PassengerIDと予測結果をセットにする
submisson = pd.DataFrame({
    'PassengerId' : test['PassengerId'],
    'Suvived' : pred_test
})

# CSVに保存する
submisson.to_csv('submission.csv, index = False')

# index = False → 余計な番号列は付けないよ！！
# ファイル名はsubmisson.csvにしてるけど、好きな名前でもOK！！