In [34]:
# 決定木以外の方法でモデルを作ってみる

import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the training and test sets
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
datasets = (train, test)

# Column dtypes of each set
for frame in datasets:
    display(frame.dtypes)

# Summary statistics of the numeric columns
for frame in datasets:
    display(frame.describe())

# Number of missing values per column
kesson_train, kesson_test = (frame.isnull().sum() for frame in datasets)
for missing_counts in (kesson_train, kesson_test):
    display(missing_counts)



PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [35]:
# Encode the categorical string columns as integers.
# Codes per column; the values match the ones this notebook has always used.
CATEGORY_CODES = {
    "Sex": {"male": 0, "female": 1},
    "Embarked": {"S": 0, "C": 1, "Q": 2},
}


def encode_categoricals(df):
    """Replace the Sex / Embarked labels of ``df`` with integer codes, in place.

    Values that are not keys of the lookup table (missing values, or codes
    produced by an earlier run of this cell) are passed through unchanged,
    so re-running the cell does not wipe the columns.
    """
    for column, codes in CATEGORY_CODES.items():
        df[column] = df[column].map(lambda value, codes=codes: codes.get(value, value))


for frame in (train, test):
    encode_categoricals(frame)

# train.csv has rows with no Embarked value (see the missing-value counts
# displayed above). Fill them with the most frequent port of the training set
# so the column is fully numeric before it is used as a model feature.
most_common_port = train["Embarked"].mode().iloc[0]
for frame in (train, test):
    frame["Embarked"] = frame["Embarked"].fillna(most_common_port).astype(int)
    # Fails loudly if an unexpected label was left unencoded.
    frame["Sex"] = frame["Sex"].astype(int)

In [36]:
# Fill in the missing values of train.csv.
# Build a regression model that predicts Age from the other columns.
target = 'Age'
FEATURE_COLS = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Sex']
features = FEATURE_COLS

# Rows with a known age are used for fitting; rows without one get predictions
age_train_data = train[train[target].notnull()].copy()
age_test_data = train[train[target].isnull()].copy()

# Hold out 20% of the rows with a known age as a validation set
random_seed = 228
X_train, X_val, y_train, y_val = train_test_split(
    age_train_data[features],
    age_train_data[target],
    test_size=0.2,
    random_state=random_seed
)

# Features of the rows whose age has to be predicted
X_age_test = age_test_data[features]

# Hyper-parameter grid for the search
parameters = {
    "n_estimators" : [2, 5, 10, 15, 20, 30, 50, 75, 100],
    "criterion" : ["squared_error"],
    "min_samples_split" : [2, 3, 5, 10],
    "max_depth" : [2, 3, 5, 10],
    "max_features" : [1, 2, 3, 4, 5],
    "random_state" : [random_seed],
    "verbose" : [False],
}

# Grid-search a random forest regressor. The search object gets its own name
# so that `age_model` only ever refers to the fitted estimator.
# n_jobs=-1 runs the candidate fits in parallel; the selected model is the
# same because every candidate uses the fixed random_state above.
age_search = GridSearchCV(RandomForestRegressor(), parameters, cv=3, n_jobs=-1)
age_search.fit(X_train, y_train)
age_model = age_search.best_estimator_

# Score on the validation set (for a regressor, `score` is the R^2 value)
val_score = age_model.score(X_val, y_val)
print(f"Age予測モデルの検証データに対する精度: {val_score}")

# Fill the missing ages with the predictions. The guard keeps the cell
# re-runnable: once the ages are filled there are no rows left to predict,
# and calling predict on an empty frame raises an error.
if len(X_age_test) > 0:
    predicted_train_ages = age_model.predict(X_age_test)
    train.loc[age_test_data.index, 'Age'] = predicted_train_ages
else:
    predicted_train_ages = np.empty(0)

Age予測モデルの検証データに対する精度: 0.2576451600482287


In [15]:
# Fill in the missing values of test.csv.
# Fill the missing Fare with the median. fillna returns a new Series, so the
# result has to be assigned back for the fill to take effect.
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

# Fill the missing Age values with the age model fitted on train.csv.
# FEATURE_COLS is used instead of `features` because `features` is rebound to
# the Survived feature list further down, which would break a re-run of this cell.
test_age_test_data = test[test['Age'].isnull()].copy()
X_test_age_test = test_age_test_data[FEATURE_COLS]

# Skip the prediction when nothing is missing (e.g. when the cell is re-run);
# calling predict on an empty frame raises an error.
if len(X_test_age_test) > 0:
    predicted_test_ages = age_model.predict(X_test_age_test)
    test.loc[test_age_test_data.index, 'Age'] = predicted_test_ages
else:
    predicted_test_ages = np.empty(0)

In [17]:
# Use Survived as the target variable and the remaining columns as features.
# Note: this rebinds `target` / `features` / `X_train` / `y_train`, which the
# age-imputation cell above also defines.
target = 'Survived'
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

X_train = train[features]
y_train = train[target]
X_test = test[features]

# Fit a random forest classifier (RandomForestClassifier comes from the import
# cell at the top of the notebook) and predict survival for the test set.
# The seed reuses `random_seed` from the age-imputation cell instead of
# repeating the literal value.
model = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=random_seed)
model.fit(X_train, y_train)
predicted = model.predict(X_test)

# Save the result: one row per passenger, indexed by PassengerId
PassengerId = np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(data=predicted, index=PassengerId, columns=["Survived"])
my_solution.to_csv("titanic_RandomForest_solution.csv", index_label="PassengerId")