In [None]:
using CSV
using FileIO, CSVFiles, DataFrames
using Statistics
using Dates

using ScikitLearn
using ScikitLearn.CrossValidation: train_test_split

# ランダムフォレスト
@sk_import ensemble: RandomForestClassifier

# 誤差
@sk_import metrics: (accuracy_score);

### 時間の計測スタート

In [None]:
# Record the wall-clock start time; the last cell subtracts it from the current time.
start = now()

### データの読込

In [None]:
# Load train.csv into a DataFrame.
train = DataFrame(load("train.csv"))
# Preview the first six rows. `first(df, n)` is the supported replacement for
# `head`, which current DataFrames.jl releases no longer provide.
first(train, 6)

In [None]:
# (row count, column count) of the training table.
(nrow(train), ncol(train))

In [None]:
# Load test.csv into a DataFrame.
test = DataFrame(load("test.csv"))
# Preview the first six rows. `first(df, n)` is the supported replacement for
# `head`, which current DataFrames.jl releases no longer provide.
first(test, 6)

In [None]:
# (row count, column count) of the test table.
(nrow(test), ncol(test))

### 統計量の表示

In [None]:
# Per-column summary statistics of the training table.
train |> describe

In [None]:
# Per-column summary statistics of the test table.
test |> describe

### 欠損値の処理

In [None]:
# Number of missing values in each column of train, keyed by column name.
Dict(name => count(ismissing, col) for (name, col) in zip(names(train), eachcol(train)))

In [None]:
# Number of missing values in each column of test, keyed by column name.
Dict(name => count(ismissing, col) for (name, col) in zip(names(test), eachcol(test)))

In [None]:
# Drop the rows (not columns) of train whose Age is missing.
# `dropmissing` inspects only the Age column instead of building a missing-mask
# for the whole table, and it narrows Age to a non-missing element type.
train = dropmissing(train, :Age)
# Preview the first six remaining rows (`first` replaces the removed `head`).
first(train, 6)

In [None]:
# Replace the missing Age values in test with the mean of the observed ages.
# `skipmissing` takes the mean without building a missing-mask for the whole
# table; assigning through `!` swaps in the newly built column, so Age ends up
# with a non-missing element type.
test[!, :Age] = coalesce.(test.Age, mean(skipmissing(test.Age)))
# Preview the first six rows (`first` replaces the removed `head`).
first(test, 6)

In [None]:
# Replace the missing Fare values in test with the mean of the observed fares.
# `skipmissing` takes the mean without building a missing-mask for the whole
# table; assigning through `!` swaps in the newly built column, so Fare ends up
# with a non-missing element type.
test[!, :Fare] = coalesce.(test.Fare, mean(skipmissing(test.Fare)))
# Preview the first six rows (`first` replaces the removed `head`).
first(test, 6)

### 相関係数

In [None]:
# Correlation of each selected column (addressed by position) with Survived.
# NOTE(review): positions 1, 2, 3, 6, 7, 8, 10 are presumably the numeric
# columns of train — confirm against the CSV header.
map([1, 2, 3, 6, 7, 8, 10]) do col
    cor(train[:, col], train[:, "Survived"])
end

### 特徴量の選定

In [None]:
# Feature matrix built from the five predictor columns of train.
x = train[:, [:Pclass, :Age, :SibSp, :Parch, :Fare]] |> Matrix
# Preview the first six rows across all five feature columns.
x[1:6, :]

In [None]:
# Target vector: an independent copy of the Survived column.
y = copy(train.Survived)
# Preview the first six labels.
y[1:6]

In [None]:
# Feature matrix for the test table, using the same five columns as `x`.
test_ft = test[:, [:Pclass, :Age, :SibSp, :Parch, :Fare]] |> Matrix
# Preview the first six rows across all five feature columns.
test_ft[1:6, :]

### データの分割

In [None]:
# Split x/y into training and evaluation parts: 30% held out, stratified on y,
# with a fixed random_state. The trailing `;` suppresses the cell output.
x_train, x_test, y_train, y_test = train_test_split(
    x, y;
    test_size=0.3,
    random_state=0,
    stratify=y,
);

### 学習モデルの作成

In [None]:
# Random forest classifier configured with 100 estimators and a fixed random_state.
forest = RandomForestClassifier(;
    n_estimators=100,
    random_state=42,
)

In [None]:
# Train the forest in place on the training split.
fit!(forest, x_train, y_train)

In [None]:
# Predict labels for the held-out split with the fitted forest.
y_pred = predict(forest, x_test)
# Preview the first six predictions.
y_pred[1:6]

### 誤差

In [None]:
# Accuracy of the predictions against the held-out labels.
accuracy_score(y_test, y_pred)

### 予測データの作成

In [None]:
# Predict labels for the test-table feature matrix with the fitted forest.
pred = predict(forest, test_ft)
# Preview the first six predictions.
pred[1:6]

In [None]:
# Load gender_submission.csv as the template for the output table.
submission = DataFrame(load("gender_submission.csv"))
# Overwrite its Survived column with the model's predictions.
# `df[!, col] = v` is the supported column-assignment form; the single-index
# `df[:col] = v` form is rejected by current DataFrames.jl releases.
submission[!, :Survived] = pred
# Preview the first six rows.
submission[1:6, :]

### csvの出力

In [None]:
# Write the submission table to submission.csv, comma-delimited, with a header row.
CSV.write("submission.csv", submission; delim=',', writeheader=true)

### 経過時間

In [None]:
# Elapsed wall-clock time since `start` was recorded.
now() - start