In [1]:
using CSV
using FileIO, CSVFiles, DataFrames
using Statistics
using Dates

using ScikitLearn
using ScikitLearn.CrossValidation: train_test_split

# ランダムフォレスト
@sk_import ensemble: RandomForestClassifier

# 誤差
@sk_import metrics: (accuracy_score);

### 時間の計測スタート

In [2]:
start = Dates.now()

2020-08-08T16:35:02.641

### データの読込

In [3]:
train = DataFrame(load("train.csv"))
head(train)

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Unnamed: 0_level_1,Int64,Int64,Int64,String,String,Float64?,Int64,Int64,String,Float64,String,String
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,6,0,3,"Moran, Mr. James",male,missing,0,0,330877,8.4583,,Q


In [4]:
size(train)

(891, 12)

In [5]:
test = DataFrame(load("test.csv"))
head(test)

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Unnamed: 0_level_1,Int64,Int64,String,String,Float64?,Int64,Int64,String,Float64?,String,String
1,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
2,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
3,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
4,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
5,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
6,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S


In [6]:
size(test)

(418, 11)

### 統計量の表示

In [7]:
describe(train)

Unnamed: 0_level_0,variable,mean,min,median,max,nunique,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Union…,Union…,Type
1,PassengerId,446.0,1,446.0,891,,,Int64
2,Survived,0.383838,0,0.0,1,,,Int64
3,Pclass,2.30864,1,3.0,3,,,Int64
4,Name,,"Abbing, Mr. Anthony",,"van Melkebeke, Mr. Philemon",891.0,,String
5,Sex,,female,,male,2.0,,String
6,Age,29.6991,0.42,28.0,80.0,,177.0,"Union{Missing, Float64}"
7,SibSp,0.523008,0,0.0,8,,,Int64
8,Parch,0.381594,0,0.0,6,,,Int64
9,Ticket,,110152,,WE/P 5735,681.0,,String
10,Fare,32.2042,0.0,14.4542,512.329,,,Float64


In [8]:
describe(test)

Unnamed: 0_level_0,variable,mean,min,median,max,nunique,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Union…,Union…,Type
1,PassengerId,1100.5,892,1100.5,1309,,,Int64
2,Pclass,2.26555,1,3.0,3,,,Int64
3,Name,,"Abbott, Master. Eugene Joseph",,"van Billiard, Master. Walter John",418.0,,String
4,Sex,,female,,male,2.0,,String
5,Age,30.2726,0.17,27.0,76.0,,86.0,"Union{Missing, Float64}"
6,SibSp,0.447368,0,0.0,8,,,Int64
7,Parch,0.392344,0,0.0,9,,,Int64
8,Ticket,,110469,,W.E.P. 5734,363.0,,String
9,Fare,35.6272,0.0,14.4542,512.329,,1.0,"Union{Missing, Float64}"
10,Cabin,,,,G6,77.0,,String


### 欠損値の処理

In [9]:
# 欠損値のカウント
Dict(zip(names(train), sum.(eachcol(ismissing.(train)))))

Dict{String,Int64} with 12 entries:
  "Ticket"      => 0
  "Age"         => 177
  "PassengerId" => 0
  "Cabin"       => 0
  "Name"        => 0
  "Sex"         => 0
  "Pclass"      => 0
  "SibSp"       => 0
  "Survived"    => 0
  "Embarked"    => 0
  "Fare"        => 0
  "Parch"       => 0

In [10]:
Dict(zip(names(test), sum.(eachcol(ismissing.(test)))))

Dict{String,Int64} with 11 entries:
  "Ticket"      => 0
  "Age"         => 86
  "PassengerId" => 0
  "Cabin"       => 0
  "Name"        => 0
  "Sex"         => 0
  "Pclass"      => 0
  "SibSp"       => 0
  "Embarked"    => 0
  "Fare"        => 1
  "Parch"       => 0

In [11]:
# trainデータの Age の欠損値のある行を除く
train = train[ismissing.(train)[:, :"Age"] .== 0, :]
head(train)

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Unnamed: 0_level_1,Int64,Int64,Int64,String,String,Float64?,Int64,Int64,String,Float64,String,String
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S


In [12]:
# testデータの Age の欠損値を平均値に置き換える
test[:, :Age] = coalesce.(test[:, :Age], mean(test[ismissing.(test)[:, :Age] .== 0, :][:, :Age]))
head(test)

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Unnamed: 0_level_1,Int64,Int64,String,String,Float64?,Int64,Int64,String,Float64?,String,String
1,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
2,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
3,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
4,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
5,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
6,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S


In [13]:
# testデータのFareの欠損値を平均値に置き換える
test[:, :Fare] = coalesce.(test[:, :Fare], mean(test[ismissing.(test)[:, :Fare] .== 0, :][:, :Fare]))
head(test)

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Unnamed: 0_level_1,Int64,Int64,String,String,Float64?,Int64,Int64,String,Float64?,String,String
1,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
2,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
3,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
4,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
5,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
6,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S


### 相関係数の表示

In [28]:
[cor(train[:, i], train[:, "Survived"]) for i in ["PassengerId", "Survived", "Pclass", "Age", "SibSp", "Parch", "Fare"]]

7-element Array{Float64,1}:
  0.029340156947652508
  1.0
 -0.3596526820868427
 -0.07722109457217734
 -0.01735836047953392
  0.09331700774224347
  0.2681886168744784

### 特徴量の選定

In [15]:
x = Matrix(train[: , [:Pclass, :Age, :SibSp, :Parch, :Fare]])
x[1:6, 1:5]

6×5 Array{Union{Missing, Float64},2}:
 3.0  22.0  1.0  0.0   7.25
 1.0  38.0  1.0  0.0  71.2833
 3.0  26.0  0.0  0.0   7.925
 1.0  35.0  1.0  0.0  53.1
 3.0  35.0  0.0  0.0   8.05
 1.0  54.0  0.0  0.0  51.8625

In [16]:
y = train[:, :Survived]
y[1:6]

6-element Array{Int64,1}:
 0
 1
 1
 1
 0
 0

In [17]:
test_ft = Matrix(test[: , [:Pclass, :Age, :SibSp, :Parch, :Fare]])
test_ft[1:6, 1:5]

6×5 Array{Union{Missing, Float64},2}:
 3.0  34.5  0.0  0.0   7.8292
 3.0  47.0  1.0  0.0   7.0
 2.0  62.0  0.0  0.0   9.6875
 3.0  27.0  0.0  0.0   8.6625
 3.0  22.0  1.0  1.0  12.2875
 3.0  14.0  0.0  0.0   9.225

### データの分割

In [29]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0);

### 学習モデルの作成

In [19]:
forest = RandomForestClassifier(n_estimators=100, random_state=0)

PyObject RandomForestClassifier(random_state=42)

In [20]:
fit!(forest, x_train, y_train)

PyObject RandomForestClassifier(random_state=42)

In [21]:
y_pred = predict(forest, x_test)
y_pred[1:6]

6-element Array{Int64,1}:
 1
 0
 0
 0
 0
 1

### 誤差

In [22]:
accuracy_score(y_test, y_pred)

0.6976744186046512

### 予測データの作成

In [23]:
pred = predict(forest, test_ft)
pred[1:6]

6-element Array{Int64,1}:
 0
 0
 1
 0
 0
 0

In [24]:
submission = DataFrame(load("gender_submission.csv"))
submission[:Survived] = pred
submission[1:6, :]

Unnamed: 0_level_0,PassengerId,Survived
Unnamed: 0_level_1,Int64,Int64
1,892,0
2,893,0
3,894,1
4,895,0
5,896,0
6,897,0


### csvの出力

In [25]:
submission |> CSV.write("submission.csv", delim=',', writeheader=true)

"submission.csv"

### 経過時間

In [26]:
elapsed_time = Dates.now() - start
println(elapsed_time)

30270 milliseconds
