In [1]:
using DataFrames
using CSV
ENV["COLUMNS"] = 1000
using CategoricalArrays


using Pipe: @pipe
using Setfield

## Initial steps

In [2]:
df = CSV.File("data/train.csv") |> DataFrame

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Unnamed: 0_level_1,Int64,Int64,Int64,String,String7,Float64?,Int64,Int64,String31,Float64,String15?,String1?
1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,missing,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,missing,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,missing,S
6,6,0,3,"Moran, Mr. James",male,missing,0,0,330877,8.4583,missing,Q
7,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,missing,S
9,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,missing,S
10,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,missing,C


In [3]:
names(df) |> show
describe(df) 

["PassengerId", "Survived", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked"]

Unnamed: 0_level_0,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,Type
1,PassengerId,446.0,1,446.0,891,0,Int64
2,Survived,0.383838,0,0.0,1,0,Int64
3,Pclass,2.30864,1,3.0,3,0,Int64
4,Name,,"Abbing, Mr. Anthony",,"van Melkebeke, Mr. Philemon",0,String
5,Sex,,female,,male,0,String7
6,Age,29.6991,0.42,28.0,80.0,177,"Union{Missing, Float64}"
7,SibSp,0.523008,0,0.0,8,0,Int64
8,Parch,0.381594,0,0.0,6,0,Int64
9,Ticket,,110152,,WE/P 5735,0,String31
10,Fare,32.2042,0.0,14.4542,512.329,0,Float64


In [4]:
df[1:5,[:Survived,:Name]]

Unnamed: 0_level_0,Survived,Name
Unnamed: 0_level_1,Int64,String
1,0,"Braund, Mr. Owen Harris"
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)"
3,1,"Heikkinen, Miss. Laina"
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
5,0,"Allen, Mr. William Henry"


In [5]:
# List of recorded sexes
levels(df[!,:Sex])

2-element Vector{String7}:
 "female"
 "male"

In [6]:
# Convert Survived column to boolean
df[!,:Survived] = convert.(Bool,df[!,:Survived])

# Convert Sex column to boolean
if columnindex(df,:Sex) != 0
    (df[!,:Sex] = df[!,:Sex] .== "male")
    rename!(df,:Sex => :Male)
end 
df.Embarked = categorical(df.Embarked)

first(df,5)

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Male,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Unnamed: 0_level_1,Int64,Bool,Int64,String,Bool,Float64?,Int64,Int64,String31,Float64,String15?,Cat…?
1,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,missing,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",0,38.0,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,missing,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,missing,S


In [7]:
first(df,5)

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Male,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Unnamed: 0_level_1,Int64,Bool,Int64,String,Bool,Float64?,Int64,Int64,String31,Float64,String15?,Cat…?
1,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,missing,S
2,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",0,38.0,1,0,PC 17599,71.2833,C85,C
3,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,missing,S
4,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
5,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,missing,S


### Missing value statistics

In [8]:
# Get missing stats
function show_missing(df)
    emptycols = count.(ismissing, eachcol(df))
    emptyperc = round.(emptycols ./ fill(size(df)...), digits = 3)
    sort(DataFrame(Column = names(df), Total = emptycols, Percent = emptyperc), :Total, rev = true)
end
show_missing(df)

Unnamed: 0_level_0,Column,Total,Percent
Unnamed: 0_level_1,String,Int64,Float64
1,Cabin,687,0.771
2,Age,177,0.199
3,Embarked,2,0.002
4,PassengerId,0,0.0
5,Survived,0,0.0
6,Pclass,0,0.0
7,Name,0,0.0
8,Male,0,0.0
9,SibSp,0,0.0
10,Parch,0,0.0


In [9]:
# TODO is it useful??
colwise(ismissing,df)
# count.(ismissing, eachcol(df))

LoadError: UndefVarError: colwise not defined

## Check cabins' information quality

In [10]:
cabin = @pipe dropmissing(df[:,[:Survived,:Cabin]]) |> sort(_,rev=true)
show(cabin, allrows=true)
cabin.Cabin = getindex.(cabin[:,:Cabin],1)

[1m204×2 DataFrame[0m
[1m Row [0m│[1m Survived [0m[1m Cabin           [0m
[1m     [0m│[90m Bool     [0m[90m String15        [0m
─────┼───────────────────────────
   1 │     true  G6
   2 │     true  G6
   3 │     true  F4
   4 │     true  F4
   5 │     true  F33
   6 │     true  F33
   7 │     true  F33
   8 │     true  F2
   9 │     true  F2
  10 │     true  F E69
  11 │     true  E8
  12 │     true  E8
  13 │     true  E68
  14 │     true  E67
  15 │     true  E50
  16 │     true  E49
  17 │     true  E44
  18 │     true  E40
  19 │     true  E36
  20 │     true  E34
  21 │     true  E33
  22 │     true  E33
  23 │     true  E25
  24 │     true  E25
  25 │     true  E24
  26 │     true  E24
  27 │     true  E17
  28 │     true  E121
  29 │     true  E121
  30 │     true  E12
  31 │     true  E101
  32 │     true  E101
  33 │     true  E101
  34 │     true  E10
  35 │     true  D9
  36 │     true  D7
  37 │     true  D56
  38 │     true  D49
  39 │     true  D47
  40 │  

204-element Vector{Char}:
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'G': ASCII/Unicode U+0047 (category Lu: Letter, uppercase)
 'F': ASCII/Unicode U+0046 (category Lu: Letter, uppercase)
 'F': ASCII/Unicode U+0046 (category Lu: Letter, uppercase)
 'F': ASCII/Unicode U+0046 (category Lu: Letter, uppercase)
 'F': ASCII/Unicode U+0046 (category Lu: Letter, uppercase)
 'F': ASCII/Unicode U+0046 (category Lu: Letter, uppercase)
 'F': ASCII/Unicode U+0046 (category Lu: Letter, uppercase)
 'F': ASCII/Unicode U+0046 (category Lu: Letter, uppercase)
 'F': ASCII/Unicode U+0046 (category Lu: Letter, uppercase)
 'E': ASCII/Unicode U+0045 (category Lu: Letter, uppercase)
 'E': ASCII/Unicode U+0045 (category Lu: Letter, uppercase)
 'E': ASCII/Unicode U+0045 (category Lu: Letter, uppercase)
 ⋮
 'B': ASCII/Unicode U+0042 (category Lu: Letter, uppercase)
 'B': ASCII/Unicode U+0042 (category Lu: Letter, uppercase)
 'B': ASCII/Unicode U+0042 (category Lu: Letter, uppercase)
 'B': ASCII

In [11]:
@pipe groupby(cabin, :Cabin) |>
      combine(_,
          :Survived => sum => :Survived,
          :Survived => (s -> count(s .== false)) => :Died,
          nrow,
          :Survived => (s -> round(sum(s) / count(s .== false), digits = 2)) => :Ratio
      ) |> sort(_, :Ratio, rev = true)

Unnamed: 0_level_0,Cabin,Survived,Died,nrow,Ratio
Unnamed: 0_level_1,Char,Int64,Int64,Int64,Float64
1,D,25,8,33,3.12
2,E,24,8,32,3.0
3,B,35,12,47,2.92
4,F,8,5,13,1.6
5,C,35,24,59,1.46
6,G,2,2,4,1.0
7,A,7,8,15,0.88
8,T,0,1,1,0.0


### Preping data for first exploratory model

In [12]:
df.FamilySize = df.SibSp .+ df.Parch
family_df = df[:, Not([:SibSp,:Parch])]
first(family_df)

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Male,Age,Ticket,Fare,Cabin,Embarked,FamilySize
Unnamed: 0_level_1,Int64,Bool,Int64,String,Bool,Float64?,String31,Float64,String15?,Cat…?,Int64
1,1,0,3,"Braund, Mr. Owen Harris",1,22.0,A/5 21171,7.25,missing,S,1


In [91]:
sub_df = df[:, Not([:SibSp,:Parch,:Name,:Ticket,:Cabin,:PassengerId])]
if columnindex(df,:Age) != 0
    sub_df.Age = coalesce.(sub_df.Age, 99.5)
end 
first(sub_df,5) 


Unnamed: 0_level_0,Survived,Pclass,Male,Age,Fare,Embarked,FamilySize
Unnamed: 0_level_1,Bool,Int64,Bool,Float64,Float64,Cat…?,Int64
1,0,3,1,22.0,7.25,S,1
2,1,1,0,38.0,71.2833,C,1
3,1,3,0,26.0,7.925,S,0
4,1,1,0,35.0,53.1,S,1
5,0,3,1,35.0,8.05,S,0


In [14]:
dropmissing!(sub_df)
show_missing(sub_df)

Unnamed: 0_level_0,Column,Total,Percent
Unnamed: 0_level_1,String,Int64,Float64
1,Survived,0,0.0
2,Pclass,0,0.0
3,Male,0,0.0
4,Age,0,0.0
5,Fare,0,0.0
6,Embarked,0,0.0
7,FamilySize,0,0.0


### Rudimental Regression

In [34]:
using GLM
using StatsModels

In [64]:
using Lathe.preprocess: TrainTestSplit
train, test = TrainTestSplit(sub_df, .75);

In [65]:
# model = lm(@formula(Survived ~ Pclass + Male + Age + Fare + Embarked + FamilySize),sub_df)
# fit(LinearModel, @formula(Survived ~ Pclass + Male + Age + Fare + Embarked + FamilySize),sub_df)
logit = glm(@formula(Survived ~ Pclass + Male + Age + Fare + Embarked ), train, Binomial(), LogitLink())

StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Binomial{Float64}, LogitLink}, GLM.DensePredChol{Float64, Cholesky{Float64, Matrix{Float64}}}}, Matrix{Float64}}

Survived ~ 1 + Pclass + Male + Age + Fare + Embarked

Coefficients:
────────────────────────────────────────────────────────────────────────────────
                    Coef.  Std. Error       z  Pr(>|z|)   Lower 95%    Upper 95%
────────────────────────────────────────────────────────────────────────────────
(Intercept)   4.18229      0.5308        7.88    <1e-14   3.14195     5.22264
Pclass       -0.948593     0.147109     -6.45    <1e-09  -1.23692    -0.660264
Male         -2.53035      0.212685    -11.90    <1e-31  -2.9472     -2.11349
Age          -0.0109766    0.00366206   -3.00    0.0027  -0.0181541  -0.00379908
Fare          0.000452993  0.00233208    0.19    0.8460  -0.0041178   0.00502379
Embarked: Q   0.408517     0.429413      0.95    0.3414  -0.433117    1.25015
Embarked: S  -0

In [66]:
predict(logit,test)

223-element Vector{Union{Missing, Float64}}:
 0.10346871774204507
 0.867824072822779
 0.09256337439464964
 0.6066453533232123
 0.6034495004831595
 0.7663714744994469
 0.04699462736878972
 0.10451469516059732
 0.504487937743508
 0.40702950573168656
 0.09286901249401301
 0.776313758741114
 0.10660269140124018
 ⋮
  missing
 0.04699462736878972
 0.10555182386561965
 0.10873374168832396
 0.12446592654600636
 0.061237155482882714
 0.7763691531764296
 0.047024053434343085
 0.09949236965722393
 0.40520135505905536
 0.9344216592620453
 0.09281704730030903

In [67]:
train, test = TrainTestSplit(sub_df, .75);
mm = glm(@formula(Survived ~ Pclass    + Fare ), train, Binomial(), LogitLink());
mmage = glm(@formula(Survived ~ Pclass + Age  + Fare ), train, Binomial(), LogitLink());

pred = predict(model,test);

In [68]:
first(train,5)

Unnamed: 0_level_0,Survived,Pclass,Male,Age,Fare,Embarked,FamilySize
Unnamed: 0_level_1,Bool,Int64,Bool,Float64,Float64,Cat…?,Int64
1,0,3,1,22.0,7.25,S,1
2,1,1,0,38.0,71.2833,C,1
3,1,3,0,26.0,7.925,S,0
4,1,1,0,35.0,53.1,S,1
5,0,3,1,99.5,8.4583,Q,0


In [69]:
a = DataFrame(Ref=test[:,:Survived],Pred= round.(pred))
all = nrow(a)
good = count(a.Ref .== a.Pred)
good/all

0.6837209302325581

# DEBUG

In [47]:
using Tables
using StatsModels:missing_omit,modelcols
data = test
f1 = mm.mf.f 

f1.rhs


1Pclass(continuous)Fare(continuous)

In [56]:
cols1, _ = missing_omit(columntable(data), f1.rhs)
new_x = modelcols(f1.rhs, cols1)
x1 = reshape(new_x, size(new_x, 1), :)
show(x1)

[1.0 3.0 21.075; 1.0 2.0 30.0708; 1.0 3.0 31.275; 1.0 3.0 7.8542; 1.0 2.0 13.0; 1.0 2.0 26.0; 1.0 3.0 8.0292; 1.0 1.0 263.0; 1.0 3.0 7.8958; 1.0 3.0 7.75; 1.0 1.0 52.0; 1.0 3.0 7.2292; 1.0 3.0 8.05; 1.0 2.0 21.0; 1.0 3.0 7.75; 1.0 3.0 17.8; 1.0 2.0 26.0; 1.0 1.0 61.9792; 1.0 2.0 10.5; 1.0 2.0 27.75; 1.0 3.0 27.9; 1.0 3.0 7.925; 1.0 3.0 8.05; 1.0 2.0 29.0; 1.0 3.0 12.475; 1.0 3.0 9.0; 1.0 1.0 47.1; 1.0 2.0 10.5; 1.0 3.0 34.375; 1.0 3.0 20.575; 1.0 3.0 8.05; 1.0 1.0 34.6542; 1.0 3.0 7.925; 1.0 3.0 7.775; 1.0 3.0 14.4583; 1.0 3.0 7.75; 1.0 2.0 21.0; 1.0 3.0 31.275; 1.0 1.0 77.2875; 1.0 3.0 7.75; 1.0 3.0 7.05; 1.0 3.0 14.5; 1.0 2.0 15.0458; 1.0 3.0 9.2167; 1.0 2.0 36.75; 1.0 2.0 12.525; 1.0 3.0 8.05; 1.0 3.0 7.3125; 1.0 3.0 8.6625; 1.0 1.0 33.5; 1.0 3.0 7.8542; 1.0 2.0 13.0; 1.0 3.0 69.55; 1.0 3.0 22.025; 1.0 2.0 13.0; 1.0 1.0 146.5208; 1.0 3.0 7.75; 1.0 3.0 7.75; 1.0 3.0 7.225; 1.0 3.0 7.05; 1.0 3.0 7.25; 1.0 3.0 7.75; 1.0 1.0 76.2917; 1.0 2.0 13.0; 1.0 3.0 8.05; 1.0 1.0 90.0; 1.0 3.0 9.3

In [57]:
f2 = mmage.mf.f 
cols2, _ = StatsModels.missing_omit(columntable(data), f2.rhs)
new_x = modelcols(f2.rhs, cols2)
x2 = reshape(new_x, size(new_x, 1), :)
show(x2)

Real[1.0 3 2.0 21.075; 1.0 2 14.0 30.0708; 1.0 3 39.0 31.275; 1.0 3 14.0 7.8542; 1.0 2 100 13.0; 1.0 2 35.0 26.0; 1.0 3 15.0 8.0292; 1.0 1 19.0 263.0; 1.0 3 100 7.8958; 1.0 3 100 7.75; 1.0 1 42.0 52.0; 1.0 3 100 7.2292; 1.0 3 21.0 8.05; 1.0 2 27.0 21.0; 1.0 3 100 7.75; 1.0 3 18.0 17.8; 1.0 2 29.0 26.0; 1.0 1 65.0 61.9792; 1.0 2 21.0 10.5; 1.0 2 5.0 27.75; 1.0 3 4.0 27.9; 1.0 3 17.0 7.925; 1.0 3 100 8.05; 1.0 2 0.83 29.0; 1.0 3 30.0 12.475; 1.0 3 22.0 9.0; 1.0 1 28.0 47.1; 1.0 2 17.0 10.5; 1.0 3 16.0 34.375; 1.0 3 26.0 20.575; 1.0 3 100 8.05; 1.0 1 71.0 34.6542; 1.0 3 37.0 7.925; 1.0 3 100 7.775; 1.0 3 17.0 14.4583; 1.0 3 70.5 7.75; 1.0 2 29.0 21.0; 1.0 3 2.0 31.275; 1.0 1 54.0 77.2875; 1.0 3 100 7.75; 1.0 3 20.0 7.05; 1.0 3 47.0 14.5; 1.0 2 23.0 15.0458; 1.0 3 16.0 9.2167; 1.0 2 19.0 36.75; 1.0 2 51.0 12.525; 1.0 3 55.5 8.05; 1.0 3 100 7.3125; 1.0 3 100 8.6625; 1.0 1 61.0 33.5; 1.0 3 18.0 7.8542; 1.0 2 30.0 13.0; 1.0 3 100 69.55; 1.0 3 4.0 22.025; 1.0 2 19.0 13.0; 1.0 1 58.0 146.5208; 

In [81]:
typeof(sub_df.Age)

Vector{Real} (alias for Array{Real, 1})

In [82]:
sub_df.Age::Vector{Float64} = coalesce.(sub_df.Age, 100)


LoadError: TypeError: in typeassert, expected Vector{Float64}, got a value of type Vector{Real}