diff --git a/Project.toml b/Project.toml
index cc5d8426..dea1e2f4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -33,7 +33,7 @@ OnlineStats = "1"
 OnlineStatsBase = "1"
 PooledArrays = "0.5.2,1"
 RecipesBase = "0.7,1"
-StatsBase = "0.32,0.33"
+StatsBase = "0.32,0.33,0.33.8"
 TextParse = "0.9.1,1"
 WeakRefStrings = "0.6"
 julia = "1"
diff --git a/docs/src/ml.md b/docs/src/ml.md
index 721c195a..27a08e30 100644
--- a/docs/src/ml.md
+++ b/docs/src/ml.md
@@ -10,7 +10,7 @@ using JuliaDB
 
 download("https://raw.githubusercontent.com/agconti/"*
          "kaggle-titanic/master/data/train.csv", "train.csv")
 
-train_table = loadtable("train.csv", escapechar='"')
+train_table = dropmissing(loadtable("train.csv", escapechar='"'))
 select(train_table, Not((:Name, :Ticket, :Cabin))) # hide
 ```
@@ -42,9 +42,6 @@ You may note that `Survived` column contains only 1s and 0s to denote whether a
 sch = ML.schema(train_table, hints=Dict(
         :Pclass => ML.Categorical,
         :Survived => ML.Categorical,
-        :Parch => nothing,
-        :SibSp => nothing,
-        :Fare => nothing,
     )
 )
 ```
@@ -62,11 +59,11 @@ input_sch, output_sch = ML.splitschema(sch, :Survived)
 Once the schema has been created, you can extract the feature matrix according to the given schema using `ML.featuremat`:
 
 ```@example titanic
-train_input = ML.featuremat(input_sch, train_table)
+train_input = ML.featuremat(input_sch, collect(train_table))
 ```
 
 ```@example titanic
-train_output = ML.featuremat(output_sch, train_table)
+train_output = ML.featuremat(output_sch, collect(train_table))
 ```
 
 ## Learning
@@ -78,22 +75,23 @@ Let us create a simple neural network to learn whether a passenger will survive
 ```@example titanic
 using Flux
 
+data = [(train_input, train_output)]
+
 model = Chain(
   Dense(ML.width(input_sch), 32, relu),
   Dense(32, ML.width(output_sch)),
   softmax)
 
 loss(x, y) = Flux.mse(model(x), y)
-opt = Flux.ADAM(Flux.params(model))
+opt = Flux.ADAM()
 evalcb = Flux.throttle(() -> @show(loss(first(data)...)), 2);
 ```
 
 Train the data in 10 iterations
 
 ```@example titanic
-data = [(train_input, train_output)]
 for i = 1:10
-  Flux.train!(loss, data, opt, cb = evalcb)
+  Flux.train!(loss, Flux.params(model), data, opt, cb = evalcb)
 end
 ```
 
@@ -108,9 +106,9 @@ Now let's load some testing data to use the model we learned to predict survival
 download("https://raw.githubusercontent.com/agconti/"*
          "kaggle-titanic/master/data/test.csv", "test.csv")
 
-test_table = loadtable("test.csv", escapechar='"')
+test_table = dropmissing(loadtable("test.csv", escapechar='"'))
 
-test_input = ML.featuremat(input_sch, test_table) ;
+test_input = ML.featuremat(input_sch, collect(test_table)) ;
 ```
 
 Run the model on one observation:
diff --git a/src/ml.jl b/src/ml.jl
index 0d2e9d06..c0f4df70 100644
--- a/src/ml.jl
+++ b/src/ml.jl
@@ -153,8 +153,8 @@ function featuremat!(A, schemas::Schema, t::Dataset)
 end
 
 splitschema(xs::Schema, ks...) =
-    filter((k,v) -> k ∉ ks, xs),
-    filter((k,v) -> k ∈ ks, xs)
+    filter(k -> k.first ∉ ks, xs),
+    filter(k -> k.first ∈ ks, xs)
 
 function featuremat(sch, xs)
     featuremat!(zeros(Float32, length(xs), width(sch)), sch, xs)'