
Commit

Merge 7f48778 into 24ddd3c
MilesCranmer committed Mar 20, 2023
2 parents 24ddd3c + 7f48778 commit 0e931a6
Showing 27 changed files with 581 additions and 240 deletions.
4 changes: 2 additions & 2 deletions Project.toml
@@ -1,7 +1,7 @@
name = "SymbolicRegression"
uuid = "8254be44-1295-4e6a-a16d-46603ac705cb"
authors = ["MilesCranmer <miles.cranmer@gmail.com>"]
version = "0.15.3"
version = "0.16.0"

[deps]
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
@@ -23,7 +23,7 @@ SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b"
TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"

[compat]
DynamicExpressions = "0.4.2"
DynamicExpressions = "0.5"
JSON3 = "1"
LineSearches = "7"
LossFunctions = "0.6, 0.7, 0.8"
21 changes: 18 additions & 3 deletions docs/make.jl
@@ -2,8 +2,12 @@ using Documenter
using SymbolicRegression
using SymbolicRegression: Dataset, update_baseline_loss!

DocMeta.setdocmeta!(SymbolicRegression, :DocTestSetup, :(using LossFunctions); recursive=true)
DocMeta.setdocmeta!(SymbolicRegression, :DocTestSetup, :(using DynamicExpressions); recursive=true)
DocMeta.setdocmeta!(
SymbolicRegression, :DocTestSetup, :(using LossFunctions); recursive=true
)
DocMeta.setdocmeta!(
SymbolicRegression, :DocTestSetup, :(using DynamicExpressions); recursive=true
)

readme = open(dirname(@__FILE__) * "/../README.md") do io
read(io, String)
@@ -17,7 +21,10 @@ readme = replace(readme, r"<[/]?div.*" => s"")

# Then, we surround ```mermaid\n...\n``` snippets
# with ```@raw html\n<div class="mermaid">\n...\n</div>```:
readme = replace(readme, r"```mermaid([^`]*)```" => s"```@raw html\n<div class=\"mermaid\">\n\1\n</div>\n```")
readme = replace(
readme,
r"```mermaid([^`]*)```" => s"```@raw html\n<div class=\"mermaid\">\n\1\n</div>\n```",
)

# Then, we init mermaid.js:
init_mermaid = """
@@ -52,6 +59,14 @@ makedocs(;
format=Documenter.HTML(;
canonical="https://astroautomata.com/SymbolicRegression.jl/stable"
),
pages=[
"Contents" => "index_base.md",
"Home" => "index.md",
"Examples" => "examples.md",
"API" => "api.md",
"Losses" => "losses.md",
"Types" => "types.md",
],
)

# Next, we fix links in the docs/build/losses/index.html file:
13 changes: 7 additions & 6 deletions docs/src/api.md
@@ -10,8 +10,9 @@ EquationSearch(X::AbstractMatrix{T}, y::AbstractMatrix{T};
options::Options=Options(),
numprocs::Union{Int, Nothing}=nothing,
procs::Union{Array{Int, 1}, Nothing}=nothing,
runtests::Bool=true
) where {T<:Real}
runtests::Bool=true,
loss_type::Type=Nothing,
) where {T<:DATA_TYPE}
```
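
(Aside, not part of this diff: a minimal sketch of calling the signature above; the data, operator set, and keyword values are illustrative only.)

```julia
# Minimal sketch; all values are illustrative, not taken from this commit.
using SymbolicRegression

X = randn(5, 100)
y = 2 .* X[1, :] .+ X[2, :] .^ 2
options = Options(; binary_operators=[+, -, *], unary_operators=[cos])
hof = EquationSearch(X, y; options=options, niterations=5, runtests=true)
```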

## Options
@@ -59,8 +60,8 @@ node_to_symbolic(tree::Node, options::Options;

```@docs
calculate_pareto_frontier(X::AbstractMatrix{T}, y::AbstractVector{T},
hallOfFame::HallOfFame{T}, options::Options;
weights=nothing, varMap=nothing) where {T<:Real}
calculate_pareto_frontier(dataset::Dataset{T}, hallOfFame::HallOfFame{T},
options::Options) where {T<:Real}
hallOfFame::HallOfFame{T,L}, options::Options;
weights=nothing, varMap=nothing) where {T<:DATA_TYPE,L<:LOSS_TYPE}
calculate_pareto_frontier(dataset::Dataset{T,L}, hallOfFame::HallOfFame{T,L},
options::Options) where {T<:DATA_TYPE,L<:LOSS_TYPE}
```
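
(Aside, not part of this diff: the first form above is the one used by the `get_best` helper on the new examples page; a sketch, assuming `X`, `y`, `hof`, and `options` come from a finished `EquationSearch` run.)

```julia
# Sketch: pull out the dominating Pareto frontier of a finished search.
dominating = calculate_pareto_frontier(X, y, hof, options)
for member in dominating
    println(compute_complexity(member.tree, options), "\t", member.loss, "\t", member.tree)
end
```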
158 changes: 158 additions & 0 deletions docs/src/examples.md
@@ -0,0 +1,158 @@
# Toy Examples with Code

## Preamble

```julia
using SymbolicRegression
using DataFrames
```

We'll also code up a simple function to print a single expression:

```julia
function get_best(; X, y, hof::HallOfFame{T,L}, options) where {T,L}
dominating = calculate_pareto_frontier(X, y, hof, options)

df = DataFrame(;
tree=[m.tree for m in dominating],
loss=[m.loss for m in dominating],
complexity=[compute_complexity(m.tree, options) for m in dominating],
)

df[!, :score] = vcat(
[L(0.0)],
-1 .* log.(df.loss[2:end] ./ df.loss[1:(end - 1)]) ./
(df.complexity[2:end] .- df.complexity[1:(end - 1)]),
)

min_loss = min(df.loss...)

best_idx = argmax(df.score .* (df.loss .<= (2 * min_loss)))

return df.tree[best_idx], df
end
```

## 1. Simple search

Here's a simple example where we
find the expression `2 cos(x3) + x0^2 - 2`.

```julia
X = 2randn(5, 1000)
y = @. 2*cos(X[4, :]) + X[1, :]^2 - 2

options = Options(; binary_operators=[+, -, *, /], unary_operators=[cos])
hof = EquationSearch(X, y; options=options, niterations=30)
```

Let's look at the most accurate expression:

```julia
best, df = get_best(; X, y, hof, options)
println(best)
```

## 2. Custom operator

Here, we define a custom operator and use it to find an expression:

```julia
X = 2randn(5, 1000)
y = @. 1/X[1, :]

options = Options(; binary_operators=[+, *], unary_operators=[inv])
hof = EquationSearch(X, y; options=options)
println(get_best(; X, y, hof, options)[1])
```
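
(Aside, not shown in this commit: a truly user-defined operator is passed the same way, as long as it is defined before `Options` is constructed; the name `square` below is purely illustrative.)

```julia
# Hypothetical scalar operator; it takes and returns a scalar of the data's element type.
square(x) = x * x

options = Options(; binary_operators=[+, *], unary_operators=[square])
hof = EquationSearch(X, y; options=options)
```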

## 3. Multiple outputs

Here, we do the same thing, but with multiple expressions at once,
each requiring a different feature.

```julia
X = 2rand(5, 1000) .+ 0.1
y = @. 1/X[1:3, :]
options = Options(; binary_operators=[+, *], unary_operators=[inv])
hofs = EquationSearch(X, y; options=options)
bests = [get_best(; X, y=y[i, :], hof=hofs[i], options)[1] for i=1:3]
println(bests)
```

## 4. Plotting an expression

For now, let's consider the expressions for output 1.
We can see the SymbolicUtils version with:

```julia
eqn = node_to_symbolic(bests[1], options)
```

We can get the LaTeX version with:

```julia
using Latexify
latexify(string(eqn))
```

Let's plot the prediction against the truth:

```julia
using Plots

scatter(y[1, :], bests[1](X), xlabel="Truth", ylabel="Prediction")
```

Here, we have used the convenience function `(::Node{T})(X)` to evaluate
the expression. However, this will only work because calling `Options()`
will automatically set up calls to `eval_tree_array`. In practice, you should
use the `eval_tree_array` function directly, which has the form:

```julia
eval_tree_array(bests[1], X, options)
```
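
(Aside, not part of this diff: to my understanding, `eval_tree_array` returns both the predictions and a flag for whether evaluation finished without numerical problems; a hedged sketch.)

```julia
# Sketch, assuming a (predictions, completed_ok) return convention.
output, completed_ok = eval_tree_array(bests[1], X, options)
completed_ok || @warn "Evaluation hit a NaN/Inf and stopped early"
```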

## 5. Other types

SymbolicRegression.jl can handle most numeric types you wish to use.
For example, passing a `Float32` array will result in the search using
32-bit precision everywhere in the codebase:

```julia
X = 2randn(Float32, 5, 1000)
y = @. 2*cos(X[4, :]) + X[1, :]^2 - 2

options = Options(; binary_operators=[+, -, *, /], unary_operators=[cos])
hof = EquationSearch(X, y; options=options, niterations=30)
```

We can see that the output types are `Float32`:

```julia
best, df = get_best(; X, y, hof, options)
println(typeof(best))
# Node{Float32}
```

We can also use `Complex` numbers:

```julia
cos_re(x::Complex{T}) where {T} = cos(abs(x)) + 0im

X = 15 .* rand(ComplexF64, 5, 1000) .- 7.5
y = @. 2*cos_re((2+1im) * X[4, :]) + 0.1 * X[1, :]^2 - 2

options = Options(; binary_operators=[+, -, *, /], unary_operators=[cos_re], maxsize=30)
hof = EquationSearch(X, y; options=options, niterations=100)
```

## 6. Additional features

For the many other features available in SymbolicRegression.jl,
check out the API page for `Options`. You might also find it useful
to browse the documentation for the Python frontend,
[PySR](http://astroautomata.com/PySR).
In particular, the [tuning page](http://astroautomata.com/PySR/tuning)
is useful for improving search performance.

2 changes: 1 addition & 1 deletion docs/src/index_base.md
@@ -2,5 +2,5 @@
# Contents

```@contents
Pages = ["api.md", "types.md", "losses.md"]
Pages = ["examples.md", "api.md", "types.md", "losses.md"]
```
2 changes: 1 addition & 1 deletion docs/src/losses.md
@@ -11,7 +11,7 @@ two (unweighted) or three (weighted) scalar arguments. For example,

```
f(x, y, w) = abs(x-y)*w
options = Options(loss=f)
options = Options(elementwise_loss=f)
```
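
(Aside, not part of this diff: the weights themselves would then be passed to the search call; a sketch assuming `EquationSearch` takes a `weights` keyword matching the shape of `y`, with `X` and `y` defined elsewhere.)

```julia
# Sketch: weighted search with the custom elementwise loss defined above.
w = abs.(randn(size(y)))  # illustrative per-sample weights
hof = EquationSearch(X, y; weights=w, options=options)
```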

## Regression
28 changes: 15 additions & 13 deletions docs/src/types.md
@@ -6,13 +6,13 @@ Equations are specified as binary trees with the `Node` type, defined
as follows:

```@docs
Node{T<:Real}
Node{T<:DATA_TYPE}
```

There are a variety of constructors for `Node` objects, including:

```@docs
Node(; val::Real=nothing, feature::Integer=nothing)
Node(; val::DATA_TYPE=nothing, feature::Integer=nothing)
Node(op::Int, l::Node)
Node(op::Int, l::Node, r::Node)
Node(var_string::String)
@@ -55,38 +55,40 @@ an array of trees tagged with score, loss, and birthdate---these
values are given in the `PopMember`.

```@docs
Population(pop::Array{PopMember{T}, 1}) where {T<:Real}
Population(dataset::Dataset{T};
Population(pop::Array{PopMember{T,L}, 1}) where {T<:DATA_TYPE,L<:LOSS_TYPE}
Population(dataset::Dataset{T,L};
npop::Int, nlength::Int=3,
options::Options,
nfeatures::Int) where {T<:Real}
nfeatures::Int) where {T<:DATA_TYPE,L<:LOSS_TYPE}
Population(X::AbstractMatrix{T}, y::AbstractVector{T};
npop::Int, nlength::Int=3,
options::Options,
nfeatures::Int) where {T<:Real}
nfeatures::Int,
loss_type::Type=Nothing) where {T<:DATA_TYPE}
```

## Population members

```@docs
PopMember(t::Node{T}, score::T, loss::T) where {T<:Real}
PopMember(dataset::Dataset{T}, t::Node{T}, options::Options) where {T<:Real}
PopMember(t::Node{T}, score::L, loss::L) where {T<:DATA_TYPE,L<:LOSS_TYPE}
PopMember(dataset::Dataset{T,L}, t::Node{T}, options::Options) where {T<:DATA_TYPE,L<:LOSS_TYPE}
```

## Hall of Fame

```@docs
HallOfFame(options::Options, ::Type{T}) where {T<:Real}
HallOfFame(options::Options, ::Type{T}, ::Type{L}) where {T<:DATA_TYPE,L<:LOSS_TYPE}
```

## Dataset

```@docs
Dataset{T<:Real}
Dataset{T<:DATA_TYPE,L<:LOSS_TYPE}
Dataset(X::AbstractMatrix{T},
y::AbstractVector{T};
weights::Union{AbstractVector{T}, Nothing}=nothing,
varMap::Union{Array{String, 1}, Nothing}=nothing
) where {T<:Real}
update_baseline_loss!(dataset::Dataset{T}, options::Options) where {T<:Real}
varMap::Union{Array{String, 1}, Nothing}=nothing,
loss_type::Type=Nothing,
) where {T<:DATA_TYPE}
update_baseline_loss!(dataset::Dataset{T,L}, options::Options) where {T<:DATA_TYPE,L<:LOSS_TYPE}
```
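
(Aside, not part of this diff: a sketch of building a `Dataset` directly with the keywords documented above; the variable names are illustrative, and `options` is assumed to come from a prior `Options(...)` call.)

```julia
# Sketch: wrap a feature matrix and target vector in a Dataset.
X = randn(3, 50)
y = X[1, :] .+ X[2, :]
dataset = Dataset(X, y; varMap=["a", "b", "c"])
update_baseline_loss!(dataset, options)  # `options` assumed to already exist
```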
8 changes: 6 additions & 2 deletions src/Configure.jl
@@ -50,7 +50,9 @@ function test_option_configuration(T, options::Options)
end

# Check for errors before they happen
function test_dataset_configuration(dataset::Dataset{T}, options::Options) where {T<:Real}
function test_dataset_configuration(
dataset::Dataset{T}, options::Options
) where {T<:DATA_TYPE}
n = dataset.n
if n != size(dataset.X, 2) || n != size(dataset.y, 1)
throw(
@@ -246,7 +248,9 @@ function test_module_on_workers(procs, options::Options)
return debug((options.verbosity > 0 || options.progress), "Finished!")
end

function test_entire_pipeline(procs, dataset::Dataset{T}, options::Options) where {T<:Real}
function test_entire_pipeline(
procs, dataset::Dataset{T}, options::Options
) where {T<:DATA_TYPE}
futures = []
debug_inline(
(options.verbosity > 0 || options.progress), "Testing entire pipeline on workers..."
17 changes: 10 additions & 7 deletions src/ConstantOptimization.jl
@@ -3,15 +3,15 @@ module ConstantOptimizationModule
using LineSearches: LineSearches
using Optim: Optim
import DynamicExpressions: Node, get_constants, set_constants, count_constants
import ..CoreModule: Options, Dataset
import ..CoreModule: Options, Dataset, DATA_TYPE, LOSS_TYPE
import ..UtilsModule: get_birth_order
import ..LossFunctionsModule: score_func, eval_loss
import ..PopMemberModule: PopMember

# Proxy function for optimization
function opt_func(
x::Vector{T}, dataset::Dataset{T}, tree::Node{T}, options::Options
)::T where {T<:Real}
x::Vector{T}, dataset::Dataset{T,L}, tree::Node{T}, options::Options
)::L where {T<:DATA_TYPE,L<:LOSS_TYPE}
set_constants(tree, x)
# TODO(mcranmer): This should use score_func batching.
loss = eval_loss(tree, dataset, options)
@@ -20,16 +20,19 @@ end

# Use Nelder-Mead to optimize the constants in an equation
function optimize_constants(
dataset::Dataset{T}, member::PopMember{T}, options::Options
)::Tuple{PopMember{T},Float64} where {T<:Real}
dataset::Dataset{T,L}, member::PopMember{T,L}, options::Options
)::Tuple{PopMember{T,L},Float64} where {T<:DATA_TYPE,L<:LOSS_TYPE}
nconst = count_constants(member.tree)
num_evals = 0.0
if nconst == 0
return (member, 0.0)
end
x0 = get_constants(member.tree)
f(x::Vector{T})::T = opt_func(x, dataset, member.tree, options)
if nconst == 1
f(x::Vector{T})::L = opt_func(x, dataset, member.tree, options)
if T <: Complex
# TODO: Make this more general. Also, do we even need Newton here at all??
algorithm = Optim.BFGS(; linesearch=LineSearches.BackTracking())#order=3))
elseif nconst == 1
algorithm = Optim.Newton(; linesearch=LineSearches.BackTracking())
else
if options.optimizer_algorithm == "NelderMead"