From 30a14c78eb9d2e89f002da0b1ce1117be1646574 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 16 Mar 2023 21:04:25 -0400 Subject: [PATCH 01/19] Bump backend version with complex numbers fix --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 07cbbf410..6c5da5ba1 100644 --- a/Project.toml +++ b/Project.toml @@ -22,7 +22,7 @@ SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b" TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [compat] -DynamicExpressions = "0.4.2" +DynamicExpressions = "0.4.3" JSON3 = "1" LineSearches = "7" LossFunctions = "0.6, 0.7, 0.8" From 5391a82d277b6f153350e25fedf31845e799a5d8 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 16 Mar 2023 22:10:58 -0400 Subject: [PATCH 02/19] Split element types into one for data and loss --- docs/src/api.md | 11 ++++---- docs/src/types.md | 28 +++++++++++---------- src/Configure.jl | 8 ++++-- src/ConstantOptimization.jl | 12 ++++----- src/Core.jl | 3 ++- src/Dataset.jl | 21 ++++++++++------ src/HallOfFame.jl | 42 +++++++++++++++++-------------- src/LossFunctions.jl | 50 +++++++++++++++++++++---------------- src/Migration.jl | 8 +++--- src/Mutate.jl | 18 ++++++------- src/MutationFunctions.jl | 22 ++++++++-------- src/Operators.jl | 14 ++++++----- src/PopMember.jl | 30 +++++++++++----------- src/Population.jl | 43 ++++++++++++++++--------------- src/ProgramConstants.jl | 3 +++ src/RegularizedEvolution.jl | 10 ++++---- src/SearchUtils.jl | 46 ++++++++++++++++++---------------- src/SingleIteration.jl | 20 +++++++-------- src/SymbolicRegression.jl | 38 +++++++++++++++------------- 19 files changed, 233 insertions(+), 194 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 2ab6004e7..480317a3a 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -10,8 +10,9 @@ EquationSearch(X::AbstractMatrix{T}, y::AbstractMatrix{T}; options::Options=Options(), numprocs::Union{Int, Nothing}=nothing, procs::Union{Array{Int, 1}, Nothing}=nothing, - runtests::Bool=true - ) where {T<:Real} + runtests::Bool=true, + loss_type::Type=Nothing, +) where {T<:Number} ``` ## Options @@ -59,8 +60,8 @@ node_to_symbolic(tree::Node, options::Options; ```@docs calculate_pareto_frontier(X::AbstractMatrix{T}, y::AbstractVector{T}, - hallOfFame::HallOfFame{T}, options::Options; - weights=nothing, varMap=nothing) where {T<:Real} + hallOfFame::HallOfFame{T,L}, options::Options; + weights=nothing, varMap=nothing) where {T<:DATA_TYPE,L<:LOSS_TYPE} calculate_pareto_frontier(dataset::Dataset{T}, hallOfFame::HallOfFame{T}, - options::Options) where {T<:Real} + options::Options) where {T<:DATA_TYPE,L<:LOSS_TYPE} ``` diff --git a/docs/src/types.md b/docs/src/types.md index 0be4f6c60..de9ec2a9a 100644 --- a/docs/src/types.md +++ b/docs/src/types.md @@ -6,13 +6,13 @@ Equations are specified as binary trees with the `Node` type, defined as follows: ```@docs -Node{T<:Real} +Node{T<:DATA_TYPE} ``` There are a variety of constructors for `Node` objects, including: ```@docs -Node(; val::Real=nothing, feature::Integer=nothing) +Node(; val::DATA_TYPE=nothing, feature::Integer=nothing) Node(op::Int, l::Node) Node(op::Int, l::Node, r::Node) Node(var_string::String) @@ -55,38 +55,40 @@ an array of trees tagged with score, loss, and birthdate---these values are given in the `PopMember`. 
```@docs -Population(pop::Array{PopMember{T}, 1}) where {T<:Real} -Population(dataset::Dataset{T}; +Population(pop::Array{PopMember{T,L}, 1}) where {T<:DATA_TYPE,L<:LOSS_TYPE} +Population(dataset::Dataset{T,L}; npop::Int, nlength::Int=3, options::Options, - nfeatures::Int) where {T<:Real} + nfeatures::Int) where {T<:DATA_TYPE,L<:LOSS_TYPE} Population(X::AbstractMatrix{T}, y::AbstractVector{T}; npop::Int, nlength::Int=3, options::Options, - nfeatures::Int) where {T<:Real} + nfeatures::Int, + loss_type::Type=Nothing) where {T<:DATA_TYPE} ``` ## Population members ```@docs -PopMember(t::Node{T}, score::T, loss::T) where {T<:Real} -PopMember(dataset::Dataset{T}, t::Node{T}, options::Options) where {T<:Real} +PopMember(t::Node{T}, score::L, loss::L) where {T<:DATA_TYPE,L<:LOSS_TYPE} +PopMember(dataset::Dataset{T,L}, t::Node{T}, options::Options) where {T<:DATA_TYPE,L<:LOSS_TYPE} ``` ## Hall of Fame ```@docs -HallOfFame(options::Options, ::Type{T}) where {T<:Real} +HallOfFame(options::Options, ::Type{T}, ::Type{L}) where {T<:DATA_TYPE,L<:LOSS_TYPE} ``` ## Dataset ```@docs -Dataset{T<:Real} +Dataset{T<:DATA_TYPE,L<:LOSS_TYPE} Dataset(X::AbstractMatrix{T}, y::AbstractVector{T}; weights::Union{AbstractVector{T}, Nothing}=nothing, - varMap::Union{Array{String, 1}, Nothing}=nothing - ) where {T<:Real} -update_baseline_loss!(dataset::Dataset{T}, options::Options) where {T<:Real} + varMap::Union{Array{String, 1}, Nothing}=nothing, + loss_type::Type=Nothing, + ) where {T<:DATA_TYPE} +update_baseline_loss!(dataset::Dataset{T,L}, options::Options) where {T<:DATA_TYPE,L<:LOSS_TYPE} ``` diff --git a/src/Configure.jl b/src/Configure.jl index eb3f85a22..a4fe63bbf 100644 --- a/src/Configure.jl +++ b/src/Configure.jl @@ -50,7 +50,9 @@ function test_option_configuration(T, options::Options) end # Check for errors before they happen -function test_dataset_configuration(dataset::Dataset{T}, options::Options) where {T<:Real} +function test_dataset_configuration( + dataset::Dataset{T}, options::Options +) where {T<:DATA_TYPE} n = dataset.n if n != size(dataset.X, 2) || n != size(dataset.y, 1) throw( @@ -246,7 +248,9 @@ function test_module_on_workers(procs, options::Options) return debug((options.verbosity > 0 || options.progress), "Finished!") end -function test_entire_pipeline(procs, dataset::Dataset{T}, options::Options) where {T<:Real} +function test_entire_pipeline( + procs, dataset::Dataset{T}, options::Options +) where {T<:DATA_TYPE} futures = [] debug_inline( (options.verbosity > 0 || options.progress), "Testing entire pipeline on workers..." diff --git a/src/ConstantOptimization.jl b/src/ConstantOptimization.jl index 1c7993c6a..6f1a96c5e 100644 --- a/src/ConstantOptimization.jl +++ b/src/ConstantOptimization.jl @@ -3,15 +3,15 @@ module ConstantOptimizationModule using LineSearches: LineSearches using Optim: Optim import DynamicExpressions: Node, get_constants, set_constants, count_constants -import ..CoreModule: Options, Dataset +import ..CoreModule: Options, Dataset, DATA_TYPE, LOSS_TYPE import ..UtilsModule: get_birth_order import ..LossFunctionsModule: score_func, eval_loss import ..PopMemberModule: PopMember # Proxy function for optimization function opt_func( - x::Vector{T}, dataset::Dataset{T}, tree::Node{T}, options::Options -)::T where {T<:Real} + x::Vector{T}, dataset::Dataset{T,L}, tree::Node{T}, options::Options +)::L where {T<:DATA_TYPE,L<:LOSS_TYPE} set_constants(tree, x) # TODO(mcranmer): This should use score_func batching. 
loss = eval_loss(tree, dataset, options) @@ -20,15 +20,15 @@ end # Use Nelder-Mead to optimize the constants in an equation function optimize_constants( - dataset::Dataset{T}, member::PopMember{T}, options::Options -)::Tuple{PopMember{T},Float64} where {T<:Real} + dataset::Dataset{T,L}, member::PopMember{T,L}, options::Options +)::Tuple{PopMember{T,L},Float64} where {T<:DATA_TYPE,L<:LOSS_TYPE} nconst = count_constants(member.tree) num_evals = 0.0 if nconst == 0 return (member, 0.0) end x0 = get_constants(member.tree) - f(x::Vector{T})::T = opt_func(x, dataset, member.tree, options) + f(x::Vector{T})::L = opt_func(x, dataset, member.tree, options) if nconst == 1 algorithm = Optim.Newton(; linesearch=LineSearches.BackTracking()) else diff --git a/src/Core.jl b/src/Core.jl index 186df51ca..50f6b262d 100644 --- a/src/Core.jl +++ b/src/Core.jl @@ -7,7 +7,8 @@ include("OptionsStruct.jl") include("Operators.jl") include("Options.jl") -import .ProgramConstantsModule: MAX_DEGREE, BATCH_DIM, FEATURE_DIM, RecordType +import .ProgramConstantsModule: + MAX_DEGREE, BATCH_DIM, FEATURE_DIM, RecordType, DATA_TYPE, LOSS_TYPE import .DatasetModule: Dataset import .OptionsStructModule: Options, MutationWeights, sample_mutation import .OptionsModule: Options diff --git a/src/Dataset.jl b/src/Dataset.jl index 58913bff6..750b2db83 100644 --- a/src/Dataset.jl +++ b/src/Dataset.jl @@ -1,9 +1,9 @@ module DatasetModule -import ..ProgramConstantsModule: BATCH_DIM, FEATURE_DIM +import ..ProgramConstantsModule: BATCH_DIM, FEATURE_DIM, DATA_TYPE, LOSS_TYPE """ - Dataset{T<:Real} + Dataset{T<:DATA_TYPE,L<:LOSS_TYPE} # Fields @@ -21,7 +21,7 @@ import ..ProgramConstantsModule: BATCH_DIM, FEATURE_DIM - `varMap::Array{String,1}`: The names of the features, with shape `(nfeatures,)`. """ -mutable struct Dataset{T<:Real} +mutable struct Dataset{T<:DATA_TYPE,L<:LOSS_TYPE} X::AbstractMatrix{T} y::AbstractVector{T} n::Int @@ -29,14 +29,15 @@ mutable struct Dataset{T<:Real} weighted::Bool weights::Union{AbstractVector{T},Nothing} avg_y::T - baseline_loss::T + baseline_loss::L varMap::Array{String,1} end """ Dataset(X::AbstractMatrix{T}, y::AbstractVector{T}; weights::Union{AbstractVector{T}, Nothing}=nothing, - varMap::Union{Array{String, 1}, Nothing}=nothing) + varMap::Union{Array{String, 1}, Nothing}=nothing, + loss_type::Type=Nothing) Construct a dataset to pass between internal functions. """ @@ -45,7 +46,8 @@ function Dataset( y::AbstractVector{T}; weights::Union{AbstractVector{T},Nothing}=nothing, varMap::Union{Array{String,1},Nothing}=nothing, -) where {T<:Real} + loss_type::Type=Nothing, +) where {T<:DATA_TYPE} Base.require_one_based_indexing(X, y) n = size(X, BATCH_DIM) nfeatures = size(X, FEATURE_DIM) @@ -58,9 +60,12 @@ function Dataset( else sum(y) / n end - baseline = one(T) + loss_type = (loss_type == Nothing) ? 
T : loss_type + baseline = one(loss_type) - return Dataset{T}(X, y, n, nfeatures, weighted, weights, avg_y, baseline, varMap) + return Dataset{T,loss_type}( + X, y, n, nfeatures, weighted, weights, avg_y, baseline, varMap + ) end end diff --git a/src/HallOfFame.jl b/src/HallOfFame.jl index d4d2ff35a..9922fc4a7 100644 --- a/src/HallOfFame.jl +++ b/src/HallOfFame.jl @@ -1,22 +1,22 @@ module HallOfFameModule import DynamicExpressions: Node, string_tree -import ..CoreModule: MAX_DEGREE, Options, Dataset +import ..CoreModule: MAX_DEGREE, Options, Dataset, DATA_TYPE, LOSS_TYPE import ..ComplexityModule: compute_complexity import ..PopMemberModule: PopMember, copy_pop_member import ..LossFunctionsModule: eval_loss using Printf: @sprintf """ List of the best members seen all time in `.members` """ -mutable struct HallOfFame{T<:Real} - members::Array{PopMember{T},1} +mutable struct HallOfFame{T<:DATA_TYPE,L<:LOSS_TYPE} + members::Array{PopMember{T,L},1} exists::Array{Bool,1} #Whether it has been set # Arranged by complexity - store one at each. end """ - HallOfFame(options::Options, ::Type{T}) where {T<:Real} + HallOfFame(options::Options, ::Type{T}, ::Type{L}) where {T<:DATA_TYPE,L<:LOSS_TYPE} Create empty HallOfFame. The HallOfFame stores a list of `PopMember` objects in `.members`, which is enumerated @@ -28,14 +28,16 @@ Arguments: - `options`: Options containing specification about deterministic. - `T`: Type of Nodes to use in the population. e.g., `Float64`. """ -function HallOfFame(options::Options, ::Type{T}) where {T<:Real} +function HallOfFame( + options::Options, ::Type{T}, ::Type{L} +) where {T<:DATA_TYPE,L<:LOSS_TYPE} actualMaxsize = options.maxsize + MAX_DEGREE - return HallOfFame( + return HallOfFame{T,L}( [ PopMember( Node(; val=convert(T, 1)), - T(0), - T(Inf); + L(0), + L(Inf); parent=-1, deterministic=options.deterministic, ) for i in 1:actualMaxsize @@ -44,7 +46,9 @@ function HallOfFame(options::Options, ::Type{T}) where {T<:Real} ) end -function copy_hall_of_fame(hof::HallOfFame{T})::HallOfFame{T} where {T<:Real} +function copy_hall_of_fame( + hof::HallOfFame{T,L} +)::HallOfFame{T,L} where {T<:DATA_TYPE,L<:LOSS_TYPE} return HallOfFame( [copy_pop_member(member) for member in hof.members], [exists for exists in hof.exists], @@ -52,15 +56,15 @@ function copy_hall_of_fame(hof::HallOfFame{T})::HallOfFame{T} where {T<:Real} end """ - calculate_pareto_frontier(dataset::Dataset{T}, hallOfFame::HallOfFame{T}, - options::Options) where {T<:Real} + calculate_pareto_frontier(dataset::Dataset{T,L}, hallOfFame::HallOfFame{T,L}, + options::Options) where {T<:DATA_TYPE,L<:LOSS_TYPE} """ function calculate_pareto_frontier( - dataset::Dataset{T}, hallOfFame::HallOfFame{T}, options::Options -)::Vector{PopMember{T}} where {T<:Real} + dataset::Dataset{T,L}, hallOfFame::HallOfFame{T,L}, options::Options +)::Vector{PopMember{T,L}} where {T<:DATA_TYPE,L<:LOSS_TYPE} # TODO - remove dataset from args. # Dominating pareto curve - must be better than all simpler equations - dominating = PopMember{T}[] + dominating = PopMember{T,L}[] actualMaxsize = options.maxsize + MAX_DEGREE for size in 1:actualMaxsize if !hallOfFame.exists[size] @@ -89,8 +93,8 @@ end """ calculate_pareto_frontier(X::AbstractMatrix{T}, y::AbstractVector{T}, - hallOfFame::HallOfFame{T}, options::Options; - weights=nothing, varMap=nothing) where {T<:Real} + hallOfFame::HallOfFame{T,L}, options::Options; + weights=nothing, varMap=nothing) where {T<:DATA_TYPE,L<:LOSS_TYPE} Compute the dominating Pareto frontier for a given hallOfFame. 
This is the list of equations where each equation has a better loss than all @@ -99,13 +103,13 @@ simpler equations. function calculate_pareto_frontier( X::AbstractMatrix{T}, y::AbstractVector{T}, - hallOfFame::HallOfFame{T}, + hallOfFame::HallOfFame{T,L}, options::Options; weights=nothing, varMap=nothing, -)::Vector{PopMember{T}} where {T<:Real} +)::Vector{PopMember{T,L}} where {T<:DATA_TYPE,L<:LOSS_TYPE} return calculate_pareto_frontier( - Dataset(X, y; weights=weights, varMap=varMap), hallOfFame, options + Dataset(X, y; weights=weights, varMap=varMap, loss_type=L), hallOfFame, options ) end diff --git a/src/LossFunctions.jl b/src/LossFunctions.jl index ca780ad28..19b8aab70 100644 --- a/src/LossFunctions.jl +++ b/src/LossFunctions.jl @@ -5,36 +5,40 @@ using StatsBase: StatsBase import LossFunctions: value, AggMode, SupervisedLoss import DynamicExpressions: Node import ..InterfaceDynamicExpressionsModule: eval_tree_array -import ..CoreModule: Options, Dataset +import ..CoreModule: Options, Dataset, DATA_TYPE, LOSS_TYPE import ..ComplexityModule: compute_complexity function _loss( x::AbstractArray{T}, y::AbstractArray{T}, loss::SupervisedLoss -)::T where {T<:Real} +) where {T<:DATA_TYPE} return value(loss, y, x, AggMode.Mean()) end -function _loss(x::AbstractArray{T}, y::AbstractArray{T}, loss::Function)::T where {T<:Real} +function _loss( + x::AbstractArray{T}, y::AbstractArray{T}, loss::Function +) where {T<:DATA_TYPE} return sum(loss.(x, y)) / length(y) end function _weighted_loss( x::AbstractArray{T}, y::AbstractArray{T}, w::AbstractArray{T}, loss::SupervisedLoss -)::T where {T<:Real} +) where {T<:DATA_TYPE} return value(loss, y, x, AggMode.WeightedMean(w)) end function _weighted_loss( x::AbstractArray{T}, y::AbstractArray{T}, w::AbstractArray{T}, loss::Function -)::T where {T<:Real} +) where {T<:DATA_TYPE} return sum(loss.(x, y, w)) / sum(w) end # Evaluate the loss of a particular expression on the input dataset. -function _eval_loss(tree::Node{T}, dataset::Dataset{T}, options::Options)::T where {T<:Real} +function _eval_loss( + tree::Node{T}, dataset::Dataset{T,L}, options::Options +)::L where {T<:DATA_TYPE,L<:LOSS_TYPE} (prediction, completion) = eval_tree_array(tree, dataset.X, options) if !completion - return T(Inf) + return L(Inf) end if dataset.weighted @@ -51,13 +55,15 @@ end # This evaluates function F: function evaluator( - f::F, tree::Node{T}, dataset::Dataset{T}, options::Options -)::T where {T<:Real,F} + f::F, tree::Node{T}, dataset::Dataset{T,L}, options::Options +)::L where {T<:DATA_TYPE,L<:LOSS_TYPE,F} return f(tree, dataset, options) end # Evaluate the loss of a particular expression on the input dataset. 
-function eval_loss(tree::Node{T}, dataset::Dataset{T}, options::Options)::T where {T<:Real} +function eval_loss( + tree::Node{T}, dataset::Dataset{T,L}, options::Options +)::L where {T<:DATA_TYPE,L<:LOSS_TYPE} if options.loss_function === nothing return _eval_loss(tree, dataset, options) else @@ -68,10 +74,10 @@ end # Compute a score which includes a complexity penalty in the loss function loss_to_score( - loss::T, baseline::T, tree::Node{T}, options::Options -)::T where {T<:Real} - normalization = if baseline < T(0.01) - T(0.01) + loss::L, baseline::L, tree::Node{T}, options::Options +)::L where {T<:DATA_TYPE,L<:LOSS_TYPE} + normalization = if baseline < L(0.01) + L(0.01) else baseline end @@ -84,8 +90,8 @@ end # Score an equation function score_func( - dataset::Dataset{T}, tree::Node{T}, options::Options -)::Tuple{T,T} where {T<:Real} + dataset::Dataset{T,L}, tree::Node{T}, options::Options +)::Tuple{L,L} where {T<:DATA_TYPE,L<:LOSS_TYPE} result_loss = eval_loss(tree, dataset, options) score = loss_to_score(result_loss, dataset.baseline_loss, tree, options) return score, result_loss @@ -93,14 +99,14 @@ end # Score an equation with a small batch function score_func_batch( - dataset::Dataset{T}, tree::Node{T}, options::Options -)::Tuple{T,T} where {T<:Real} + dataset::Dataset{T,L}, tree::Node{T}, options::Options +)::Tuple{L,L} where {T<:DATA_TYPE,L<:LOSS_TYPE} batch_idx = StatsBase.sample(1:(dataset.n), options.batch_size; replace=true) batch_X = view(dataset.X, :, batch_idx) batch_y = view(dataset.y, batch_idx) (prediction, completion) = eval_tree_array(tree, batch_X, options) if !completion - return T(0), T(Inf) + return L(0), L(Inf) end if !dataset.weighted @@ -115,11 +121,13 @@ function score_func_batch( end """ - update_baseline_loss!(dataset::Dataset{T}, options::Options) where {T<:Real} + update_baseline_loss!(dataset::Dataset{T,L}, options::Options) where {T<:DATA_TYPE,L<:LOSS_TYPE} Update the baseline loss of the dataset using the loss function specified in `options`. """ -function update_baseline_loss!(dataset::Dataset{T}, options::Options) where {T<:Real} +function update_baseline_loss!( + dataset::Dataset{T,L}, options::Options +) where {T<:DATA_TYPE,L<:LOSS_TYPE} example_tree = Node(T; val=dataset.avg_y) dataset.baseline_loss = eval_loss(example_tree, dataset, options) return nothing diff --git a/src/Migration.jl b/src/Migration.jl index c1f002db5..705d986d0 100644 --- a/src/Migration.jl +++ b/src/Migration.jl @@ -1,22 +1,22 @@ module MigrationModule using StatsBase: StatsBase -import ..CoreModule: Options +import ..CoreModule: Options, DATA_TYPE, LOSS_TYPE import ..PopulationModule: Population import ..PopMemberModule: PopMember, copy_pop_member_reset_birth """ - migrate!(migration::Pair{Population{T},Population{T}}, options::Options; frac::AbstractFloat) + migrate!(migration::Pair{Population{T,L},Population{T,L}}, options::Options; frac::AbstractFloat) Migrate a fraction of the population from one population to the other, creating copies to do so. The original migrant population is not modified. 
Pass with, e.g., `migrate!(migration_candidates => destination, options; frac=0.1)` """ function migrate!( - migration::Pair{Vector{PopMember{T}},Population{T}}, + migration::Pair{Vector{PopMember{T,L}},Population{T,L}}, options::Options; frac::AbstractFloat, -) where {T} +) where {T<:DATA_TYPE,L<:LOSS_TYPE} base_pop = migration.second npop = length(base_pop.members) num_replace = round(Int, npop * frac) diff --git a/src/Mutate.jl b/src/Mutate.jl index 42c340ef1..ca2821fd7 100644 --- a/src/Mutate.jl +++ b/src/Mutate.jl @@ -2,7 +2,7 @@ module MutateModule import DynamicExpressions: Node, copy_node, count_constants, simplify_tree, combine_operators -import ..CoreModule: Options, Dataset, RecordType, sample_mutation +import ..CoreModule: Options, Dataset, RecordType, sample_mutation, DATA_TYPE, LOSS_TYPE import ..ComplexityModule: compute_complexity import ..LossFunctionsModule: score_func, score_func_batch import ..CheckConstraintsModule: check_constraints @@ -23,14 +23,14 @@ import ..RecorderModule: @recorder # Go through one simulated options.annealing mutation cycle # exp(-delta/T) defines probability of accepting a change function next_generation( - dataset::Dataset{T}, - member::PopMember{T}, - temperature::T, + dataset::Dataset{T,L}, + member::PopMember{T,L}, + temperature, curmaxsize::Int, running_search_statistics::RunningSearchStatistics, options::Options; tmp_recorder::RecordType, -)::Tuple{PopMember{T},Bool,Float64} where {T<:Real} +)::Tuple{PopMember{T,L},Bool,Float64} where {T<:DATA_TYPE,L<:LOSS_TYPE} prev = member.tree parent_ref = member.ref tree = prev @@ -282,12 +282,12 @@ end """Generate a generation via crossover of two members.""" function crossover_generation( - member1::PopMember, - member2::PopMember, - dataset::Dataset{T}, + member1::PopMember{T,L}, + member2::PopMember{T,L}, + dataset::Dataset{T,L}, curmaxsize::Int, options::Options, -)::Tuple{PopMember,PopMember,Bool,Float64} where {T<:Real} +)::Tuple{PopMember{T,L},PopMember{T,L},Bool,Float64} where {T<:DATA_TYPE,L<:DATA_TYPE} tree1 = member1.tree tree2 = member2.tree crossover_accepted = false diff --git a/src/MutationFunctions.jl b/src/MutationFunctions.jl index a829bcf52..50931f4a9 100644 --- a/src/MutationFunctions.jl +++ b/src/MutationFunctions.jl @@ -2,7 +2,7 @@ module MutationFunctionsModule import DynamicExpressions: Node, copy_node, set_node!, count_nodes, has_constants, has_operators -import ..CoreModule: Options +import ..CoreModule: Options, DATA_TYPE # Return a random node from the tree function random_node(tree::Node{T})::Node{T} where {T} @@ -48,8 +48,8 @@ end # Randomly perturb a constant function mutate_constant( - tree::Node{T}, temperature::T, options::Options -)::Node{T} where {T<:Real} + tree::Node{T}, temperature::L, options::Options +)::Node{T} where {T<:DATA_TYPE,L<:DATA_TYPE} # T is between 0 and 1. 
if !(has_constants(tree)) @@ -61,7 +61,7 @@ function mutate_constant( end bottom = 1//10 - maxChange = T(options.perturbation_factor) * temperature + T(1 + bottom) + maxChange = L(options.perturbation_factor) * temperature + L(1 + bottom) factor = maxChange^rand(T) makeConstBigger = rand() > 0.5 @@ -84,7 +84,7 @@ function append_random_op( options::Options, nfeatures::Int; makeNewBinOp::Union{Bool,Nothing}=nothing, -)::Node{T} where {T<:Real} +)::Node{T} where {T<:DATA_TYPE} node = random_node(tree) while node.degree != 0 node = random_node(tree) @@ -113,7 +113,7 @@ end # Insert random node function insert_random_op( tree::Node{T}, options::Options, nfeatures::Int -)::Node{T} where {T<:Real} +)::Node{T} where {T<:DATA_TYPE} node = random_node(tree) choice = rand() makeNewBinOp = choice < options.nbin / (options.nuna + options.nbin) @@ -132,7 +132,7 @@ end # Add random node to the top of a tree function prepend_random_op( tree::Node{T}, options::Options, nfeatures::Int -)::Node{T} where {T<:Real} +)::Node{T} where {T<:DATA_TYPE} node = tree choice = rand() makeNewBinOp = choice < options.nbin / (options.nuna + options.nbin) @@ -148,7 +148,7 @@ function prepend_random_op( return node end -function make_random_leaf(nfeatures::Int, ::Type{T})::Node{T} where {T<:Real} +function make_random_leaf(nfeatures::Int, ::Type{T})::Node{T} where {T<:DATA_TYPE} if rand() > 0.5 return Node(; val=randn(T)) else @@ -192,7 +192,7 @@ end # with a variable or constant function delete_random_op( tree::Node{T}, options::Options, nfeatures::Int -)::Node{T} where {T<:Real} +)::Node{T} where {T<:DATA_TYPE} node, parent, side = random_node_and_parent(tree) isroot = (parent === nothing) @@ -235,7 +235,7 @@ end # Create a random equation by appending random operators function gen_random_tree( length::Int, options::Options, nfeatures::Int, ::Type{T} -)::Node{T} where {T<:Real} +)::Node{T} where {T<:DATA_TYPE} # Note that this base tree is just a placeholder; it will be replaced. tree = Node(; val=convert(T, 1)) for i in 1:length @@ -247,7 +247,7 @@ end function gen_random_tree_fixed_size( node_count::Int, options::Options, nfeatures::Int, ::Type{T} -)::Node{T} where {T<:Real} +)::Node{T} where {T<:DATA_TYPE} tree = make_random_leaf(nfeatures, T) cur_size = count_nodes(tree) while cur_size < node_count diff --git a/src/Operators.jl b/src/Operators.jl index 7348fef10..a84d2e3b8 100644 --- a/src/Operators.jl +++ b/src/Operators.jl @@ -3,9 +3,10 @@ module OperatorsModule using SpecialFunctions: SpecialFunctions import SpecialFunctions: erf, erfc import Base: @deprecate +import ..ProgramConstantsModule: DATA_TYPE #TODO - actually add these operators to the module! -function gamma(x::T)::T where {T<:Real} +function gamma(x::T)::T where {T<:DATA_TYPE} out = SpecialFunctions.gamma(x) return isinf(out) ? T(NaN) : out end @@ -20,19 +21,20 @@ atanh_clip(x) = atanh(mod(x + 1, 2) - 1) # Use some fast operators from https://github.com/JuliaLang/julia/blob/81597635c4ad1e8c2e1c5753fda4ec0e7397543f/base/fastmath.jl # Define allowed operators. Any julia operator can also be used. # TODO: Add all of these operators to the precompilation. -function plus(x::T, y::T)::T where {T<:Real} +# TODO: Since simplification is done in DynamicExpressions.jl, are these names correct anymore? +function plus(x::T, y::T)::T where {T<:DATA_TYPE} return x + y #Do not change the name of this operator. end -function sub(x::T, y::T)::T where {T<:Real} +function sub(x::T, y::T)::T where {T<:DATA_TYPE} return x - y #Do not change the name of this operator. 
end -function mult(x::T, y::T)::T where {T<:Real} +function mult(x::T, y::T)::T where {T<:DATA_TYPE} return x * y #Do not change the name of this operator. end -function square(x::T)::T where {T<:Real} +function square(x::T)::T where {T<:DATA_TYPE} return x * x end -function cube(x::T)::T where {T<:Real} +function cube(x::T)::T where {T<:DATA_TYPE} return x^3 end function safe_pow(x::T, y::T)::T where {T<:AbstractFloat} diff --git a/src/PopMember.jl b/src/PopMember.jl index dc202c170..421f30075 100644 --- a/src/PopMember.jl +++ b/src/PopMember.jl @@ -1,15 +1,15 @@ module PopMemberModule import DynamicExpressions: Node, copy_node -import ..CoreModule: Options, Dataset +import ..CoreModule: Options, Dataset, DATA_TYPE, LOSS_TYPE import ..UtilsModule: get_birth_order import ..LossFunctionsModule: score_func # Define a member of population by equation, score, and age -mutable struct PopMember{T<:Real} +mutable struct PopMember{T<:DATA_TYPE,L<:LOSS_TYPE} tree::Node{T} - score::T # Inludes complexity penalty, normalization - loss::T # Raw loss + score::L # Inludes complexity penalty, normalization + loss::L # Raw loss birth::Int # For recording history: @@ -20,7 +20,7 @@ end generate_reference() = abs(rand(Int)) """ - PopMember(t::Node, score::T, loss::T) + PopMember(t::Node{T}, score::L, loss::L) Create a population member with a birth date at the current time. @@ -31,12 +31,12 @@ Create a population member with a birth date at the current time. - `loss::T`: The raw loss to assign. """ function PopMember( - t::Node{T}, score::T, loss::T; ref::Int=-1, parent::Int=-1, deterministic=false -) where {T<:Real} + t::Node{T}, score::L, loss::L; ref::Int=-1, parent::Int=-1, deterministic=false +) where {T<:DATA_TYPE,L<:LOSS_TYPE} if ref == -1 ref = generate_reference() end - return PopMember{T}( + return PopMember{T,L}( t, score, loss, get_birth_order(; deterministic=deterministic), ref, parent ) end @@ -55,30 +55,30 @@ Automatically compute the score for this tree. - `options::Options`: What options to use. 
""" function PopMember( - dataset::Dataset{T}, + dataset::Dataset{T,L}, t::Node{T}, options::Options; ref::Int=-1, parent::Int=-1, deterministic=nothing, -) where {T<:Real} +) where {T<:DATA_TYPE,L<:LOSS_TYPE} score, loss = score_func(dataset, t, options) return PopMember(t, score, loss; ref=ref, parent=parent, deterministic=deterministic) end -function copy_pop_member(p::PopMember{T})::PopMember{T} where {T<:Real} +function copy_pop_member( + p::PopMember{T,L} +)::PopMember{T,L} where {T<:DATA_TYPE,L<:LOSS_TYPE} tree = copy_node(p.tree) score = copy(p.score) loss = copy(p.loss) birth = copy(p.birth) ref = copy(p.ref) parent = copy(p.parent) - return PopMember{T}(tree, score, loss, birth, ref, parent) + return PopMember{T,L}(tree, score, loss, birth, ref, parent) end -function copy_pop_member_reset_birth( - p::PopMember{T}; deterministic::Bool -)::PopMember{T} where {T<:Real} +function copy_pop_member_reset_birth(p::P; deterministic::Bool)::P where {P<:PopMember} new_member = copy_pop_member(p) new_member.birth = get_birth_order(; deterministic=deterministic) return new_member diff --git a/src/Population.jl b/src/Population.jl index fcf3dd50c..7bfbed4f6 100644 --- a/src/Population.jl +++ b/src/Population.jl @@ -3,7 +3,7 @@ module PopulationModule using StatsBase: StatsBase import Random: randperm import DynamicExpressions: string_tree -import ..CoreModule: Options, Dataset, RecordType +import ..CoreModule: Options, Dataset, RecordType, DATA_TYPE, LOSS_TYPE import ..ComplexityModule: compute_complexity import ..LossFunctionsModule: score_func, update_baseline_loss! import ..AdaptiveParsimonyModule: RunningSearchStatistics @@ -11,8 +11,8 @@ import ..MutationFunctionsModule: gen_random_tree import ..PopMemberModule: PopMember, copy_pop_member # A list of members of the population, with easy constructors, # which allow for random generation of new populations -mutable struct Population{T<:Real} - members::Array{PopMember{T},1} +mutable struct Population{T<:DATA_TYPE,L<:LOSS_TYPE} + members::Array{PopMember{T,L},1} n::Int end """ @@ -20,7 +20,9 @@ end Create population from list of PopMembers. """ -Population(pop::Array{PopMember{T},1}) where {T<:Real} = Population{T}(pop, size(pop, 1)) +function Population(pop::Array{PopMember{T,L},1}) where {T<:DATA_TYPE,L<:LOSS_TYPE} + return Population{T,L}(pop, size(pop, 1)) +end """ Population(dataset::Dataset{T}; npop::Int, nlength::Int=3, options::Options, @@ -29,9 +31,9 @@ Population(pop::Array{PopMember{T},1}) where {T<:Real} = Population{T}(pop, size Create random population and score them on the dataset. 
""" function Population( - dataset::Dataset{T}; npop::Int, nlength::Int=3, options::Options, nfeatures::Int -) where {T<:Real} - return Population{T}( + dataset::Dataset{T,L}; npop::Int, nlength::Int=3, options::Options, nfeatures::Int +) where {T<:DATA_TYPE,L<:LOSS_TYPE} + return Population{T,L}( [ PopMember( dataset, @@ -58,18 +60,19 @@ function Population( nlength::Int=3, options::Options, nfeatures::Int, -) where {T<:Real} - dataset = Dataset(X, y) + loss_type::Type=nothing, +) where {T<:DATA_TYPE} + dataset = Dataset(X, y; loss_type=loss_type) update_baseline_loss!(dataset, options) return Population(dataset; npop=npop, options=options, nfeatures=nfeatures) end -function copy_population(pop::Population{T})::Population{T} where {T<:Real} +function copy_population(pop::P)::P where {P<:Population} return Population([copy_pop_member(pm) for pm in pop.members]) end # Sample random members of the population, and make a new one -function sample_pop(pop::Population{T}, options::Options)::Population{T} where {T} +function sample_pop(pop::P, options::Options)::P where {P<:Population} return Population( StatsBase.sample(pop.members, options.tournament_selection_n; replace=false) ) @@ -77,10 +80,10 @@ end # Sample the population, and get the best member from that sample function best_of_sample( - pop::Population{T}, + pop::Population{T,L}, running_search_statistics::RunningSearchStatistics, options::Options{CT}, -)::PopMember where {T<:Real,CT} +)::PopMember{T,L} where {T<:DATA_TYPE,L<:LOSS_TYPE,CT} sample = sample_pop(pop, options) p = options.tournament_selection_p @@ -89,16 +92,16 @@ function best_of_sample( if options.use_frequency_in_tournament # Score based on frequency of that size occuring. # In the end, all sizes should be just as common in the population. - adaptive_parsimony_scaling = T(options.adaptive_parsimony_scaling) + adaptive_parsimony_scaling = L(options.adaptive_parsimony_scaling) # e.g., for 100% occupied at one size, exp(-20*1) = 2.061153622438558e-9; which seems like a good punishment for dominating the population. 
- scores = Vector{T}(undef, tournament_selection_n) + scores = Vector{L}(undef, tournament_selection_n) for (i, member) in enumerate(sample.members) size = compute_complexity(member.tree, options) frequency = if (0 < size <= options.maxsize) running_search_statistics.normalized_frequencies[size] else - T(0) + L(0) end scores[i] = member.score * exp(adaptive_parsimony_scaling * frequency) end @@ -132,8 +135,8 @@ end end function finalize_scores( - dataset::Dataset{T}, pop::Population, options::Options -)::Tuple{Population,Float64} where {T<:Real} + dataset::Dataset{T,L}, pop::Population{T,L}, options::Options +)::Tuple{Population{T,L},Float64} where {T<:DATA_TYPE,L<:LOSS_TYPE} need_recalculate = options.batching num_evals = 0.0 if need_recalculate @@ -148,12 +151,12 @@ function finalize_scores( end # Return best 10 examples -function best_sub_pop(pop::Population; topn::Int=10)::Population +function best_sub_pop(pop::Population; topn::Int=10) best_idx = sortperm([pop.members[member].score for member in 1:(pop.n)]) return Population(pop.members[best_idx[1:topn]]) end -function record_population(pop::Population{T}, options::Options)::RecordType where {T<:Real} +function record_population(pop::Population, options::Options)::RecordType return RecordType( "population" => [ RecordType( diff --git a/src/ProgramConstants.jl b/src/ProgramConstants.jl index f4ecf39a5..607ce08b2 100644 --- a/src/ProgramConstants.jl +++ b/src/ProgramConstants.jl @@ -5,4 +5,7 @@ const BATCH_DIM = 2 const FEATURE_DIM = 1 const RecordType = Dict{String,Any} +const DATA_TYPE = Number +const LOSS_TYPE = Real + end diff --git a/src/RegularizedEvolution.jl b/src/RegularizedEvolution.jl index c221b144a..d610a7de4 100644 --- a/src/RegularizedEvolution.jl +++ b/src/RegularizedEvolution.jl @@ -2,7 +2,7 @@ module RegularizedEvolutionModule import Random: shuffle! import DynamicExpressions: string_tree -import ..CoreModule: Options, Dataset, RecordType +import ..CoreModule: Options, Dataset, RecordType, DATA_TYPE, LOSS_TYPE import ..PopMemberModule: PopMember import ..PopulationModule: Population, best_of_sample import ..AdaptiveParsimonyModule: RunningSearchStatistics @@ -12,14 +12,14 @@ import ..RecorderModule: @recorder # Pass through the population several times, replacing the oldest # with the fittest of a small subsample function reg_evol_cycle( - dataset::Dataset{T}, - pop::Population, - temperature::T, + dataset::Dataset{T,L}, + pop::Population{T,L}, + temperature, curmaxsize::Int, running_search_statistics::RunningSearchStatistics, options::Options, record::RecordType, -)::Tuple{Population,Float64} where {T<:Real} +)::Tuple{Population{T,L},Float64} where {T<:DATA_TYPE,L<:LOSS_TYPE} # Batch over each subsample. Can give 15% improvement in speed; probably moreso for large pops. # but is ultimately a different algorithm than regularized evolution, and might not be # as good. 
diff --git a/src/SearchUtils.jl b/src/SearchUtils.jl index 5489e2f9d..b642ade70 100644 --- a/src/SearchUtils.jl +++ b/src/SearchUtils.jl @@ -45,8 +45,8 @@ macro sr_spawner(parallel, p, expr) end function init_dummy_pops( - nout::Int, npops::Int, datasets::Vector{Dataset{T}}, options::Options -)::Vector{Vector{Population{T}}} where {T} + nout::Int, npops::Int, datasets::Vector{Dataset{T,L}}, options::Options +)::Vector{Vector{Population{T,L}}} where {T,L} return [ [ Population( @@ -107,10 +107,10 @@ function check_for_user_quit(reader::StdinReader)::Bool end function check_for_loss_threshold( - datasets::AbstractVector{Dataset{T}}, - hallOfFame::AbstractVector{HallOfFame{T}}, + datasets::AbstractVector{Dataset{T,L}}, + hallOfFame::AbstractVector{HallOfFame{T,L}}, options::Options, -)::Bool where {T} +)::Bool where {T,L} options.early_stop_condition === nothing && return false # Check if all nout are below stopping condition. @@ -214,12 +214,12 @@ end function update_progress_bar!( progress_bar::WrappedProgressBar; - hall_of_fame::HallOfFame{T}, - dataset::Dataset{T}, + hall_of_fame::HallOfFame{T,L}, + dataset::Dataset{T,L}, options::Options, head_node_occupation::Float64, parallelism=:serial, -) where {T} +) where {T,L} equation_strings = string_dominating_pareto_curve(hall_of_fame, dataset, options) # TODO - include command about "q" here. load_string = get_load_string(; head_node_occupation, parallelism) @@ -231,15 +231,15 @@ function update_progress_bar!( end function print_search_state( - hall_of_fames::Vector{HallOfFame{T}}, - datasets::Vector{Dataset{T}}; + hall_of_fames::Vector{HallOfFame{T,L}}, + datasets::Vector{Dataset{T,L}}; options::Options, equation_speed::Vector{Float32}, total_cycles::Int, cycles_remaining::Vector{Int}, head_node_occupation::Float64, parallelism=:serial, -) where {T} +) where {T,L} nout = length(datasets) average_speed = sum(equation_speed) / length(equation_speed) @@ -267,14 +267,16 @@ function print_search_state( @printf("Press 'q' and then to stop execution early.\n") end -const StateType{T} = Tuple{ - Union{Vector{Vector{Population{T}}},Matrix{Population{T}}}, - Union{HallOfFame{T},Vector{HallOfFame{T}}}, +const StateType{T,L} = Tuple{ + Union{Vector{Vector{Population{T,L}}},Matrix{Population{T,L}}}, + Union{HallOfFame{T,L},Vector{HallOfFame{T,L}}}, } -function load_saved_hall_of_fame(saved_state::StateType{T})::Vector{HallOfFame{T}} where {T} +function load_saved_hall_of_fame( + saved_state::StateType{T,L} +)::Vector{HallOfFame{T,L}} where {T,L} hall_of_fame = saved_state[2] - if !isa(hall_of_fame, Vector{HallOfFame{T}}) + if !isa(hall_of_fame, Vector{HallOfFame{T,L}}) hall_of_fame = [hall_of_fame] end return [copy_hall_of_fame(hof) for hof in hall_of_fame] @@ -282,18 +284,18 @@ end load_saved_hall_of_fame(::Nothing)::Nothing = nothing function get_population( - pops::Vector{Vector{Population{T}}}; out::Int, pop::Int -)::Population{T} where {T} + pops::Vector{Vector{Population{T,L}}}; out::Int, pop::Int +)::Population{T,L} where {T,L} return pops[out][pop] end function get_population( - pops::Matrix{Population{T}}; out::Int, pop::Int -)::Population{T} where {T} + pops::Matrix{Population{T,L}}; out::Int, pop::Int +)::Population{T,L} where {T,L} return pops[out, pop] end function load_saved_population( - saved_state::StateType{T}; out::Int, pop::Int -)::Population{T} where {T} + saved_state::StateType{T,L}; out::Int, pop::Int +)::Population{T,L} where {T,L} saved_pop = get_population(saved_state[1]; out=out, pop=pop) return copy_population(saved_pop) end 
diff --git a/src/SingleIteration.jl b/src/SingleIteration.jl index c7a962dd9..95f8c5ef1 100644 --- a/src/SingleIteration.jl +++ b/src/SingleIteration.jl @@ -1,7 +1,7 @@ module SingleIterationModule import DynamicExpressions: string_tree, simplify_tree, combine_operators -import ..CoreModule: Options, Dataset, RecordType +import ..CoreModule: Options, Dataset, RecordType, DATA_TYPE, LOSS_TYPE import ..ComplexityModule: compute_complexity import ..UtilsModule: debug import ..PopMemberModule: copy_pop_member, generate_reference @@ -15,22 +15,22 @@ import ..RecorderModule: @recorder # Cycle through regularized evolution many times, # printing the fittest equation every 10% through function s_r_cycle( - dataset::Dataset{T}, - pop::Population, + dataset::Dataset{T,L}, + pop::Population{T,L}, ncycles::Int, curmaxsize::Int, running_search_statistics::RunningSearchStatistics; verbosity::Int=0, options::Options, record::RecordType, -)::Tuple{Population{T},HallOfFame{T},Float64} where {T<:Real} - max_temp = T(1.0) - min_temp = T(0.0) +)::Tuple{Population{T,L},HallOfFame{T,L},Float64} where {T<:DATA_TYPE,L<:LOSS_TYPE} + max_temp = L(1.0) + min_temp = L(0.0) if !options.annealing min_temp = max_temp end all_temperatures = LinRange(max_temp, min_temp, ncycles) - best_examples_seen = HallOfFame(options, T) + best_examples_seen = HallOfFame(options, T, L) num_evals = 0.0 for temperature in all_temperatures @@ -61,12 +61,12 @@ function s_r_cycle( end function optimize_and_simplify_population( - dataset::Dataset{T}, - pop::Population, + dataset::Dataset{T,L}, + pop::Population{T,L}, options::Options, curmaxsize::Int, record::RecordType, -)::Tuple{Population,Float64} where {T<:Real} +)::Tuple{Population{T,L},Float64} where {T<:DATA_TYPE,L<:LOSS_TYPE} array_num_evals = zeros(Float64, pop.n) do_optimization = rand(pop.n) .< options.optimizer_probability for j in 1:(pop.n) diff --git a/src/SymbolicRegression.jl b/src/SymbolicRegression.jl index b7a7c9bc9..9d00b7fbc 100644 --- a/src/SymbolicRegression.jl +++ b/src/SymbolicRegression.jl @@ -142,6 +142,8 @@ import .CoreModule: MAX_DEGREE, BATCH_DIM, FEATURE_DIM, + DATA_TYPE, + LOSS_TYPE, RecordType, Dataset, Options, @@ -292,9 +294,10 @@ function EquationSearch( procs::Union{Vector{Int},Nothing}=nothing, addprocs_function::Union{Function,Nothing}=nothing, runtests::Bool=true, - saved_state::Union{StateType{T},Nothing}=nothing, + saved_state::Union{StateType{T,L},Nothing}=nothing, multithreaded=nothing, -) where {T<:Real} + loss_type::Type=Nothing, +) where {T<:DATA_TYPE,L<:LOSS_TYPE} if multithreaded !== nothing error( "`multithreaded` is deprecated. Use the `parallelism` argument instead. " * @@ -311,6 +314,7 @@ function EquationSearch( y[j, :]; weights=(weights === nothing ? weights : weights[j, :]), varMap=varMap, + loss_type=loss_type, ) for j in 1:nout ] @@ -329,7 +333,7 @@ end function EquationSearch( X::AbstractMatrix{T1}, y::AbstractMatrix{T2}; kw... -) where {T1<:Real,T2<:Real} +) where {T1<:DATA_TYPE,T2<:DATA_TYPE} U = promote_type(T1, T2) return EquationSearch( convert(AbstractMatrix{U}, X), convert(AbstractMatrix{U}, y); kw... @@ -338,12 +342,12 @@ end function EquationSearch( X::AbstractMatrix{T1}, y::AbstractVector{T2}; kw... -) where {T1<:Real,T2<:Real} +) where {T1<:DATA_TYPE,T2<:DATA_TYPE} return EquationSearch(X, reshape(y, (1, size(y, 1))); kw...) 
end function EquationSearch( - datasets::Vector{Dataset{T}}; + datasets::Vector{Dataset{T,L}}; niterations::Int=10, options::Options=Options(), parallelism=:multithreading, @@ -351,8 +355,8 @@ function EquationSearch( procs::Union{Vector{Int},Nothing}=nothing, addprocs_function::Union{Function,Nothing}=nothing, runtests::Bool=true, - saved_state::Union{StateType{T},Nothing}=nothing, -) where {T<:Real} + saved_state::Union{StateType{T,L},Nothing}=nothing, +) where {T<:DATA_TYPE,L<:LOSS_TYPE} concurrency = if parallelism in (:multithreading, "multithreading") :multithreading elseif parallelism in (:multiprocessing, "multiprocessing") @@ -392,15 +396,15 @@ end function _EquationSearch( parallelism::Symbol, - datasets::Vector{Dataset{T}}; + datasets::Vector{Dataset{T,L}}; niterations::Int, options::Options, numprocs::Union{Int,Nothing}, procs::Union{Vector{Int},Nothing}, addprocs_function::Union{Function,Nothing}, runtests::Bool, - saved_state::Union{StateType{T},Nothing}, -) where {T<:Real} + saved_state::Union{StateType{T,L},Nothing}, +) where {T<:DATA_TYPE,L<:LOSS_TYPE} if options.deterministic if parallelism != :serial error("Determinism is only guaranteed for serial mode.") @@ -531,11 +535,11 @@ function _EquationSearch( hallOfFame = load_saved_hall_of_fame(saved_state) if hallOfFame === nothing - hallOfFame = [HallOfFame(options, T) for j in 1:nout] + hallOfFame = [HallOfFame(options, T, L) for j in 1:nout] else # Recompute losses for the hall of fame, in # case the dataset changed: - hallOfFame::Vector{HallOfFame{T}} + hallOfFame::Vector{HallOfFame{T,L}} for (hof, dataset) in zip(hallOfFame, datasets) for member in hof.members[hof.exists] score, result_loss = score_func(dataset, member.tree, options) @@ -545,7 +549,7 @@ function _EquationSearch( end end @assert length(hallOfFame) == nout - hallOfFame::Vector{HallOfFame{T}} + hallOfFame::Vector{HallOfFame{T,L}} for j in 1:nout for i in 1:(options.npopulations) @@ -557,7 +561,7 @@ function _EquationSearch( saved_pop = load_saved_population(saved_state; out=j, pop=i) if saved_pop !== nothing && length(saved_pop.members) == options.npop - saved_pop::Population{T} + saved_pop::Population{T,L} ## Update losses: for member in saved_pop.members score, result_loss = score_func(datasets[j], member.tree, options) @@ -565,7 +569,7 @@ function _EquationSearch( member.loss = result_loss end new_pop = @sr_spawner parallelism worker_idx ( - saved_pop, HallOfFame(options, T), RecordType(), 0.0 + saved_pop, HallOfFame(options, T, L), RecordType(), 0.0 ) else if saved_pop !== nothing @@ -579,7 +583,7 @@ function _EquationSearch( options=options, nfeatures=datasets[j].nfeatures, ), - HallOfFame(options, T), + HallOfFame(options, T, L), RecordType(), Float64(options.npop), ) @@ -949,7 +953,7 @@ function _EquationSearch( if options.return_state state = (returnPops, (nout == 1 ? 
only(hallOfFame) : hallOfFame)) - state::StateType{T} + state::StateType{T,L} return state else if nout == 1 From bcc141330f24feb96e4a0405deb960cd46bdbf6c Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 16 Mar 2023 22:21:45 -0400 Subject: [PATCH 03/19] Fix issue in population init --- src/LossFunctions.jl | 4 ++-- src/Population.jl | 10 +++++++--- test/test_prob_pick_first.jl | 4 ++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/LossFunctions.jl b/src/LossFunctions.jl index 19b8aab70..98f64579a 100644 --- a/src/LossFunctions.jl +++ b/src/LossFunctions.jl @@ -110,11 +110,11 @@ function score_func_batch( end if !dataset.weighted - result_loss = _loss(prediction, batch_y, options.elementwise_loss) + result_loss = L(_loss(prediction, batch_y, options.elementwise_loss)) else w = dataset.weights::AbstractVector{T} batch_w = view(w, batch_idx) - result_loss = _weighted_loss(prediction, batch_y, batch_w, options.elementwise_loss) + result_loss = L(_weighted_loss(prediction, batch_y, batch_w, options.elementwise_loss)) end score = loss_to_score(result_loss, dataset.baseline_loss, tree, options) return score, result_loss diff --git a/src/Population.jl b/src/Population.jl index 7bfbed4f6..0eeb8fb36 100644 --- a/src/Population.jl +++ b/src/Population.jl @@ -20,9 +20,12 @@ end Create population from list of PopMembers. """ -function Population(pop::Array{PopMember{T,L},1}) where {T<:DATA_TYPE,L<:LOSS_TYPE} +function Population( + pop::AP +) where {T<:DATA_TYPE,L<:LOSS_TYPE,AP<:AbstractArray{PopMember{T,L},1}} return Population{T,L}(pop, size(pop, 1)) end + """ Population(dataset::Dataset{T}; npop::Int, nlength::Int=3, options::Options, @@ -49,7 +52,8 @@ end """ Population(X::AbstractMatrix{T}, y::AbstractVector{T}; npop::Int, nlength::Int=3, - options::Options, nfeatures::Int) + options::Options, nfeatures::Int, + loss_type::Type=Nothing) Create random population and score them on the dataset. 
""" @@ -60,7 +64,7 @@ function Population( nlength::Int=3, options::Options, nfeatures::Int, - loss_type::Type=nothing, + loss_type::Type=Nothing, ) where {T<:DATA_TYPE} dataset = Dataset(X, y; loss_type=loss_type) update_baseline_loss!(dataset, options) diff --git a/test/test_prob_pick_first.jl b/test/test_prob_pick_first.jl index 633c4df25..7d1adc69e 100644 --- a/test/test_prob_pick_first.jl +++ b/test/test_prob_pick_first.jl @@ -14,7 +14,7 @@ options = Options(; ) for reverse in [false, true] - members = PopMember{Float32}[] + members = PopMember{Float32,Float32}[] # Generate members with scores from 0 to 1: for i in 1:n @@ -27,7 +27,7 @@ for reverse in [false, true] push!(members, PopMember(tree, score, test_loss)) end - pop = Population(members, n) + pop = Population(members) dummy_running_stats = SymbolicRegression.AdaptiveParsimonyModule.RunningSearchStatistics(; options=options From 7f5314691e1623abba93d9cfa7be74f614b2446e Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Thu, 16 Mar 2023 22:54:06 -0400 Subject: [PATCH 04/19] Introduce multi-line breaks in equation printing --- src/HallOfFame.jl | 29 ++++++++++++++++++++++++++--- src/SearchUtils.jl | 16 +++++++++++----- src/SymbolicRegression.jl | 1 + 3 files changed, 38 insertions(+), 8 deletions(-) diff --git a/src/HallOfFame.jl b/src/HallOfFame.jl index 9922fc4a7..ffa4c6ca4 100644 --- a/src/HallOfFame.jl +++ b/src/HallOfFame.jl @@ -113,13 +113,15 @@ function calculate_pareto_frontier( ) end -function string_dominating_pareto_curve(hallOfFame, dataset, options) +function string_dominating_pareto_curve(hallOfFame, dataset, options; width::Union{Integer,Nothing}=nothing) + twidth = (width === nothing) ? 100 : max(100, width::Integer) output = "" curMSE = Float64(dataset.baseline_loss) lastMSE = curMSE lastComplexity = 0 output *= "Hall of Fame:\n" - output *= "-----------------------------------------\n" + # TODO: Get user's terminal width. + output *= "-"^twidth * "\n" output *= @sprintf( "%-10s %-8s %-8s %-8s\n", "Complexity", "Loss", "Score", "Equation" ) @@ -141,16 +143,37 @@ function string_dominating_pareto_curve(hallOfFame, dataset, options) ZERO_POINT = 1e-10 delta_l_mse = log(abs(curMSE / lastMSE) + ZERO_POINT) score = convert(Float32, -delta_l_mse / delta_c) + eqn_string = string_tree(member.tree, options.operators; varMap=dataset.varMap) + split_at = twidth - 40 output *= @sprintf( "%-10d %-8.3e %-8.3e %-s\n", complexity, curMSE, score, - string_tree(member.tree, options.operators, varMap=dataset.varMap) + length(eqn_string) < split_at ? eqn_string : eqn_string[1:split_at] * "..." ) + hit = false + while length(eqn_string) > split_at + eqn_string = eqn_string[(split_at + 1 + (hit ? 3 : 0)):end] + output *= @sprintf( + "%-10s %-10s %-10s %-s\n", + "", + "", + "", + ( + if length(eqn_string) < split_at + eqn_string + else + eqn_string[1:(split_at - 3)] * "..." + end + ) + ) + hit = true + end lastMSE = curMSE lastComplexity = complexity end + output *= "-"^twidth * "\n" output *= "\n" return output end diff --git a/src/SearchUtils.jl b/src/SearchUtils.jl index b642ade70..28ab73018 100644 --- a/src/SearchUtils.jl +++ b/src/SearchUtils.jl @@ -220,7 +220,9 @@ function update_progress_bar!( head_node_occupation::Float64, parallelism=:serial, ) where {T,L} - equation_strings = string_dominating_pareto_curve(hall_of_fame, dataset, options) + equation_strings = string_dominating_pareto_curve( + hall_of_fame, dataset, options; width=progress_bar.bar.width + ) # TODO - include command about "q" here. 
load_string = get_load_string(; head_node_occupation, parallelism) load_string *= @sprintf("Press 'q' and then to stop execution early.\n") @@ -239,7 +241,9 @@ function print_search_state( cycles_remaining::Vector{Int}, head_node_occupation::Float64, parallelism=:serial, + width::Union{Integer,Nothing}=nothing, ) where {T,L} + twidth = (width === nothing) ? 100 : max(100, width::Integer) nout = length(datasets) average_speed = sum(equation_speed) / length(equation_speed) @@ -255,16 +259,18 @@ function print_search_state( 100.0 * cycles_elapsed / total_cycles / nout ) - @printf("==============================\n") + print("="^twidth * "\n") for (j, (hall_of_fame, dataset)) in enumerate(zip(hall_of_fames, datasets)) if nout > 1 @printf("Best equations for output %d\n", j) end - equation_strings = string_dominating_pareto_curve(hall_of_fame, dataset, options) + equation_strings = string_dominating_pareto_curve( + hall_of_fame, dataset, options; width=width + ) print(equation_strings) - @printf("==============================\n") + print("="^twidth * "\n") end - @printf("Press 'q' and then to stop execution early.\n") + return print("Press 'q' and then to stop execution early.\n") end const StateType{T,L} = Tuple{ diff --git a/src/SymbolicRegression.jl b/src/SymbolicRegression.jl index 9d00b7fbc..6c8adee44 100644 --- a/src/SymbolicRegression.jl +++ b/src/SymbolicRegression.jl @@ -910,6 +910,7 @@ function _EquationSearch( cycles_remaining, head_node_occupation, parallelism, + width=options.terminal_width, ) end last_print_time = time() From 8502faf7c491ebdbec8fe172b91fb48c6a2c1c35 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 17 Mar 2023 00:35:05 -0400 Subject: [PATCH 05/19] Fix constant optimization for complex numbers --- src/ConstantOptimization.jl | 5 ++++- src/LossFunctions.jl | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/ConstantOptimization.jl b/src/ConstantOptimization.jl index 6f1a96c5e..a293b2b76 100644 --- a/src/ConstantOptimization.jl +++ b/src/ConstantOptimization.jl @@ -29,7 +29,10 @@ function optimize_constants( end x0 = get_constants(member.tree) f(x::Vector{T})::L = opt_func(x, dataset, member.tree, options) - if nconst == 1 + if T <: Complex + # TODO: Make this more general. Also, do we even need Newton here at all?? 
+ algorithm = Optim.BFGS(; linesearch=LineSearches.BackTracking())#order=3)) + elseif nconst == 1 algorithm = Optim.Newton(; linesearch=LineSearches.BackTracking()) else if options.optimizer_algorithm == "NelderMead" diff --git a/src/LossFunctions.jl b/src/LossFunctions.jl index 98f64579a..bed2f27fa 100644 --- a/src/LossFunctions.jl +++ b/src/LossFunctions.jl @@ -114,7 +114,9 @@ function score_func_batch( else w = dataset.weights::AbstractVector{T} batch_w = view(w, batch_idx) - result_loss = L(_weighted_loss(prediction, batch_y, batch_w, options.elementwise_loss)) + result_loss = L( + _weighted_loss(prediction, batch_y, batch_w, options.elementwise_loss) + ) end score = loss_to_score(result_loss, dataset.baseline_loss, tree, options) return score, result_loss From 9d62798dbab9b81a5f52be9aa2ed7140eed68492 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 17 Mar 2023 13:46:26 -0400 Subject: [PATCH 06/19] Add integration tests for complex types --- test/full.jl | 4 ++++ test/test_abstract_numbers.jl | 31 +++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 test/test_abstract_numbers.jl diff --git a/test/full.jl b/test/full.jl index 380507597..631b4fe12 100644 --- a/test/full.jl +++ b/test/full.jl @@ -43,3 +43,7 @@ end @testset "Test whether custom objectives work." begin include("test_custom_objectives.jl") end + +@testset "Test abstract numbers" begin + include("test_abstract_numbers.jl") +end diff --git a/test/test_abstract_numbers.jl b/test/test_abstract_numbers.jl new file mode 100644 index 000000000..d0c8c2a3e --- /dev/null +++ b/test/test_abstract_numbers.jl @@ -0,0 +1,31 @@ +using SymbolicRegression +using Test +using Random + +get_base_type(::Type{<:Complex{BT}}) where {BT} = BT + +for T in (ComplexF16, ComplexF32, ComplexF64) + L = get_base_type(T) + @testset "Test search with $T type" begin + X = randn(MersenneTwister(0), T, 1, 100) + y = @. 
(2 - 0.5im) * cos((1 + 1im) * X[1, :]) |> T + + early_stop(loss::L, c) where {L} = ((loss <= L(1e-2)) && (c <= 15)) + + dataset = Dataset(X, y; loss_type=L) + + options = SymbolicRegression.Options(; + binary_operators=[+, *, -, /], + unary_operators=[cos], + npopulations=20, + early_stop_condition=early_stop, + elementwise_loss=(prediction, target) -> abs2(prediction - target), + ) + + hof = EquationSearch([dataset]; options=options, niterations=1_000_000_000) + dominating = calculate_pareto_frontier(dataset, hof, options) + output, _ = eval_tree_array(dominating[end].tree, X, options) + @test typeof(output) <: AbstractArray{T} + @test sum(abs2, output .- y) / length(output) <= L(1e-2) + end +end From 015e36ad4b3c421abf3b82144c0b5b701164f64f Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 17 Mar 2023 14:30:47 -0400 Subject: [PATCH 07/19] Automatically get float version of loss --- src/SymbolicRegression.jl | 4 ++++ test/test_abstract_numbers.jl | 12 +++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/SymbolicRegression.jl b/src/SymbolicRegression.jl index 6c8adee44..9509bb791 100644 --- a/src/SymbolicRegression.jl +++ b/src/SymbolicRegression.jl @@ -308,6 +308,10 @@ function EquationSearch( if weights !== nothing weights = reshape(weights, size(y)) end + if T <: Complex && loss_type == Nothing + get_base_type(::Type{Complex{BT}}) where {BT} = BT + loss_type = get_base_type(T) + end datasets = [ Dataset( X, diff --git a/test/test_abstract_numbers.jl b/test/test_abstract_numbers.jl index d0c8c2a3e..7e4b8c092 100644 --- a/test/test_abstract_numbers.jl +++ b/test/test_abstract_numbers.jl @@ -12,8 +12,6 @@ for T in (ComplexF16, ComplexF32, ComplexF64) early_stop(loss::L, c) where {L} = ((loss <= L(1e-2)) && (c <= 15)) - dataset = Dataset(X, y; loss_type=L) - options = SymbolicRegression.Options(; binary_operators=[+, *, -, /], unary_operators=[cos], @@ -22,8 +20,16 @@ for T in (ComplexF16, ComplexF32, ComplexF64) elementwise_loss=(prediction, target) -> abs2(prediction - target), ) - hof = EquationSearch([dataset]; options=options, niterations=1_000_000_000) + dataset = Dataset(X, y; loss_type=L) + hof = if T == ComplexF16 + EquationSearch([dataset]; options=options, niterations=1_000_000_000) + else + # Should automatically find correct type: + EquationSearch(X, y; options=options, niterations=1_000_000_000) + end + dominating = calculate_pareto_frontier(dataset, hof, options) + @test typeof(dominating[end].loss) == L output, _ = eval_tree_array(dominating[end].tree, X, options) @test typeof(output) <: AbstractArray{T} @test sum(abs2, output .- y) / length(output) <= L(1e-2) From 955c6759a7ed01daf8c59cb113de661526c50891 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 17 Mar 2023 15:59:07 -0400 Subject: [PATCH 08/19] Correct api call in losses --- docs/src/losses.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/losses.md b/docs/src/losses.md index b9bf60c7f..77dcfc6b2 100644 --- a/docs/src/losses.md +++ b/docs/src/losses.md @@ -10,7 +10,7 @@ You can also declare your own loss as a function that takes two (unweighted) or three (weighted) scalar arguments. 
For example, ``` f(x, y, w) = abs(x-y)*w -options = Options(loss=f) +options = Options(elementwise_loss=f) ``` ## Regression: From 34df75d8cf64b7bee7d30fdcdf7429a0640d7175 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Fri, 17 Mar 2023 16:28:52 -0400 Subject: [PATCH 09/19] Make equation wrap at terminal width --- docs/src/api.md | 2 +- src/HallOfFame.jl | 66 ++++++++++++++++++++++----------------- src/SymbolicRegression.jl | 2 ++ 3 files changed, 40 insertions(+), 30 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 480317a3a..c42cabd9f 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -62,6 +62,6 @@ node_to_symbolic(tree::Node, options::Options; calculate_pareto_frontier(X::AbstractMatrix{T}, y::AbstractVector{T}, hallOfFame::HallOfFame{T,L}, options::Options; weights=nothing, varMap=nothing) where {T<:DATA_TYPE,L<:LOSS_TYPE} -calculate_pareto_frontier(dataset::Dataset{T}, hallOfFame::HallOfFame{T}, +calculate_pareto_frontier(dataset::Dataset{T,L}, hallOfFame::HallOfFame{T,L}, options::Options) where {T<:DATA_TYPE,L<:LOSS_TYPE} ``` diff --git a/src/HallOfFame.jl b/src/HallOfFame.jl index ffa4c6ca4..e33284c0f 100644 --- a/src/HallOfFame.jl +++ b/src/HallOfFame.jl @@ -113,7 +113,9 @@ function calculate_pareto_frontier( ) end -function string_dominating_pareto_curve(hallOfFame, dataset, options; width::Union{Integer,Nothing}=nothing) +function string_dominating_pareto_curve( + hallOfFame, dataset, options; width::Union{Integer,Nothing}=nothing +) twidth = (width === nothing) ? 100 : max(100, width::Integer) output = "" curMSE = Float64(dataset.baseline_loss) @@ -121,7 +123,7 @@ function string_dominating_pareto_curve(hallOfFame, dataset, options; width::Uni lastComplexity = 0 output *= "Hall of Fame:\n" # TODO: Get user's terminal width. - output *= "-"^twidth * "\n" + output *= "-"^(twidth - 1) * "\n" output *= @sprintf( "%-10s %-8s %-8s %-8s\n", "Complexity", "Loss", "Score", "Equation" ) @@ -144,38 +146,44 @@ function string_dominating_pareto_curve(hallOfFame, dataset, options; width::Uni delta_l_mse = log(abs(curMSE / lastMSE) + ZERO_POINT) score = convert(Float32, -delta_l_mse / delta_c) eqn_string = string_tree(member.tree, options.operators; varMap=dataset.varMap) - split_at = twidth - 40 - output *= @sprintf( - "%-10d %-8.3e %-8.3e %-s\n", - complexity, - curMSE, - score, - length(eqn_string) < split_at ? eqn_string : eqn_string[1:split_at] * "..." - ) - hit = false - while length(eqn_string) > split_at - eqn_string = eqn_string[(split_at + 1 + (hit ? 3 : 0)):end] - output *= @sprintf( - "%-10s %-10s %-10s %-s\n", - "", - "", - "", - ( - if length(eqn_string) < split_at - eqn_string - else - eqn_string[1:(split_at - 3)] * "..." - end - ) - ) - hit = true + base_string_length = length(@sprintf("%-10d %-8.3e %8.3e ", 1, 1.0, 1.0)) + + dots = "..." 
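+        # Characters per row available for the equation itself, after the fixed-width complexity/loss/score prefix and the trailing "..." continuation marker.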
+ equation_width = (twidth - 1) - base_string_length - length(dots) + + output *= @sprintf("%-10d %-8.3e %-8.3e ", complexity, curMSE, score,) + + split_eqn = split_string(eqn_string, equation_width) + print_pad = false + while length(split_eqn) > 1 + cur_piece = popfirst!(split_eqn) + output *= " "^(print_pad * base_string_length) * cur_piece * dots * "\n" + print_pad = true end + output *= " "^(print_pad * base_string_length) * split_eqn[1] * "\n" + lastMSE = curMSE lastComplexity = complexity end - output *= "-"^twidth * "\n" - output *= "\n" + output *= "-"^(twidth - 1) * "\n" + output *= "\n"^2 return output end +""" + split_string(s::String, n::Integer) + +```jldoctest +split_string("abcdefgh", 3) + +# output + +["abc", "def", "gh"] +``` +""" +function split_string(s::String, n::Integer) + length(s) <= n && return [s] + return [s[i:min(i + n - 1, end)] for i in 1:n:length(s)] +end + end diff --git a/src/SymbolicRegression.jl b/src/SymbolicRegression.jl index 9509bb791..2cbce2443 100644 --- a/src/SymbolicRegression.jl +++ b/src/SymbolicRegression.jl @@ -8,6 +8,8 @@ export Population, Dataset, MutationWeights, Node, + LOSS_TYPE, + DATA_TYPE, #Functions: EquationSearch, From 79e6ad5c3bac60b6dbe0ea4120438a3c3fc3ef5e Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 19 Mar 2023 16:16:55 -0400 Subject: [PATCH 10/19] Fix normalization with infinite baseline loss --- src/Dataset.jl | 4 +++- src/LossFunctions.jl | 26 +++++++++++++++++++------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/Dataset.jl b/src/Dataset.jl index 750b2db83..8c517dc13 100644 --- a/src/Dataset.jl +++ b/src/Dataset.jl @@ -29,6 +29,7 @@ mutable struct Dataset{T<:DATA_TYPE,L<:LOSS_TYPE} weighted::Bool weights::Union{AbstractVector{T},Nothing} avg_y::T + use_baseline::Bool baseline_loss::L varMap::Array{String,1} end @@ -61,10 +62,11 @@ function Dataset( sum(y) / n end loss_type = (loss_type == Nothing) ? T : loss_type + use_baseline = true baseline = one(loss_type) return Dataset{T,loss_type}( - X, y, n, nfeatures, weighted, weights, avg_y, baseline, varMap + X, y, n, nfeatures, weighted, weights, avg_y, use_baseline, baseline, varMap ) end diff --git a/src/LossFunctions.jl b/src/LossFunctions.jl index bed2f27fa..7342f7c48 100644 --- a/src/LossFunctions.jl +++ b/src/LossFunctions.jl @@ -74,12 +74,13 @@ end # Compute a score which includes a complexity penalty in the loss function loss_to_score( - loss::L, baseline::L, tree::Node{T}, options::Options + loss::L, use_baseline::Bool, baseline::L, tree::Node{T}, options::Options )::L where {T<:DATA_TYPE,L<:LOSS_TYPE} - normalization = if baseline < L(0.01) - L(0.01) - else + # TODO: Come up with a more general normalization scheme. 
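+    # Only normalize by the baseline loss when it is usable (finite) and not vanishingly small; otherwise fall back to a fixed floor of 0.01.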
+ normalization = if baseline >= L(0.01) && use_baseline baseline + else + L(0.01) end normalized_loss_term = loss / normalization size = compute_complexity(tree, options) @@ -93,7 +94,9 @@ function score_func( dataset::Dataset{T,L}, tree::Node{T}, options::Options )::Tuple{L,L} where {T<:DATA_TYPE,L<:LOSS_TYPE} result_loss = eval_loss(tree, dataset, options) - score = loss_to_score(result_loss, dataset.baseline_loss, tree, options) + score = loss_to_score( + result_loss, dataset.use_baseline, dataset.baseline_loss, tree, options + ) return score, result_loss end @@ -118,7 +121,9 @@ function score_func_batch( _weighted_loss(prediction, batch_y, batch_w, options.elementwise_loss) ) end - score = loss_to_score(result_loss, dataset.baseline_loss, tree, options) + score = loss_to_score( + result_loss, dataset.use_baseline, dataset.baseline_loss, tree, options + ) return score, result_loss end @@ -131,7 +136,14 @@ function update_baseline_loss!( dataset::Dataset{T,L}, options::Options ) where {T<:DATA_TYPE,L<:LOSS_TYPE} example_tree = Node(T; val=dataset.avg_y) - dataset.baseline_loss = eval_loss(example_tree, dataset, options) + baseline_loss = eval_loss(example_tree, dataset, options) + if isfinite(baseline_loss) + dataset.baseline_loss = baseline_loss + dataset.use_baseline = true + else + dataset.baseline_loss = one(L) + dataset.use_baseline = false + end return nothing end From c939ddce6e542b6ed0321c95078d14c682e7f45d Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 19 Mar 2023 18:02:44 -0400 Subject: [PATCH 11/19] Bump backend version with fixed complex printing --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 6c5da5ba1..87aea6c72 100644 --- a/Project.toml +++ b/Project.toml @@ -22,7 +22,7 @@ SymbolicUtils = "d1185830-fcd6-423d-90d6-eec64667417b" TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [compat] -DynamicExpressions = "0.4.3" +DynamicExpressions = "0.5" JSON3 = "1" LineSearches = "7" LossFunctions = "0.6, 0.7, 0.8" From b002bb2639da94b0edfd55e4b221930c5e971979 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 19 Mar 2023 20:55:14 -0400 Subject: [PATCH 12/19] Add documentation for `loss_type` --- src/Dataset.jl | 2 ++ src/SymbolicRegression.jl | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/src/Dataset.jl b/src/Dataset.jl index 8c517dc13..7fb9dc8de 100644 --- a/src/Dataset.jl +++ b/src/Dataset.jl @@ -15,6 +15,8 @@ import ..ProgramConstantsModule: BATCH_DIM, FEATURE_DIM, DATA_TYPE, LOSS_TYPE - `weights::Union{AbstractVector{T},Nothing}`: If the dataset is weighted, these specify the per-sample weight (with shape `(n,)`). - `avg_y`: The average value of `y` (weighted, if `weights` are passed). +- `use_baseline`: Whether to use a baseline loss. This will be set to `false` + if the baseline loss is calculated to be `Inf`. - `baseline_loss`: The loss of a constant function which predicts the average value of `y`. This is loss-dependent and should be updated with `update_baseline_loss!`. diff --git a/src/SymbolicRegression.jl b/src/SymbolicRegression.jl index 2cbce2443..7538aa3f8 100644 --- a/src/SymbolicRegression.jl +++ b/src/SymbolicRegression.jl @@ -276,6 +276,10 @@ which is useful for debugging and profiling. which will cause `EquationSearch` to return the state. Note that you cannot change the operators or dataset, but most other options should be changeable. +- `loss_type::Type=Nothing`: If you would like to use a different type + for the loss than for the data you passed, specify the type here. 
+ Note that if you pass complex data `::Complex{L}`, then the loss + type will automatically be set to `L`. # Returns - `hallOfFame::HallOfFame`: The best equations seen during the search. From 42783824d16959265eb4804b19fecfd4cdc922d8 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 19 Mar 2023 20:57:01 -0400 Subject: [PATCH 13/19] Update version with abstract numbers implemented --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 87aea6c72..05210c42a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "SymbolicRegression" uuid = "8254be44-1295-4e6a-a16d-46603ac705cb" authors = ["MilesCranmer "] -version = "0.15.3" +version = "0.16.0" [deps] Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" From c41ad5eb99b9046498e29cd3db5f59c1d9a245c5 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 19 Mar 2023 21:44:17 -0400 Subject: [PATCH 14/19] Update docs with new parametric types --- docs/src/api.md | 2 +- docs/src/examples.md | 157 +++++++++++++++++++++++++++++++++++++++++++ src/HallOfFame.jl | 17 ++++- src/PopMember.jl | 16 +++-- src/Population.jl | 4 +- 5 files changed, 183 insertions(+), 13 deletions(-) create mode 100644 docs/src/examples.md diff --git a/docs/src/api.md b/docs/src/api.md index c42cabd9f..cd0ce638a 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -12,7 +12,7 @@ EquationSearch(X::AbstractMatrix{T}, y::AbstractMatrix{T}; procs::Union{Array{Int, 1}, Nothing}=nothing, runtests::Bool=true, loss_type::Type=Nothing, -) where {T<:Number} +) where {T<:DATA_TYPE} ``` ## Options diff --git a/docs/src/examples.md b/docs/src/examples.md new file mode 100644 index 000000000..6698e3de7 --- /dev/null +++ b/docs/src/examples.md @@ -0,0 +1,157 @@ +# Toy Examples with Code + +## Preamble + +```julia +using SymbolicRegression +using DataFrames +``` + +We'll also code up a simple function to print a single expression: + +```julia +function get_best(; X, y, hof::HallOfFame{T,L}, options) where {T,L} + dominating = calculate_pareto_frontier(X, y, hof, options) + + df = DataFrame(; + tree=[m.tree for m in dominating], + loss=[m.loss for m in dominating], + complexity=[compute_complexity(m.tree, options) for m in dominating], + ) + + df[!, :score] = vcat( + [L(0.0)], + -1 .* log.(df.loss[2:end] ./ df.loss[1:(end - 1)]) ./ + (df.complexity[2:end] .- df.complexity[1:(end - 1)]), + ) + + min_loss = min(df.loss...) + + best_idx = argmax(df.score .* (df.loss .<= (2 * min_loss))) + + return df.tree[best_idx], df +end +``` + +## 1. Simple search + +Here's a simple example where we +find the expression `2 cos(x3) + x0^2 - 2`. + +```julia +X = 2randn(5, 1000) +y = @. 2*cos(X[4, :]) + X[1, :]^2 - 2 + +options = Options(; binary_operators=[+, -, *, /], unary_operators=[cos]) +hof = EquationSearch(X, y; options=options, niterations=30) +``` + +Let's look at the most accurate expression: + +```julia +best, df = get_best(; X, y, hof, options) +println(best) +``` + +## 2. Custom operator + +Here, we define a custom operator and use it to find an expression: + +```julia +X = 2randn(5, 1000) +y = @. 1/X[1, :] + +options = Options(; binary_operators=[+, *], unary_operators=[inv]) +hof = EquationSearch(X, y; options=options) +println(get_best(; X, y, hof, options)[1]) +``` + +## 3. Multiple outputs + +Here, we do the same thing, but with multiple expressions at once, +each requiring a different feature. + +```julia +X = 2rand(5, 1000) .+ 0.1 +y = @. 
1/X[1:3, :] +options = Options(; binary_operators=[+, *], unary_operators=[inv]) +hofs = EquationSearch(X, y; options=options) +bests = [get_best(; X, y=y[i, :], hof=hofs[i], options)[1] for i=1:3] +println(bests) +``` + +## 4. Plotting an expression + +For now, let's consider the expressions for output 1. +We can see the SymbolicUtils version with: + +```julia +eqn = node_to_symbolic(bests[1], options) +``` + +We can get the LaTeX version with: + +```julia +using Latexify +latexify(string(eqn)) +``` + +Let's plot the prediction against the truth: + +```julia +using Plots + +scatter(y[1, :], bests[1](X), xlabel="Truth", ylabel="Prediction") +``` + +Here, we have used the convenience function `(::Node{T})(X)` to evaluate +the expression. However, this will only work because calling `Options()` +will automatically set up calls to `eval_tree_array`. In practice, you should +use the `eval_tree_array` function directly, which is the form of: + +```julia +eval_tree_array(bests[1], X, options) +``` + +## 5. Other types + +SymbolicRegression.jl can handle most numeric types you wish to use. +For example, passing a `Float32` array will result in the search using +32-bit precision everywhere in the codebase: + +```julia +X = 2randn(Float32, 5, 1000) +y = @. 2*cos(X[4, :]) + X[1, :]^2 - 2 + +options = Options(; binary_operators=[+, -, *, /], unary_operators=[cos]) +hof = EquationSearch(X, y; options=options, niterations=30) +``` + +we can see that the output types are `Float32`: + +```julia +best, df = get_best(; X, y, hof, options) +println(typeof(best)) +# Node{Float32} +``` + +We can also use `Complex` numbers: + +```julia +cos_re(x::Complex{T}) where {T} = cos(abs(x)) + 0im + +X = 15 .* rand(ComplexF64, 5, 1000) .- 7.5 +y = @. 2*cos_re((2+1im) * X[4, :]) + 0.1 * X[1, :]^2 - 2 + +options = Options(; binary_operators=[+, -, *, /], unary_operators=[cos_re], maxsize=30) +hof = EquationSearch(X, y; options=options, niterations=100) +``` + +## 6. Additional features + +For the many other features available in SymbolicRegression.jl, +check out the API page for `Options`. You might also find it useful +to browse the documentation for the Python frontend +[PySR](http://astroautomata.com/PySR), which has additional documentation. +In particular, the [tuning page](http://astroautomata.com/PySR/tuning) +is useful for improving search performance. \ No newline at end of file diff --git a/src/HallOfFame.jl b/src/HallOfFame.jl index e33284c0f..396c6661c 100644 --- a/src/HallOfFame.jl +++ b/src/HallOfFame.jl @@ -7,12 +7,22 @@ import ..PopMemberModule: PopMember, copy_pop_member import ..LossFunctionsModule: eval_loss using Printf: @sprintf -""" List of the best members seen all time in `.members` """ +""" +HallOfFame{T<:DATA_TYPE,L<:LOSS_TYPE} + +List of the best members seen all time in `.members`, with `.members[c]` being +the best member seen at complexity c. Including only the members which actually +have been set, you can run `.members[exists]`. + +# Fields + +- `members::Array{PopMember{T,L},1}`: List of the best members seen all time. + These are ordered by complexity, with `.members[1]` the member with complexity 1. +- `exists::Array{Bool,1}`: Whether the member at the given complexity has been set. +""" mutable struct HallOfFame{T<:DATA_TYPE,L<:LOSS_TYPE} members::Array{PopMember{T,L},1} exists::Array{Bool,1} #Whether it has been set - - # Arranged by complexity - store one at each. end """ @@ -27,6 +37,7 @@ has been instantiated or not. 
Arguments: - `options`: Options containing specification about deterministic. - `T`: Type of Nodes to use in the population. e.g., `Float64`. +- `L`: Type of loss to use in the population. e.g., `Float64`. """ function HallOfFame( options::Options, ::Type{T}, ::Type{L} diff --git a/src/PopMember.jl b/src/PopMember.jl index 421f30075..06084caee 100644 --- a/src/PopMember.jl +++ b/src/PopMember.jl @@ -23,12 +23,14 @@ generate_reference() = abs(rand(Int)) PopMember(t::Node{T}, score::L, loss::L) Create a population member with a birth date at the current time. +The type of the `Node` may be different from the type of the score +and loss. # Arguments -- `t::Node`: The tree for the population member. -- `score::T`: The score (normalized to a baseline, and offset by a complexity penalty) -- `loss::T`: The raw loss to assign. +- `t::Node{T}`: The tree for the population member. +- `score::L`: The score (normalized to a baseline, and offset by a complexity penalty) +- `loss::L`: The raw loss to assign. """ function PopMember( t::Node{T}, score::L, loss::L; ref::Int=-1, parent::Int=-1, deterministic=false @@ -42,16 +44,16 @@ function PopMember( end """ - PopMember(dataset::Dataset{T}, - t::Node, options::Options) + PopMember(dataset::Dataset{T,L}, + t::Node{T}, options::Options) Create a population member with a birth date at the current time. Automatically compute the score for this tree. # Arguments -- `dataset::Dataset{T}`: The dataset to evaluate the tree on. -- `t::Node`: The tree for the population member. +- `dataset::Dataset{T,L}`: The dataset to evaluate the tree on. +- `t::Node{T}`: The tree for the population member. - `options::Options`: What options to use. """ function PopMember( diff --git a/src/Population.jl b/src/Population.jl index 0eeb8fb36..13132f2e6 100644 --- a/src/Population.jl +++ b/src/Population.jl @@ -16,7 +16,7 @@ mutable struct Population{T<:DATA_TYPE,L<:LOSS_TYPE} n::Int end """ - Population(pop::Array{PopMember{T}, 1}) + Population(pop::Array{PopMember{T,L}, 1}) Create population from list of PopMembers. """ @@ -27,7 +27,7 @@ function Population( end """ - Population(dataset::Dataset{T}; + Population(dataset::Dataset{T,L}; npop::Int, nlength::Int=3, options::Options, nfeatures::Int) From 9467bf9f77145053c0d05ae0f054b9f6533de0d5 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 19 Mar 2023 22:29:19 -0400 Subject: [PATCH 15/19] Create set of examples matching PySR docs --- docs/src/examples.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/src/examples.md b/docs/src/examples.md index 6698e3de7..d38d41594 100644 --- a/docs/src/examples.md +++ b/docs/src/examples.md @@ -154,4 +154,5 @@ check out the API page for `Options`. You might also find it useful to browse the documentation for the Python frontend [PySR](http://astroautomata.com/PySR), which has additional documentation. In particular, the [tuning page](http://astroautomata.com/PySR/tuning) -is useful for improving search performance. \ No newline at end of file +is useful for improving search performance. 
+ From a2035524fffd2e3648ad73edc2c4352c5ffdb159 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 19 Mar 2023 23:13:09 -0400 Subject: [PATCH 16/19] Fix constant perturbation type conversion --- src/MutationFunctions.jl | 8 ++++---- src/SingleIteration.jl | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/MutationFunctions.jl b/src/MutationFunctions.jl index 50931f4a9..e43e1b81a 100644 --- a/src/MutationFunctions.jl +++ b/src/MutationFunctions.jl @@ -48,8 +48,8 @@ end # Randomly perturb a constant function mutate_constant( - tree::Node{T}, temperature::L, options::Options -)::Node{T} where {T<:DATA_TYPE,L<:DATA_TYPE} + tree::Node{T}, temperature, options::Options +)::Node{T} where {T<:DATA_TYPE} # T is between 0 and 1. if !(has_constants(tree)) @@ -61,8 +61,8 @@ function mutate_constant( end bottom = 1//10 - maxChange = L(options.perturbation_factor) * temperature + L(1 + bottom) - factor = maxChange^rand(T) + maxChange = options.perturbation_factor * temperature + 1 + bottom + factor = T(maxChange^rand(T)) makeConstBigger = rand() > 0.5 if makeConstBigger diff --git a/src/SingleIteration.jl b/src/SingleIteration.jl index 95f8c5ef1..00acdaa1c 100644 --- a/src/SingleIteration.jl +++ b/src/SingleIteration.jl @@ -24,8 +24,8 @@ function s_r_cycle( options::Options, record::RecordType, )::Tuple{Population{T,L},HallOfFame{T,L},Float64} where {T<:DATA_TYPE,L<:LOSS_TYPE} - max_temp = L(1.0) - min_temp = L(0.0) + max_temp = 1.0 + min_temp = 0.0 if !options.annealing min_temp = max_temp end From b198af03859b7f0d6afb64d1f7ec9f12ee745663 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Sun, 19 Mar 2023 23:16:50 -0400 Subject: [PATCH 17/19] Clean up population member copy --- src/PopMember.jl | 4 +++- src/Population.jl | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/PopMember.jl b/src/PopMember.jl index 06084caee..4b1994067 100644 --- a/src/PopMember.jl +++ b/src/PopMember.jl @@ -80,7 +80,9 @@ function copy_pop_member( return PopMember{T,L}(tree, score, loss, birth, ref, parent) end -function copy_pop_member_reset_birth(p::P; deterministic::Bool)::P where {P<:PopMember} +function copy_pop_member_reset_birth( + p::PopMember{T,L}; deterministic::Bool +)::PopMember{T,L} where {T<:DATA_TYPE,L<:LOSS_TYPE} new_member = copy_pop_member(p) new_member.birth = get_birth_order(; deterministic=deterministic) return new_member diff --git a/src/Population.jl b/src/Population.jl index 13132f2e6..737641e21 100644 --- a/src/Population.jl +++ b/src/Population.jl @@ -155,7 +155,7 @@ function finalize_scores( end # Return best 10 examples -function best_sub_pop(pop::Population; topn::Int=10) +function best_sub_pop(pop::P; topn::Int=10)::P where {P<:Population} best_idx = sortperm([pop.members[member].score for member in 1:(pop.n)]) return Population(pop.members[best_idx[1:topn]]) end From e410b9171da073338dd3c91bdc191d18d40d41b1 Mon Sep 17 00:00:00 2001 From: MilesCranmer Date: Mon, 20 Mar 2023 17:50:20 -0400 Subject: [PATCH 18/19] Fix num evals calculation in `finalize_scores` --- src/Population.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Population.jl b/src/Population.jl index 737641e21..f328278e2 100644 --- a/src/Population.jl +++ b/src/Population.jl @@ -149,7 +149,7 @@ function finalize_scores( pop.members[member].score = score pop.members[member].loss = loss end - num_evals += pop.n * (options.batch_size / dataset.n) + num_evals += pop.n end return (pop, num_evals) end From 7f48778bbd94414e1914be690a5f78f73ee9fcb0 Mon Sep 17 00:00:00 
2001 From: MilesCranmer Date: Mon, 20 Mar 2023 18:25:27 -0400 Subject: [PATCH 19/19] Specify order of pages in docs --- docs/make.jl | 21 ++++++++++++++++++--- docs/src/index_base.md | 2 +- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index bd316d9c3..f3b8780dc 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -2,8 +2,12 @@ using Documenter using SymbolicRegression using SymbolicRegression: Dataset, update_baseline_loss! -DocMeta.setdocmeta!(SymbolicRegression, :DocTestSetup, :(using LossFunctions); recursive=true) -DocMeta.setdocmeta!(SymbolicRegression, :DocTestSetup, :(using DynamicExpressions); recursive=true) +DocMeta.setdocmeta!( + SymbolicRegression, :DocTestSetup, :(using LossFunctions); recursive=true +) +DocMeta.setdocmeta!( + SymbolicRegression, :DocTestSetup, :(using DynamicExpressions); recursive=true +) readme = open(dirname(@__FILE__) * "/../README.md") do io read(io, String) @@ -17,7 +21,10 @@ readme = replace(readme, r"<[/]?div.*" => s"") # Then, we surround ```mermaid\n...\n``` snippets # with ```@raw html\n
<div class="mermaid">\n...\n</div>\n```:
-readme = replace(readme, r"```mermaid([^`]*)```" => s"```@raw html\n<div class=\"mermaid\">\n\1\n</div>\n```")
+readme = replace(
+    readme,
+    r"```mermaid([^`]*)```" => s"```@raw html\n<div class=\"mermaid\">\n\1\n</div>
\n```", +) # Then, we init mermaid.js: init_mermaid = """ @@ -51,6 +58,14 @@ makedocs(; format=Documenter.HTML(; canonical="https://astroautomata.com/SymbolicRegression.jl/stable" ), + pages=[ + "Contents" => "index_base.md", + "Home" => "index.md", + "Examples" => "examples.md", + "API" => "api.md", + "Losses" => "losses.md", + "Types" => "types.md", + ], ) deploydocs(; repo="github.com/MilesCranmer/SymbolicRegression.jl.git") diff --git a/docs/src/index_base.md b/docs/src/index_base.md index b36d9671d..c576ec5e9 100644 --- a/docs/src/index_base.md +++ b/docs/src/index_base.md @@ -2,5 +2,5 @@ # Contents ```@contents -Pages = ["api.md", "types.md", "losses.md"] +Pages = ["examples.md", "api.md", "types.md", "losses.md"] ```